X86ISelLowering.cpp (LLVM 22.0.0git)
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
51#include "llvm/IR/IRBuilder.h"
53#include "llvm/IR/Intrinsics.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
60#include "llvm/Support/Debug.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
74 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
76 "Sets the preferable loop alignment for experiments (as log2 bytes) "
77 "for innermost loops only. If specified, this option overrides "
78 "alignment set by x86-experimental-pref-loop-alignment."),
80
82 "x86-br-merging-base-cost", cl::init(2),
84 "Sets the cost threshold for when multiple conditionals will be merged "
85 "into one branch versus be split in multiple branches. Merging "
86 "conditionals saves branches at the cost of additional instructions. "
87 "This value sets the instruction cost limit, below which conditionals "
88 "will be merged, and above which conditionals will be split. Set to -1 "
89 "to never merge branches."),
91
93 "x86-br-merging-ccmp-bias", cl::init(6),
94 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
95 "supports conditional compare instructions."),
97
98static cl::opt<bool>
99 WidenShift("x86-widen-shift", cl::init(true),
100 cl::desc("Replace narrow shifts with wider shifts."),
101 cl::Hidden);
102
104 "x86-br-merging-likely-bias", cl::init(0),
105 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if its known that a == b is "
108 "likely, then it is likely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to increase "
110 "the instruction cost threshold. Set to -1 to never merge likely "
111 "branches."),
112 cl::Hidden);
113
115 "x86-br-merging-unlikely-bias", cl::init(-1),
116 cl::desc(
117 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
118 "that all conditionals will be executed. For example for merging "
119 "the conditionals (a == b && c > d), if its known that a == b is "
120 "unlikely, then it is unlikely that if the conditionals are split "
121 "both sides will be executed, so it may be desirable to decrease "
122 "the instruction cost threshold. Set to -1 to never merge unlikely "
123 "branches."),
124 cl::Hidden);
125
127 "mul-constant-optimization", cl::init(true),
128 cl::desc("Replace 'mul x, Const' with more effective instructions like "
129 "SHIFT, LEA, etc."),
130 cl::Hidden);
131
132 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
133 const X86Subtarget &STI)
134 : TargetLowering(TM), Subtarget(STI) {
135 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
136 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
137
138 // Set up the TargetLowering object.
139
140 // X86 is weird. It always uses i8 for shift amounts and setcc results.
141 setBooleanContents(ZeroOrOneBooleanContent);
142 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
143 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
144 
145 // X86 instruction cache is coherent with its data cache so we can use the
146 // default expansion to a no-op.
148
149 // For 64-bit, since we have so many registers, use the ILP scheduler.
150 // For 32-bit, use the register pressure specific scheduling.
151 // For Atom, always use ILP scheduling.
152 if (Subtarget.isAtom())
153 setSchedulingPreference(Sched::ILP);
154 else if (Subtarget.is64Bit())
155 setSchedulingPreference(Sched::ILP);
156 else
157 setSchedulingPreference(Sched::RegPressure);
158 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
159 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
160
161 // Bypass expensive divides and use cheaper ones.
162 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
163 if (Subtarget.hasSlowDivide32())
164 addBypassSlowDiv(32, 8);
165 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
166 addBypassSlowDiv(64, 32);
167 }
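// Illustrative sketch only (not lines from this file): the bypass registered
// above is a run-time dispatch inserted by the BypassSlowDivision transform.
// For the 64->32 case it produces roughly this shape, assuming unsigned
// operands X and Y:
//
//   uint64_t Q;
//   if (((X | Y) >> 32) == 0)            // both values fit in 32 bits
//     Q = (uint32_t)X / (uint32_t)Y;     // cheap 32-bit DIV on the fast path
//   else
//     Q = X / Y;                         // full 64-bit divide otherwise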
168
169 if (Subtarget.canUseCMPXCHG16B())
170 setMaxAtomicSizeInBitsSupported(128);
171 else if (Subtarget.canUseCMPXCHG8B())
172 setMaxAtomicSizeInBitsSupported(64);
173 else
174 setMaxAtomicSizeInBitsSupported(32);
175 
176 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
177
179
180 // Set up the register classes.
181 addRegisterClass(MVT::i8, &X86::GR8RegClass);
182 addRegisterClass(MVT::i16, &X86::GR16RegClass);
183 addRegisterClass(MVT::i32, &X86::GR32RegClass);
184 if (Subtarget.is64Bit())
185 addRegisterClass(MVT::i64, &X86::GR64RegClass);
186
187 for (MVT VT : MVT::integer_valuetypes())
188 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
189
190 // We don't accept any truncstore of integer registers.
191 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
192 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
193 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
194 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
195 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
196 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
197
198 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
199
200 // SETOEQ and SETUNE require checking two conditions.
201 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
204 }
205
206 // Integer absolute.
207 if (Subtarget.canUseCMOV()) {
208 setOperationAction(ISD::ABS , MVT::i16 , Custom);
209 setOperationAction(ISD::ABS , MVT::i32 , Custom);
210 if (Subtarget.is64Bit())
211 setOperationAction(ISD::ABS , MVT::i64 , Custom);
212 }
213
214 // Absolute difference.
215 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
216 setOperationAction(Op , MVT::i8 , Custom);
217 setOperationAction(Op , MVT::i16 , Custom);
218 setOperationAction(Op , MVT::i32 , Custom);
219 if (Subtarget.is64Bit())
220 setOperationAction(Op , MVT::i64 , Custom);
221 }
222
223 // Signed saturation subtraction.
224 setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
225 setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
226 setOperationAction(ISD::SSUBSAT, MVT::i32, Custom);
227 if (Subtarget.is64Bit())
228 setOperationAction(ISD::SSUBSAT, MVT::i64, Custom);
229 
230 // Funnel shifts.
231 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
232 // For slow shld targets we only lower for code size.
233 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
234
235 setOperationAction(ShiftOp , MVT::i8 , Custom);
236 setOperationAction(ShiftOp , MVT::i16 , Custom);
237 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
238 if (Subtarget.is64Bit())
239 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
240 }
241
242 if (!Subtarget.useSoftFloat()) {
243 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
244 // operation.
249 // We have an algorithm for SSE2, and we turn this into a 64-bit
250 // FILD or VCVTUSI2SS/SD for other targets.
253 // We have an algorithm for SSE2->double, and we turn this into a
254 // 64-bit FILD followed by conditional FADD for other targets.
257
258 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
259 // this operation.
262 // SSE has no i16 to fp conversion, only i32. We promote in the handler
263 // to allow f80 to use i16 and f64 to use i16 with sse1 only
266 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
269 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
270 // are Legal, f80 is custom lowered.
273
274 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
275 // this operation.
277 // FIXME: This doesn't generate invalid exception when it should. PR44019.
283 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
284 // are Legal, f80 is custom lowered.
287
288 // Handle FP_TO_UINT by promoting the destination to a larger signed
289 // conversion.
291 // FIXME: This doesn't generate invalid exception when it should. PR44019.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300
301 setOperationAction(ISD::LRINT, MVT::f32, Custom);
302 setOperationAction(ISD::LRINT, MVT::f64, Custom);
303 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
304 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
305
306 if (!Subtarget.is64Bit()) {
307 setOperationAction(ISD::LRINT, MVT::i64, Custom);
308 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
309 }
310 }
311
312 if (Subtarget.hasSSE2()) {
313 // Custom lowering for saturating float to int conversions.
314 // We handle promotion to larger result types manually.
315 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
318 }
321 if (Subtarget.is64Bit()) {
324 }
325 }
326 if (Subtarget.hasAVX10_2()) {
331 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
332 MVT::v4i64}) {
335 }
336 if (Subtarget.is64Bit()) {
339 }
340 }
341
342 // Handle address space casts between mixed sized pointers.
343 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
344 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
345
346 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
347 if (!Subtarget.hasSSE2()) {
348 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
349 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
352 if (Subtarget.is64Bit()) {
353 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
354 // Without SSE, i64->f64 goes through memory.
355 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
356 }
357 } else if (!Subtarget.is64Bit())
358 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
359
360 // Scalar integer divide and remainder are lowered to use operations that
361 // produce two results, to match the available instructions. This exposes
362 // the two-result form to trivial CSE, which is able to combine x/y and x%y
363 // into a single instruction.
364 //
365 // Scalar integer multiply-high is also lowered to use two-result
366 // operations, to match the available instructions. However, plain multiply
367 // (low) operations are left as Legal, as there are single-result
368 // instructions for this in x86. Using the two-result multiply instructions
369 // when both high and low results are needed must be arranged by dagcombine.
370 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
377 }
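// For illustration (not part of the original file): with the two-result form,
// source such as
//   int q = a / b;
//   int r = a % b;
// is CSE'd into a single IDIV, which leaves the quotient in EAX and the
// remainder in EDX, instead of emitting two divide instructions.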
378
379 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
380 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
381 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
382 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
383 setOperationAction(ISD::BR_CC, VT, Expand);
385 }
386 if (Subtarget.is64Bit())
391
392 setOperationAction(ISD::FREM , MVT::f32 , Expand);
393 setOperationAction(ISD::FREM , MVT::f64 , Expand);
394 setOperationAction(ISD::FREM , MVT::f80 , Expand);
395 setOperationAction(ISD::FREM , MVT::f128 , Expand);
396
397 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
399 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
400 setOperationAction(ISD::GET_FPENV_MEM , MVT::Other, Custom);
401 setOperationAction(ISD::SET_FPENV_MEM , MVT::Other, Custom);
402 setOperationAction(ISD::RESET_FPENV , MVT::Other, Custom);
403 }
404
405 // Promote the i8 variants and force them on up to i32 which has a shorter
406 // encoding.
407 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
409 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
410 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
411 // promote that too.
412 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
414
415 if (!Subtarget.hasBMI()) {
416 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
418 if (Subtarget.is64Bit()) {
419 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
421 }
422 }
423
424 if (Subtarget.hasLZCNT()) {
425 // When promoting the i8 variants, force them to i32 for a shorter
426 // encoding.
427 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
429 } else {
430 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
431 if (VT == MVT::i64 && !Subtarget.is64Bit())
432 continue;
435 }
436 }
437
438 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
439 ISD::STRICT_FP_TO_FP16}) {
440 // Special handling for half-precision floating point conversions.
441 // If we don't have F16C support, then lower half float conversions
442 // into library calls.
443 setOperationAction(
444 Op, MVT::f32,
445 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
446 // There's never any support for operations beyond MVT::f32.
447 setOperationAction(Op, MVT::f64, Expand);
448 setOperationAction(Op, MVT::f80, Expand);
449 setOperationAction(Op, MVT::f128, Expand);
450 }
451
452 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
453 setOperationAction(ISD::STRICT_FP_TO_BF16, VT, Expand);
454 setOperationAction(ISD::STRICT_BF16_TO_FP, VT, Expand);
455 }
456
457 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
458 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
459 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
460 setTruncStoreAction(VT, MVT::f16, Expand);
461 setTruncStoreAction(VT, MVT::bf16, Expand);
462
463 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
464 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
465 }
466
470 if (Subtarget.is64Bit())
471 setOperationAction(ISD::PARITY, MVT::i64, Custom);
472 if (Subtarget.hasPOPCNT()) {
473 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
474 // popcntw is longer to encode than popcntl and also has a false dependency
475 // on the dest that popcntl hasn't had since Cannon Lake.
476 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
477 } else {
482 }
483
484 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
485
486 if (!Subtarget.hasMOVBE())
487 setOperationAction(ISD::BSWAP, MVT::i16, Expand);
488
489 // X86 wants to expand cmov itself.
490 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
495 }
496 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
497 if (VT == MVT::i64 && !Subtarget.is64Bit())
498 continue;
501 }
502
503 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
506
508 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
509 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
513
514 // Darwin ABI issue.
515 for (auto VT : { MVT::i32, MVT::i64 }) {
516 if (VT == MVT::i64 && !Subtarget.is64Bit())
517 continue;
524 }
525
526 // 64-bit shl, sra, srl (iff 32-bit x86)
527 for (auto VT : { MVT::i32, MVT::i64 }) {
528 if (VT == MVT::i64 && !Subtarget.is64Bit())
529 continue;
533 }
534
535 if (Subtarget.hasSSEPrefetch())
536 setOperationAction(ISD::PREFETCH , MVT::Other, Custom);
537
538 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
539
540 // Expand certain atomics
541 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
542 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
543 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
544 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
545 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
546 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
547 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
548 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
549 }
550
551 if (!Subtarget.is64Bit())
552 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
553
554 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
555 // All CPUs supporting AVX will atomically load/store aligned 128-bit
556 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
557 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
558 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
559 }
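// Example of the effect: an aligned 16-byte atomic load, e.g. of a
// std::atomic<__int128>, can now be selected as a single VMOVDQA/VMOVAPS
// instead of being expanded to a CMPXCHG16B loop.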
560
561 if (Subtarget.canUseCMPXCHG16B())
562 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
563
564 // FIXME - use subtarget debug flags
565 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
566 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
567 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
568 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
569 }
570
573
574 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
575 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
576
577 setOperationAction(ISD::TRAP, MVT::Other, Legal);
578 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
579 if (Subtarget.isTargetPS())
580 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
581 else
582 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
583
584 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
585 setOperationAction(ISD::VASTART , MVT::Other, Custom);
586 setOperationAction(ISD::VAEND , MVT::Other, Expand);
587 bool Is64Bit = Subtarget.is64Bit();
588 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
589 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
590
591 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
592 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
593
594 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
595
596 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
597 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
598 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
599
601
602 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
603 setOperationAction(ISD::FABS, VT, Action);
604 setOperationAction(ISD::FNEG, VT, Action);
606 setOperationAction(ISD::FREM, VT, Action);
607 setOperationAction(ISD::FMA, VT, Action);
608 setOperationAction(ISD::FMINNUM, VT, Action);
609 setOperationAction(ISD::FMAXNUM, VT, Action);
610 setOperationAction(ISD::FMINIMUM, VT, Action);
611 setOperationAction(ISD::FMAXIMUM, VT, Action);
612 setOperationAction(ISD::FMINIMUMNUM, VT, Action);
613 setOperationAction(ISD::FMAXIMUMNUM, VT, Action);
614 setOperationAction(ISD::FSIN, VT, Action);
615 setOperationAction(ISD::FCOS, VT, Action);
616 setOperationAction(ISD::FSINCOS, VT, Action);
617 setOperationAction(ISD::FTAN, VT, Action);
618 setOperationAction(ISD::FSQRT, VT, Action);
619 setOperationAction(ISD::FPOW, VT, Action);
620 setOperationAction(ISD::FPOWI, VT, Action);
621 setOperationAction(ISD::FLOG, VT, Action);
622 setOperationAction(ISD::FLOG2, VT, Action);
623 setOperationAction(ISD::FLOG10, VT, Action);
624 setOperationAction(ISD::FEXP, VT, Action);
625 setOperationAction(ISD::FEXP2, VT, Action);
626 setOperationAction(ISD::FEXP10, VT, Action);
627 setOperationAction(ISD::FCEIL, VT, Action);
628 setOperationAction(ISD::FFLOOR, VT, Action);
629 setOperationAction(ISD::FNEARBYINT, VT, Action);
630 setOperationAction(ISD::FRINT, VT, Action);
631 setOperationAction(ISD::BR_CC, VT, Action);
632 setOperationAction(ISD::SETCC, VT, Action);
635 setOperationAction(ISD::FROUND, VT, Action);
636 setOperationAction(ISD::FROUNDEVEN, VT, Action);
637 setOperationAction(ISD::FTRUNC, VT, Action);
638 setOperationAction(ISD::FLDEXP, VT, Action);
639 };
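// Usage note: this helper is applied below, e.g. setF16Action(MVT::f16,
// Promote), in which case each listed f16 operation (FSQRT, FSIN, ...) is
// widened to f32, computed there, and the result truncated back to half.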
640
641 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
642 // f16, f32 and f64 use SSE.
643 // Set up the FP register classes.
644 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
645 : &X86::FR16RegClass);
646 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
647 : &X86::FR32RegClass);
648 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
649 : &X86::FR64RegClass);
650
651 // Disable f32->f64 extload as we can only generate this in one instruction
652 // under optsize. So it's easier to pattern match (fpext (load)) for that
653 // case instead of needing to emit 2 instructions for extload in the
654 // non-optsize case.
655 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
656
657 for (auto VT : { MVT::f32, MVT::f64 }) {
658 // Use ANDPD to simulate FABS.
659 setOperationAction(ISD::FABS, VT, Custom);
660
661 // Use XORP to simulate FNEG.
662 setOperationAction(ISD::FNEG, VT, Custom);
663
664 // Use ANDPD and ORPD to simulate FCOPYSIGN.
666
667 // These might be better off as horizontal vector ops.
670
671 // We don't support sin/cos/fmod
672 setOperationAction(ISD::FSIN , VT, Expand);
673 setOperationAction(ISD::FCOS , VT, Expand);
674 setOperationAction(ISD::FSINCOS, VT, Expand);
675 }
676
677 // Half type will be promoted by default.
678 setF16Action(MVT::f16, Promote);
683 setOperationAction(ISD::FABS, MVT::f16, Custom);
684 setOperationAction(ISD::FNEG, MVT::f16, Custom);
687 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
688 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
689
720 setOperationAction(ISD::LRINT, MVT::f16, Expand);
721 setOperationAction(ISD::LLRINT, MVT::f16, Expand);
722
723 // Lower this to MOVMSK plus an AND.
726
727 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
728 (UseX87 || Is64Bit)) {
729 // Use SSE for f32, x87 for f64.
730 // Set up the FP register classes.
731 addRegisterClass(MVT::f32, &X86::FR32RegClass);
732 if (UseX87)
733 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
734
735 // Use ANDPS to simulate FABS.
736 setOperationAction(ISD::FABS , MVT::f32, Custom);
737
738 // Use XORP to simulate FNEG.
739 setOperationAction(ISD::FNEG , MVT::f32, Custom);
740
741 if (UseX87)
742 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
743
744 // Use ANDPS and ORPS to simulate FCOPYSIGN.
745 if (UseX87)
746 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
747 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
748 
749 // We don't support sin/cos/fmod
750 setOperationAction(ISD::FSIN , MVT::f32, Expand);
751 setOperationAction(ISD::FCOS , MVT::f32, Expand);
752 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
753
754 if (UseX87) {
755 // Always expand sin/cos functions even though x87 has an instruction.
756 setOperationAction(ISD::FSIN, MVT::f64, Expand);
757 setOperationAction(ISD::FCOS, MVT::f64, Expand);
758 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
759 }
760 } else if (UseX87) {
761 // f32 and f64 in x87.
762 // Set up the FP register classes.
763 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
764 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
765
766 for (auto VT : { MVT::f32, MVT::f64 }) {
769
770 // Always expand sin/cos functions even though x87 has an instruction.
771 setOperationAction(ISD::FSIN , VT, Expand);
772 setOperationAction(ISD::FCOS , VT, Expand);
773 setOperationAction(ISD::FSINCOS, VT, Expand);
774 }
775 }
776
777 // Expand FP32 immediates into loads from the stack, save special cases.
778 if (isTypeLegal(MVT::f32)) {
779 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
780 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
781 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
782 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
783 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
784 } else // SSE immediates.
785 addLegalFPImmediate(APFloat(+0.0f)); // xorps
786 }
787 // Expand FP64 immediates into loads from the stack, save special cases.
788 if (isTypeLegal(MVT::f64)) {
789 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
790 addLegalFPImmediate(APFloat(+0.0)); // FLD0
791 addLegalFPImmediate(APFloat(+1.0)); // FLD1
792 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
793 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
794 } else // SSE immediates.
795 addLegalFPImmediate(APFloat(+0.0)); // xorpd
796 }
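// For illustration: with +0.0 registered as a legal immediate, materializing
//   double Z = 0.0;
// uses a register-clearing XORPD (or XORPS for float) rather than a load from
// the constant pool, matching the "xorps"/"xorpd" notes above.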
797 // Support fp16 0 immediate.
798 if (isTypeLegal(MVT::f16))
799 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
800
801 // Handle constrained floating-point operations of scalar.
814
815 // We don't support FMA.
818
819 // f80 always uses X87.
820 if (UseX87) {
821 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
824 {
825 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
826 addLegalFPImmediate(TmpFlt); // FLD0
827 TmpFlt.changeSign();
828 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
829
830 bool ignored;
831 APFloat TmpFlt2(+1.0);
832 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
833 &ignored);
834 addLegalFPImmediate(TmpFlt2); // FLD1
835 TmpFlt2.changeSign();
836 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
837 }
838
839 // Always expand sin/cos functions even though x87 has an instruction.
840 // clang-format off
841 setOperationAction(ISD::FSIN , MVT::f80, Expand);
842 setOperationAction(ISD::FCOS , MVT::f80, Expand);
843 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
844 setOperationAction(ISD::FTAN , MVT::f80, Expand);
845 setOperationAction(ISD::FASIN , MVT::f80, Expand);
846 setOperationAction(ISD::FACOS , MVT::f80, Expand);
847 setOperationAction(ISD::FATAN , MVT::f80, Expand);
848 setOperationAction(ISD::FATAN2 , MVT::f80, Expand);
849 setOperationAction(ISD::FSINH , MVT::f80, Expand);
850 setOperationAction(ISD::FCOSH , MVT::f80, Expand);
851 setOperationAction(ISD::FTANH , MVT::f80, Expand);
852 // clang-format on
853
854 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
855 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
856 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
857 setOperationAction(ISD::FRINT, MVT::f80, Expand);
858 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
859 setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand);
861 setOperationAction(ISD::LROUND, MVT::f80, LibCall);
862 setOperationAction(ISD::LLROUND, MVT::f80, LibCall);
863 setOperationAction(ISD::LRINT, MVT::f80, Custom);
864 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
865
866 // Handle constrained floating-point operations of scalar.
873 if (isTypeLegal(MVT::f16)) {
874 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
876 } else {
878 }
879 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
880 // as Custom.
882 }
883
884 // f128 uses xmm registers, but most operations require libcalls.
885 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
886 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
887 : &X86::VR128RegClass);
888
889 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
890
901
902 setOperationAction(ISD::FABS, MVT::f128, Custom);
903 setOperationAction(ISD::FNEG, MVT::f128, Custom);
905
906 // clang-format off
907 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
909 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
911 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
912 setOperationAction(ISD::FTAN, MVT::f128, LibCall);
914 // clang-format on
915 // No STRICT_FSINCOS
916 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
918
919 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
921 // We need to custom handle any FP_ROUND with an f128 input, but
922 // LegalizeDAG uses the result type to know when to run a custom handler.
923 // So we have to list all legal floating point result types here.
924 if (isTypeLegal(MVT::f32)) {
927 }
928 if (isTypeLegal(MVT::f64)) {
931 }
932 if (isTypeLegal(MVT::f80)) {
936 }
937
939
940 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
941 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
942 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
943 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
944 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
945 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
946 }
947
948 // Always use a library call for pow.
949 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
950 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
951 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
952 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
953
954 setOperationAction(ISD::FLOG, MVT::f80, Expand);
955 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
956 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
957 setOperationAction(ISD::FEXP, MVT::f80, Expand);
958 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
959 setOperationAction(ISD::FEXP10, MVT::f80, Expand);
960 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
961 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
962
963 // Some FP actions are always expanded for vector types.
964 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
965 MVT::v4f32, MVT::v8f32, MVT::v16f32,
966 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
967 // clang-format off
968 setOperationAction(ISD::FSIN, VT, Expand);
969 setOperationAction(ISD::FSINCOS, VT, Expand);
970 setOperationAction(ISD::FCOS, VT, Expand);
971 setOperationAction(ISD::FTAN, VT, Expand);
974 setOperationAction(ISD::FPOW, VT, Expand);
975 setOperationAction(ISD::FLOG, VT, Expand);
976 setOperationAction(ISD::FLOG2, VT, Expand);
977 setOperationAction(ISD::FLOG10, VT, Expand);
978 setOperationAction(ISD::FEXP, VT, Expand);
979 setOperationAction(ISD::FEXP2, VT, Expand);
980 setOperationAction(ISD::FEXP10, VT, Expand);
981 // clang-format on
982 }
983
984 // First set operation action for all vector types to either promote
985 // (for widening) or expand (for scalarization). Then we will selectively
986 // turn on ones that can be effectively codegen'd.
997 setOperationAction(ISD::FFLOOR, VT, Expand);
998 setOperationAction(ISD::FCEIL, VT, Expand);
999 setOperationAction(ISD::FTRUNC, VT, Expand);
1000 setOperationAction(ISD::FRINT, VT, Expand);
1001 setOperationAction(ISD::FNEARBYINT, VT, Expand);
1002 setOperationAction(ISD::FROUNDEVEN, VT, Expand);
1026 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1027 setTruncStoreAction(InnerVT, VT, Expand);
1028
1029 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1030 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1031
1032 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1033 // types, we have to deal with them whether we ask for Expansion or not.
1034 // Setting Expand causes its own optimisation problems though, so leave
1035 // them legal.
1036 if (VT.getVectorElementType() == MVT::i1)
1037 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1038
1039 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1040 // split/scalarized right now.
1041 if (VT.getVectorElementType() == MVT::f16 ||
1042 VT.getVectorElementType() == MVT::bf16)
1043 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1044 }
1045 }
1046
1047 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1048 // with -msoft-float, disable use of MMX as well.
1049 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1050 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1051 // No operations on x86mmx supported, everything uses intrinsics.
1052 }
1053
1054 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1055 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1056 : &X86::VR128RegClass);
1057
1058 setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
1059 setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
1060 setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
1061 setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
1062
1063 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1064 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1072
1073 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1074 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1076
1082 }
1083
1084 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1085 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1086 : &X86::VR128RegClass);
1087
1088 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1089 // registers cannot be used even for integer operations.
1090 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1091 : &X86::VR128RegClass);
1092 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1093 : &X86::VR128RegClass);
1094 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1095 : &X86::VR128RegClass);
1096 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1097 : &X86::VR128RegClass);
1098 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1099 : &X86::VR128RegClass);
1100
1101 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1102 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1103 setOperationAction(ISD::FMINIMUM, VT, Custom);
1104 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1105 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1106 }
1107
1108 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1109 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1114 }
1115
1116 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1117 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1118 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1119
1120 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1121 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1122 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1123 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1124 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1125 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1126 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1127 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1128 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1129 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1132
1133 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1134 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1135 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1136
1137 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1139 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1141
1142 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1143 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1144
1145 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1146 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1147 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1148 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1149 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1150 }
1151
1162
1167
1168 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1174
1175 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1176 // setcc all the way to isel and prefer SETGT in some isel patterns.
1179 }
1180
1181 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1182 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1187
1188 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1194 }
1195
1196 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1200
1201 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1202 continue;
1203
1206 }
1207 setF16Action(MVT::v8f16, Expand);
1208 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1209 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1210 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1211 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1212 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1213 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1215
1216 // Custom lower v2i64 and v2f64 selects.
1223
1230
1231 // Custom legalize these to avoid over promotion or custom promotion.
1232 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1237 }
1238
1243
1246
1249
1250 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1255
1256 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1260
1261 // We want to legalize this to an f64 load rather than an i64 load on
1262 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1263 // store.
1264 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1265 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1266 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1267 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1268 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1269 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1270
1271 // Add 32-bit vector stores to help vectorization opportunities.
1272 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1273 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1274
1275 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1276 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1277 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1278 if (!Subtarget.hasAVX512())
1279 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1280
1284
1286
1303
1304 // In the customized shift lowering, the legal v4i32/v2i64 cases
1305 // in AVX2 will be recognized.
1306 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1310 if (VT == MVT::v2i64) continue;
1315 }
1316
1322 }
1323
1324 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1329
1330 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1332 }
1333 }
1334
1335 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1336 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1337 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1338 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1339
1340 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1343 }
1344
1345 // These might be better off as horizontal vector ops.
1350 }
1351
1352 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1353 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1354 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1356 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1358 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1360 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1362 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1364 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1366
1367 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1368 }
1369
1370 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1371 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1372 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1373 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1374 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1375 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1376 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1377 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1378
1382
1383 // FIXME: Do we need to handle scalar-to-vector here?
1384 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1385 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1386
1387 // We directly match byte blends in the backend as they match the VSELECT
1388 // condition form.
1390
1391 // SSE41 brings specific instructions for doing vector sign extend even in
1392 // cases where we don't have SRA.
1393 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1396 }
1397
1398 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1399 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1400 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1401 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1402 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1403 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1404 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1405 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1406 }
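// Example: with these entries, a sign-extending load of four i8 elements into
// v4i32 selects to a single memory-operand PMOVSXBD instead of a scalar load
// followed by a shuffle/extend sequence.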
1407
1408 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1409 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1410 // do the pre and post work in the vector domain.
1413 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1414 // so that DAG combine doesn't try to turn it into uint_to_fp.
1417 }
1418 }
1419
1420 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1422 }
1423
1424 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1425 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1426 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1429 }
1430
1431 // XOP can efficiently perform BITREVERSE with VPPERM.
1432 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1433 setOperationAction(ISD::BITREVERSE, VT, Custom);
1434 }
1435
1436 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1437 bool HasInt256 = Subtarget.hasInt256();
1438
1439 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1440 : &X86::VR256RegClass);
1441 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1442 : &X86::VR256RegClass);
1443 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1444 : &X86::VR256RegClass);
1445 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1446 : &X86::VR256RegClass);
1447 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1448 : &X86::VR256RegClass);
1449 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1450 : &X86::VR256RegClass);
1451 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1452 : &X86::VR256RegClass);
1453
1454 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1455 setOperationAction(ISD::FFLOOR, VT, Legal);
1457 setOperationAction(ISD::FCEIL, VT, Legal);
1459 setOperationAction(ISD::FTRUNC, VT, Legal);
1461 setOperationAction(ISD::FRINT, VT, Legal);
1463 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1465 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1467
1468 setOperationAction(ISD::FROUND, VT, Custom);
1469
1470 setOperationAction(ISD::FNEG, VT, Custom);
1471 setOperationAction(ISD::FABS, VT, Custom);
1473
1474 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1475 setOperationAction(ISD::FMINIMUM, VT, Custom);
1476 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1477 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1479 }
1480
1481 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1482 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1483
1484 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1485 // even though v8i16 is a legal type.
1486 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1487 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1488 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1489 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
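// Illustration of the promotion above: (fp_to_sint:v8i16 (v8f32 X)) is
// performed as a v8f32 -> v8i32 conversion (VCVTTPS2DQ) whose result is then
// truncated to v8i16, rather than converting directly into 16-bit lanes.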
1493
1496 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1498 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1500
1512
1513 if (!Subtarget.hasAVX512())
1514 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1515
1516 // In the customized shift lowering, the legal v8i32/v4i64 cases
1517 // in AVX2 will be recognized.
1518 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1524 if (VT == MVT::v4i64) continue;
1529 }
1530
1531 // These types need custom splitting if their input is a 128-bit vector.
1536
1540 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1541 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1544
1545 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1549 }
1550
1555
1556 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1561
1562 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1563 // setcc all the way to isel and prefer SETGT in some isel patterns.
1566 }
1567
1568 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1569 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1574
1575 if (Subtarget.hasAnyFMA()) {
1576 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1577 MVT::v2f64, MVT::v4f64 }) {
1580 }
1581 }
1582
1583 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1584 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1585 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1586 }
1587
1588 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1589 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1590 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1591 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1592
1593 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1594 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1595 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1596 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1597 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1598 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1599 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1600 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1601
1602 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1603 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1604
1605 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1606 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1607 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1608 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1609 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1610
1611 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1612 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1613 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1614 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1615 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1616 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1617 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1618 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1623
1624 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1625 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1626 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1628 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1629 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1630 }
1631
1632 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1635 }
1636
1637 if (HasInt256) {
1638 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1639 // when we have a 256bit-wide blend with immediate.
1642
1643 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1644 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1645 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1646 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1647 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1648 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1649 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1650 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1651 }
1652 }
1653
1654 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1655 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1656 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1657 setOperationAction(ISD::MSTORE, VT, Legal);
1658 }
1659
1660 // Extract subvector is special because the value type
1661 // (result) is 128-bit but the source is 256-bit wide.
1662 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1663 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1665 }
1666
1667 // Custom lower several nodes for 256-bit types.
1668 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1669 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1678 setOperationAction(ISD::STORE, VT, Custom);
1679 }
1680 setF16Action(MVT::v16f16, Expand);
1681 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1682 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1684 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1685 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1686 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1687 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1688
1689 if (HasInt256) {
1691
1692 // Custom legalize 2x32 to get a little better code.
1693 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1694 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1695
1696 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1697 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1698 setOperationAction(ISD::MGATHER, VT, Custom);
1699 }
1700 }
1701
1702 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1703 Subtarget.hasF16C()) {
1704 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1707 }
1708 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1709 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1711 }
1712 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1713 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1714 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1715 }
1716 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1717 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1718 }
1719
1720 // This block controls legalization of the mask vector sizes that are
1721 // available with AVX512. 512-bit vectors are in a separate block controlled
1722 // by useAVX512Regs.
1723 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1724 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1725 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1726 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1727 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1728 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1729
1733
1734 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1735 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1736 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1737 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1738 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1739 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1740 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1741 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1749
1750 // There is no byte sized k-register load or store without AVX512DQ.
1751 if (!Subtarget.hasDQI()) {
1752 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1753 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1754 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1755 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1756
1757 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1758 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1759 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1760 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1761 }
1762
1763 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1764 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1768 }
1769
1770 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1772
1773 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1777
1784 }
1785
1786 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1788 }
1789 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1790 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1791 setOperationAction(ISD::LRINT, VT, Legal);
1792 setOperationAction(ISD::LLRINT, VT, Legal);
1793 }
1794 }
1795
1796 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1797 // elements. 512-bits can be disabled based on prefer-vector-width and
1798 // required-vector-width function attributes.
1799 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1800 bool HasBWI = Subtarget.hasBWI();
1801
1802 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1803 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1804 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1805 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1806 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1807 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1808 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1809
1810 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1811 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1812 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1813 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1814 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1815 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1816 if (HasBWI)
1817 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1818 }
1819
1820 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1821 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1822 setOperationAction(ISD::FMINIMUM, VT, Custom);
1823 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1824 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1825 setOperationAction(ISD::FNEG, VT, Custom);
1826 setOperationAction(ISD::FABS, VT, Custom);
1831 }
1832 setOperationAction(ISD::LRINT, MVT::v16f32,
1833 Subtarget.hasDQI() ? Legal : Custom);
1834 setOperationAction(ISD::LRINT, MVT::v8f64,
1835 Subtarget.hasDQI() ? Legal : Custom);
1836 if (Subtarget.hasDQI())
1837 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1838
1839 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1844 }
1845
1846 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1851 }
1852
1857 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1859
1871
1872 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1873 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1874 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1875 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1876 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1877 if (HasBWI)
1878 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1879
1880 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1881 // to 512-bit rather than use the AVX2 instructions so that we can use
1882 // k-masks.
1883 if (!Subtarget.hasVLX()) {
1884 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1885 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1886 setOperationAction(ISD::MLOAD, VT, Custom);
1887 setOperationAction(ISD::MSTORE, VT, Custom);
1888 }
1889 }
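// Sketch of the intent (an illustration, not code from this file): without
// VLX, a masked load of e.g. v8f32 is widened to a 512-bit MLOAD whose extra
// mask lanes are false, so it can be predicated with an AVX512 k-register
// instead of falling back to the AVX2 vector-mask (VPMASKMOV-style) form.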
1890
1892 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1893 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1903
1904 if (HasBWI) {
1905 // Extends from v64i1 masks to 512-bit vectors.
1909 }
1910
1911 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1912 setOperationAction(ISD::FFLOOR, VT, Legal);
1914 setOperationAction(ISD::FCEIL, VT, Legal);
1916 setOperationAction(ISD::FTRUNC, VT, Legal);
1918 setOperationAction(ISD::FRINT, VT, Legal);
1920 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1922 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1924
1925 setOperationAction(ISD::FROUND, VT, Custom);
1926 }
1927
1928 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1931 }
1932
1933 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1934 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1935 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1936 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1937
1938 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1939 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1940 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1941 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1942
1943 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1944 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1945 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1946 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1947 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1948 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1949 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1950 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1951
1952 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1953 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1954
1955 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1965
1966 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1967 // setcc all the way to isel and prefer SETGT in some isel patterns.
1970 }
1971
1972 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1973 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1978
1979 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1986 }
1987
1988 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1989 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1990 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1992 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1993 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1994 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1995 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2000 }
2001
2002 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2003 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2004 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2005 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2006 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2007 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2008
2009 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2013 setOperationAction(Opc, MVT::v8i64, Custom);
2014
2015 if (Subtarget.hasDQI())
2016 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2017
2018 if (Subtarget.hasCDI()) {
 2019      // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
2020 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2022 }
2023 } // Subtarget.hasCDI()
2024
2025 if (Subtarget.hasVPOPCNTDQ()) {
2026 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2028 }
2029
2030 // Extract subvector is special because the value type
2031 // (result) is 256-bit but the source is 512-bit wide.
2032 // 128-bit was made Legal under AVX1.
2033 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2034 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2036
2037 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2038 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2048 }
2049 setF16Action(MVT::v32f16, Expand);
2052 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2054 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2055 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2056 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2057
2058 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2059 setOperationAction(ISD::MLOAD, VT, Legal);
2060 setOperationAction(ISD::MSTORE, VT, Legal);
2061 setOperationAction(ISD::MGATHER, VT, Custom);
2062 setOperationAction(ISD::MSCATTER, VT, Custom);
2063 }
2064 if (HasBWI) {
2065 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2066 setOperationAction(ISD::MLOAD, VT, Legal);
2067 setOperationAction(ISD::MSTORE, VT, Legal);
2068 }
2069 } else {
2070 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2071 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2072 }
2073
2074 if (Subtarget.hasVBMI2()) {
2075 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2078 }
2079
2080 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2081 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2082 }
2083
2084 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2085 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2087 }// useAVX512Regs
2088
2089 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2090 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2091 MVT::v4i64}) {
2094 }
2095 }
2096
2097 // This block controls legalization for operations that don't have
2098 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2099 // narrower widths.
2100 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2101 // These operations are handled on non-VLX by artificially widening in
2102 // isel patterns.
2103
2107
2108 if (Subtarget.hasDQI()) {
2109 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2110 // v2f32 UINT_TO_FP is already custom under SSE2.
2113 "Unexpected operation action!");
2114 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2119 }
2120
2121 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2127 }
2128
2129 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2132 }
2133
2134 // Custom legalize 2x32 to get a little better code.
2135 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
2136 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2137
2138 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2139 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2140 setOperationAction(ISD::MSCATTER, VT, Custom);
2141
2142 if (Subtarget.hasDQI()) {
2146 setOperationAction(Opc, MVT::v2i64, Custom);
2147 setOperationAction(Opc, MVT::v4i64, Custom);
2148 }
2149 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2150 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2151 }
2152
2153 if (Subtarget.hasCDI()) {
2154 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2156 }
2157 } // Subtarget.hasCDI()
2158
2159 if (Subtarget.hasVPOPCNTDQ()) {
2160 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2162 }
2163
2164 // We can try to convert vectors to different sizes to leverage legal
2165 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2166 // then specialize to Legal below.
2167 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2168 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2169 MVT::v16i16, MVT::v8i8})
2171
2172 // Legal vpcompress depends on various AVX512 extensions.
2173 // Legal in AVX512F
2174 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2176
2177 // Legal in AVX512F + AVX512VL
2178 if (Subtarget.hasVLX())
2179 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2180 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2182
2183 // Legal in AVX512F + AVX512VBMI2
2184 if (Subtarget.hasVBMI2())
2185 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2187
2188 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2189 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2190 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2192 }
2193
 2194  // This block controls legalization of v32i1/v64i1, which are available with
 2195  // AVX512BW.
2196 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2197 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2198 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2199
2200 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2211 }
2212
2213 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2215
2216 // Extends from v32i1 masks to 256-bit vectors.
2220
2221 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2222 MVT::v16f16, MVT::v8f16}) {
2223 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2224 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2225 }
2226
2227 // These operations are handled on non-VLX by artificially widening in
2228 // isel patterns.
2229 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2230
2231 if (Subtarget.hasBITALG()) {
2232 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2234 }
2235 }
2236
2237 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2238 auto setGroup = [&] (MVT VT) {
2247 setOperationAction(ISD::FSQRT, VT, Legal);
2249
2250 setOperationAction(ISD::FFLOOR, VT, Legal);
2252 setOperationAction(ISD::FCEIL, VT, Legal);
2254 setOperationAction(ISD::FTRUNC, VT, Legal);
2256 setOperationAction(ISD::FRINT, VT, Legal);
2258 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2260 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
2262
2263 setOperationAction(ISD::FROUND, VT, Custom);
2264
2265 setOperationAction(ISD::LOAD, VT, Legal);
2266 setOperationAction(ISD::STORE, VT, Legal);
2267
2273
2274 setOperationAction(ISD::FNEG, VT, Custom);
2275 setOperationAction(ISD::FABS, VT, Custom);
2279
2283 };
2284
2285 // AVX512_FP16 scalar operations
2286 setGroup(MVT::f16);
2290 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2292 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2296 setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
2297 setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
2298 setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
2299 setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
2300 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2302 setOperationAction(ISD::LRINT, MVT::f16, Legal);
2303 setOperationAction(ISD::LLRINT, MVT::f16, Legal);
2304
2307
2308 if (Subtarget.useAVX512Regs()) {
2309 setGroup(MVT::v32f16);
2315 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2317 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2319 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
2322
2327 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2329 MVT::v32i16);
2330 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2332 MVT::v32i16);
2333 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2335 MVT::v32i16);
2336 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2338 MVT::v32i16);
2339
2343
2344 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2345 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2346
2347 setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
2348 setOperationAction(ISD::FMAXIMUM, MVT::v32f16, Custom);
2349 setOperationAction(ISD::FMINIMUMNUM, MVT::v32f16, Custom);
2350 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32f16, Custom);
2351 setOperationAction(ISD::LRINT, MVT::v32f16, Legal);
2352 setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
2353 }
2354
2359
2360 if (Subtarget.hasVLX()) {
2361 setGroup(MVT::v8f16);
2362 setGroup(MVT::v16f16);
2363
2374
2377 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
2379 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
2381
2382 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2385
2389
2390 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2391 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2392 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2393 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2394
2395 // Need to custom widen these to prevent scalarization.
2396 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2397 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2398
2399 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
2400 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Custom);
2401 setOperationAction(ISD::FMINIMUMNUM, MVT::v8f16, Custom);
2402 setOperationAction(ISD::FMAXIMUMNUM, MVT::v8f16, Custom);
2403
2404 setOperationAction(ISD::FMINIMUM, MVT::v16f16, Custom);
2405 setOperationAction(ISD::FMAXIMUM, MVT::v16f16, Custom);
2406 setOperationAction(ISD::FMINIMUMNUM, MVT::v16f16, Custom);
2407 setOperationAction(ISD::FMAXIMUMNUM, MVT::v16f16, Custom);
2408 setOperationAction(ISD::LRINT, MVT::v8f16, Legal);
2409 setOperationAction(ISD::LRINT, MVT::v16f16, Legal);
2410 }
2411 }
2412
2413 if (!Subtarget.useSoftFloat() &&
2414 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2415 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2416 : &X86::VR128RegClass);
2417 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2418 : &X86::VR256RegClass);
 2419    // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
 2420    // provide a method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
 2421    // Set the operation action to Custom so the customization is done later.
2424 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2425 setF16Action(VT, Expand);
2426 if (!Subtarget.hasBF16())
2432 }
2433 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2434 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2435 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2436 }
2437 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2438 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2440 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2441 }
2442
2443 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2444 Subtarget.useAVX512Regs()) {
2445 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2446 setF16Action(MVT::v32bf16, Expand);
2447 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2448 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2449 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2451 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2455 }
2456
2457 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2458 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2459 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2460 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2461 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2462 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2463 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2464 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2465 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2466 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2467 setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
2468 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
2469 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2474 setOperationAction(ISD::FSQRT, VT, Legal);
2477 setOperationAction(ISD::FMINIMUM, VT, Custom);
2478 setOperationAction(ISD::FMAXIMUM, VT, Custom);
2479 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
2480 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
2481 }
2482 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2485 }
2486 }
2487
2488 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2489 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2490 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2491 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2492 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2493 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2494
2495 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2496 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2497 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2498 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2499 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2500
2501 if (Subtarget.hasBWI()) {
2502 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2503 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2504 }
2505
2506 if (Subtarget.hasFP16()) {
2507 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2516 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2525 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2530 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2531 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2533 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2535 }
2536 }
2537
2538 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2539 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2540 }
2541
2542 // We want to custom lower some of our intrinsics.
2546 if (!Subtarget.is64Bit()) {
2548 }
2549
2550 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2551 // handle type legalization for these operations here.
2552 //
2553 // FIXME: We really should do custom legalization for addition and
2554 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2555 // than generic legalization for 64-bit multiplication-with-overflow, though.
2556 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2557 if (VT == MVT::i64 && !Subtarget.is64Bit())
2558 continue;
2559 // Add/Sub/Mul with overflow operations are custom lowered.
2566
2567 // Support carry in as value rather than glue.
2573 }
2574
2575 // Combine sin / cos into _sincos_stret if it is available.
2576 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2577 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2578 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2579 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2580 }
2581
2582 if (Subtarget.isTargetWin64()) {
2583 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2584 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2585 setOperationAction(ISD::SREM, MVT::i128, Custom);
2586 setOperationAction(ISD::UREM, MVT::i128, Custom);
2595 }
2596
 2597  // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` is.
 2598  // We should promote the value to 64 bits to solve this.
 2599  // This is what the CRT headers do - `fmodf` is an inline header
 2600  // function that casts to f64 and calls `fmod`.
2601 if (Subtarget.is32Bit() &&
2602 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2603 // clang-format off
2604 for (ISD::NodeType Op :
2605 {ISD::FACOS, ISD::STRICT_FACOS,
2606 ISD::FASIN, ISD::STRICT_FASIN,
2607 ISD::FATAN, ISD::STRICT_FATAN,
2608 ISD::FATAN2, ISD::STRICT_FATAN2,
2609 ISD::FCEIL, ISD::STRICT_FCEIL,
2610 ISD::FCOS, ISD::STRICT_FCOS,
2611 ISD::FCOSH, ISD::STRICT_FCOSH,
2612 ISD::FEXP, ISD::STRICT_FEXP,
2613 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2615 ISD::FLOG, ISD::STRICT_FLOG,
2616 ISD::FLOG10, ISD::STRICT_FLOG10,
2617 ISD::FPOW, ISD::STRICT_FPOW,
2618 ISD::FSIN, ISD::STRICT_FSIN,
2619 ISD::FSINH, ISD::STRICT_FSINH,
2620 ISD::FTAN, ISD::STRICT_FTAN,
2621 ISD::FTANH, ISD::STRICT_FTANH,
 2622         // TODO: Add ISD::STRICT_FMODF too once implemented.
2623 ISD::FMODF})
2624 if (isOperationExpand(Op, MVT::f32))
2625 setOperationAction(Op, MVT::f32, Promote);
2626 // clang-format on
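  // For reference, the 32-bit MSVC CRT inline is roughly equivalent to the
  // following (an illustrative sketch, not the verbatim header):
  //   static __inline float fmodf(float _X, float _Y) {
  //     return (float)fmod((double)_X, (double)_Y);
  //   }
  // which is why promoting the f32 operation to its f64 form is safe here.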
2627
2628 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2629 // it, but it's just a wrapper around ldexp.
2630 if (Subtarget.isOSWindows()) {
2631 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2632 if (isOperationExpand(Op, MVT::f32))
2633 setOperationAction(Op, MVT::f32, Promote);
2634 }
2635
2636 // We have target-specific dag combine patterns for the following nodes:
2644 ISD::BITCAST,
2647 ISD::SHL,
2648 ISD::SRA,
2649 ISD::SRL,
2650 ISD::OR,
2651 ISD::AND,
2657 ISD::ADD,
2658 ISD::FADD,
2659 ISD::FSUB,
2660 ISD::FNEG,
2661 ISD::FMA,
2663 ISD::FMINNUM,
2664 ISD::FMAXNUM,
2665 ISD::SUB,
2666 ISD::LOAD,
2667 ISD::LRINT,
2668 ISD::LLRINT,
2669 ISD::MLOAD,
2670 ISD::STORE,
2671 ISD::MSTORE,
2687 ISD::SETCC,
2688 ISD::MUL,
2689 ISD::XOR,
2690 ISD::MSCATTER,
2691 ISD::MGATHER,
2692 ISD::FP16_TO_FP,
2693 ISD::FP_EXTEND,
2700
2701 computeRegisterProperties(Subtarget.getRegisterInfo());
2702
2703 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2705 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2707 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2709
2710 // TODO: These control memcmp expansion in CGP and could be raised higher, but
 2711  // that needs to be benchmarked and balanced with the potential use of vector
2712 // load/store types (PR33329, PR33914).
2715
2716 // Default loop alignment, which can be overridden by -align-loops.
2718
2719 // An out-of-order CPU can speculatively execute past a predictable branch,
2720 // but a conditional move could be stalled by an expensive earlier operation.
2721 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2722 EnableExtLdPromotion = true;
2724
2726
2727 // Default to having -disable-strictnode-mutation on
2728 IsStrictFPEnabled = true;
2729}
2730
2731// This has so far only been implemented for 64-bit MachO.
2733 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2734}
2735
2737 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2738 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2739}
2740
2742 const SDLoc &DL) const {
2743 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2744 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2745 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2746 return SDValue(Node, 0);
2747}
2748
2751 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2752 !Subtarget.hasBWI())
2753 return TypeSplitVector;
2754
2755 // Since v8f16 is legal, widen anything over v4f16.
2756 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2757 VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
2758 VT.getVectorElementType() == MVT::f16)
2759 return TypeSplitVector;
2760
2761 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2762 VT.getVectorElementType() != MVT::i1)
2763 return TypeWidenVector;
2764
2766}
2767
2768FastISel *
2770 const TargetLibraryInfo *libInfo) const {
2771 return X86::createFastISel(funcInfo, libInfo);
2772}
2773
2774//===----------------------------------------------------------------------===//
2775// Other Lowering Hooks
2776//===----------------------------------------------------------------------===//
2777
2779 bool AssumeSingleUse) {
2780 if (!AssumeSingleUse && !Op.hasOneUse())
2781 return false;
2782 if (!ISD::isNormalLoad(Op.getNode()))
2783 return false;
2784
2785 // If this is an unaligned vector, make sure the target supports folding it.
2786 auto *Ld = cast<LoadSDNode>(Op.getNode());
2787 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2788 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2789 return false;
2790
2791 // TODO: If this is a non-temporal load and the target has an instruction
2792 // for it, it should not be folded. See "useNonTemporalLoad()".
2793
2794 return true;
2795}
2796
2798 const X86Subtarget &Subtarget,
2799 bool AssumeSingleUse) {
2800 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2801 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2802 return false;
2803
2804 // We can not replace a wide volatile load with a broadcast-from-memory,
2805 // because that would narrow the load, which isn't legal for volatiles.
2806 auto *Ld = cast<LoadSDNode>(Op.getNode());
2807 return !Ld->isVolatile() ||
2808 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2809}
2810
2812 if (!Op.hasOneUse())
2813 return false;
2814 // Peek through (oneuse) bitcast users
2815 SDNode *User = *Op->user_begin();
2816 while (User->getOpcode() == ISD::BITCAST) {
2817 if (!User->hasOneUse())
2818 return false;
2819 User = *User->user_begin();
2820 }
2821 return ISD::isNormalStore(User);
2822}
2823
2825 if (Op.hasOneUse()) {
2826 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2827 return (ISD::ZERO_EXTEND == Opcode);
2828 }
2829 return false;
2830}
2831
2832static bool isLogicOp(unsigned Opcode) {
2833 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2834 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2835}
2836
2837static bool isTargetShuffle(unsigned Opcode) {
2838 switch(Opcode) {
2839 default: return false;
2840 case X86ISD::BLENDI:
2841 case X86ISD::PSHUFB:
2842 case X86ISD::PSHUFD:
2843 case X86ISD::PSHUFHW:
2844 case X86ISD::PSHUFLW:
2845 case X86ISD::SHUFP:
2846 case X86ISD::INSERTPS:
2847 case X86ISD::EXTRQI:
2848 case X86ISD::INSERTQI:
2849 case X86ISD::VALIGN:
2850 case X86ISD::PALIGNR:
2851 case X86ISD::VSHLDQ:
2852 case X86ISD::VSRLDQ:
2853 case X86ISD::MOVLHPS:
2854 case X86ISD::MOVHLPS:
2855 case X86ISD::MOVSHDUP:
2856 case X86ISD::MOVSLDUP:
2857 case X86ISD::MOVDDUP:
2858 case X86ISD::MOVSS:
2859 case X86ISD::MOVSD:
2860 case X86ISD::MOVSH:
2861 case X86ISD::UNPCKL:
2862 case X86ISD::UNPCKH:
2863 case X86ISD::VBROADCAST:
2864 case X86ISD::VPERMILPI:
2865 case X86ISD::VPERMILPV:
2866 case X86ISD::VPERM2X128:
2867 case X86ISD::SHUF128:
2868 case X86ISD::VPERMIL2:
2869 case X86ISD::VPERMI:
2870 case X86ISD::VPPERM:
2871 case X86ISD::VPERMV:
2872 case X86ISD::VPERMV3:
2873 case X86ISD::VZEXT_MOVL:
2874 return true;
2875 }
2876}
2877
2878static bool isTargetShuffleVariableMask(unsigned Opcode) {
2879 switch (Opcode) {
2880 default: return false;
2881 // Target Shuffles.
2882 case X86ISD::PSHUFB:
2883 case X86ISD::VPERMILPV:
2884 case X86ISD::VPERMIL2:
2885 case X86ISD::VPPERM:
2886 case X86ISD::VPERMV:
2887 case X86ISD::VPERMV3:
2888 return true;
2889 // 'Faux' Target Shuffles.
2890 case ISD::OR:
2891 case ISD::AND:
2892 case X86ISD::ANDNP:
2893 return true;
2894 }
2895}
2896
2899 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2901 int ReturnAddrIndex = FuncInfo->getRAIndex();
2902
2903 if (ReturnAddrIndex == 0) {
2904 // Set up a frame object for the return address.
2905 unsigned SlotSize = RegInfo->getSlotSize();
2906 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2907 -(int64_t)SlotSize,
2908 false);
2909 FuncInfo->setRAIndex(ReturnAddrIndex);
2910 }
2911
2912 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2913}
2914
2916 bool HasSymbolicDisplacement) {
 2917  // The offset should fit into a 32-bit immediate field.
2918 if (!isInt<32>(Offset))
2919 return false;
2920
2921 // If we don't have a symbolic displacement - we don't have any extra
2922 // restrictions.
2923 if (!HasSymbolicDisplacement)
2924 return true;
2925
2926 // We can fold large offsets in the large code model because we always use
2927 // 64-bit offsets.
2928 if (CM == CodeModel::Large)
2929 return true;
2930
 2931  // For the kernel code model we know that all objects reside in the negative
 2932  // half of the 32-bit address space. We do not accept negative offsets, since
 2933  // they may fall outside that range, but pretty large positive ones are fine.
2934 if (CM == CodeModel::Kernel)
2935 return Offset >= 0;
2936
 2937  // For other non-large code models we assume that the last small object ends
 2938  // at least 16MB before the end of the 31-bit boundary. We may also accept
 2939  // pretty large negative constants, knowing that all objects are in the
 2940  // positive half of the address space.
2941 return Offset < 16 * 1024 * 1024;
2942}
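// Illustrative uses of the rules above (a sketch, not from the original
// source; the argument order assumes Offset, CodeModel, HasSymbolicDisplacement):
//   X86::isOffsetSuitableForCodeModel(-8, CodeModel::Kernel, true)
//     -> false: kernel objects live in the negative half, so negative
//        offsets are rejected when a symbolic displacement is present.
//   X86::isOffsetSuitableForCodeModel(1 << 20, CodeModel::Small, true)
//     -> true: 1MB is below the 16MB guard zone.
//   X86::isOffsetSuitableForCodeModel(INT64_C(1) << 33, CodeModel::Large, true)
//     -> false: the offset no longer fits in a 32-bit immediate.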
2943
2944/// Return true if the condition is a signed comparison operation.
2945static bool isX86CCSigned(X86::CondCode X86CC) {
2946 switch (X86CC) {
2947 default:
2948 llvm_unreachable("Invalid integer condition!");
2949 case X86::COND_E:
2950 case X86::COND_NE:
2951 case X86::COND_B:
2952 case X86::COND_A:
2953 case X86::COND_BE:
2954 case X86::COND_AE:
2955 return false;
2956 case X86::COND_G:
2957 case X86::COND_GE:
2958 case X86::COND_L:
2959 case X86::COND_LE:
2960 return true;
2961 }
2962}
2963
2965 switch (SetCCOpcode) {
2966 // clang-format off
2967 default: llvm_unreachable("Invalid integer condition!");
2968 case ISD::SETEQ: return X86::COND_E;
2969 case ISD::SETGT: return X86::COND_G;
2970 case ISD::SETGE: return X86::COND_GE;
2971 case ISD::SETLT: return X86::COND_L;
2972 case ISD::SETLE: return X86::COND_LE;
2973 case ISD::SETNE: return X86::COND_NE;
2974 case ISD::SETULT: return X86::COND_B;
2975 case ISD::SETUGT: return X86::COND_A;
2976 case ISD::SETULE: return X86::COND_BE;
2977 case ISD::SETUGE: return X86::COND_AE;
2978 // clang-format on
2979 }
2980}
2981
2982/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
2983/// condition code, returning the condition code and the LHS/RHS of the
2984/// comparison to make.
2986 bool isFP, SDValue &LHS, SDValue &RHS,
2987 SelectionDAG &DAG) {
2988 if (!isFP) {
2990 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2991 // X > -1 -> X == 0, jump !sign.
2992 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2993 return X86::COND_NS;
2994 }
2995 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2996 // X < 0 -> X == 0, jump on sign.
2997 return X86::COND_S;
2998 }
2999 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3000 // X >= 0 -> X == 0, jump on !sign.
3001 return X86::COND_NS;
3002 }
3003 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3004 // X < 1 -> X <= 0
3005 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3006 return X86::COND_LE;
3007 }
3008 }
3009
3010 return TranslateIntegerX86CC(SetCCOpcode);
3011 }
3012
 3013  // First determine whether it is required or profitable to flip the operands.
3014
3015 // If LHS is a foldable load, but RHS is not, flip the condition.
3016 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3017 !ISD::isNON_EXTLoad(RHS.getNode())) {
3018 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3019 std::swap(LHS, RHS);
3020 }
3021
3022 switch (SetCCOpcode) {
3023 default: break;
3024 case ISD::SETOLT:
3025 case ISD::SETOLE:
3026 case ISD::SETUGT:
3027 case ISD::SETUGE:
3028 std::swap(LHS, RHS);
3029 break;
3030 }
3031
3032 // On a floating point condition, the flags are set as follows:
3033 // ZF PF CF op
3034 // 0 | 0 | 0 | X > Y
3035 // 0 | 0 | 1 | X < Y
3036 // 1 | 0 | 0 | X == Y
3037 // 1 | 1 | 1 | unordered
3038 switch (SetCCOpcode) {
3039 // clang-format off
3040 default: llvm_unreachable("Condcode should be pre-legalized away");
3041 case ISD::SETUEQ:
3042 case ISD::SETEQ: return X86::COND_E;
3043 case ISD::SETOLT: // flipped
3044 case ISD::SETOGT:
3045 case ISD::SETGT: return X86::COND_A;
3046 case ISD::SETOLE: // flipped
3047 case ISD::SETOGE:
3048 case ISD::SETGE: return X86::COND_AE;
3049 case ISD::SETUGT: // flipped
3050 case ISD::SETULT:
3051 case ISD::SETLT: return X86::COND_B;
3052 case ISD::SETUGE: // flipped
3053 case ISD::SETULE:
3054 case ISD::SETLE: return X86::COND_BE;
3055 case ISD::SETONE:
3056 case ISD::SETNE: return X86::COND_NE;
3057 case ISD::SETUO: return X86::COND_P;
3058 case ISD::SETO: return X86::COND_NP;
3059 case ISD::SETOEQ:
3060 case ISD::SETUNE: return X86::COND_INVALID;
3061 // clang-format on
3062 }
3063}
3064
3065/// Is there a floating point cmov for the specific X86 condition code?
3066/// The current x86 ISA includes the following FP cmov instructions:
3067/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3068static bool hasFPCMov(unsigned X86CC) {
3069 switch (X86CC) {
3070 default:
3071 return false;
3072 case X86::COND_B:
3073 case X86::COND_BE:
3074 case X86::COND_E:
3075 case X86::COND_P:
3076 case X86::COND_A:
3077 case X86::COND_AE:
3078 case X86::COND_NE:
3079 case X86::COND_NP:
3080 return true;
3081 }
3082}
3083
3084static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3085 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3086 VT.is512BitVector();
3087}
3088
3090 const CallInst &I,
3091 MachineFunction &MF,
3092 unsigned Intrinsic) const {
3093 Info.flags = MachineMemOperand::MONone;
3094 Info.offset = 0;
3095
3097 if (!IntrData) {
3098 switch (Intrinsic) {
3099 case Intrinsic::x86_aesenc128kl:
3100 case Intrinsic::x86_aesdec128kl:
3101 Info.opc = ISD::INTRINSIC_W_CHAIN;
3102 Info.ptrVal = I.getArgOperand(1);
3103 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3104 Info.align = Align(1);
3105 Info.flags |= MachineMemOperand::MOLoad;
3106 return true;
3107 case Intrinsic::x86_aesenc256kl:
3108 case Intrinsic::x86_aesdec256kl:
3109 Info.opc = ISD::INTRINSIC_W_CHAIN;
3110 Info.ptrVal = I.getArgOperand(1);
3111 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3112 Info.align = Align(1);
3113 Info.flags |= MachineMemOperand::MOLoad;
3114 return true;
3115 case Intrinsic::x86_aesencwide128kl:
3116 case Intrinsic::x86_aesdecwide128kl:
3117 Info.opc = ISD::INTRINSIC_W_CHAIN;
3118 Info.ptrVal = I.getArgOperand(0);
3119 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3120 Info.align = Align(1);
3121 Info.flags |= MachineMemOperand::MOLoad;
3122 return true;
3123 case Intrinsic::x86_aesencwide256kl:
3124 case Intrinsic::x86_aesdecwide256kl:
3125 Info.opc = ISD::INTRINSIC_W_CHAIN;
3126 Info.ptrVal = I.getArgOperand(0);
3127 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3128 Info.align = Align(1);
3129 Info.flags |= MachineMemOperand::MOLoad;
3130 return true;
3131 case Intrinsic::x86_cmpccxadd32:
3132 case Intrinsic::x86_cmpccxadd64:
3133 case Intrinsic::x86_atomic_bts:
3134 case Intrinsic::x86_atomic_btc:
3135 case Intrinsic::x86_atomic_btr: {
3136 Info.opc = ISD::INTRINSIC_W_CHAIN;
3137 Info.ptrVal = I.getArgOperand(0);
3138 unsigned Size = I.getType()->getScalarSizeInBits();
3139 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3140 Info.align = Align(Size);
3143 return true;
3144 }
3145 case Intrinsic::x86_atomic_bts_rm:
3146 case Intrinsic::x86_atomic_btc_rm:
3147 case Intrinsic::x86_atomic_btr_rm: {
3148 Info.opc = ISD::INTRINSIC_W_CHAIN;
3149 Info.ptrVal = I.getArgOperand(0);
3150 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3151 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3152 Info.align = Align(Size);
3155 return true;
3156 }
3157 case Intrinsic::x86_aadd32:
3158 case Intrinsic::x86_aadd64:
3159 case Intrinsic::x86_aand32:
3160 case Intrinsic::x86_aand64:
3161 case Intrinsic::x86_aor32:
3162 case Intrinsic::x86_aor64:
3163 case Intrinsic::x86_axor32:
3164 case Intrinsic::x86_axor64:
3165 case Intrinsic::x86_atomic_add_cc:
3166 case Intrinsic::x86_atomic_sub_cc:
3167 case Intrinsic::x86_atomic_or_cc:
3168 case Intrinsic::x86_atomic_and_cc:
3169 case Intrinsic::x86_atomic_xor_cc: {
3170 Info.opc = ISD::INTRINSIC_W_CHAIN;
3171 Info.ptrVal = I.getArgOperand(0);
3172 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3173 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3174 Info.align = Align(Size);
3177 return true;
3178 }
3179 }
3180 return false;
3181 }
3182
3183 switch (IntrData->Type) {
3186 case TRUNCATE_TO_MEM_VI32: {
3187 Info.opc = ISD::INTRINSIC_VOID;
3188 Info.ptrVal = I.getArgOperand(0);
3189 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3191 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3192 ScalarVT = MVT::i8;
3193 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3194 ScalarVT = MVT::i16;
3195 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3196 ScalarVT = MVT::i32;
3197
3198 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3199 Info.align = Align(1);
3200 Info.flags |= MachineMemOperand::MOStore;
3201 break;
3202 }
3203 case GATHER:
3204 case GATHER_AVX2: {
3205 Info.opc = ISD::INTRINSIC_W_CHAIN;
3206 Info.ptrVal = nullptr;
3207 MVT DataVT = MVT::getVT(I.getType());
3208 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3209 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3210 IndexVT.getVectorNumElements());
3211 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3212 Info.align = Align(1);
3213 Info.flags |= MachineMemOperand::MOLoad;
3214 break;
3215 }
3216 case SCATTER: {
3217 Info.opc = ISD::INTRINSIC_VOID;
3218 Info.ptrVal = nullptr;
3219 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3220 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3221 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3222 IndexVT.getVectorNumElements());
3223 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3224 Info.align = Align(1);
3225 Info.flags |= MachineMemOperand::MOStore;
3226 break;
3227 }
3228 default:
3229 return false;
3230 }
3231
3232 return true;
3233}
3234
3235/// Returns true if the target can instruction select the
3236/// specified FP immediate natively. If false, the legalizer will
3237/// materialize the FP immediate as a load from a constant pool.
3239 bool ForCodeSize) const {
3240 for (const APFloat &FPImm : LegalFPImmediates)
3241 if (Imm.bitwiseIsEqual(FPImm))
3242 return true;
3243 return false;
3244}
3245
3247 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
3248 std::optional<unsigned> ByteOffset) const {
3249 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3250
3251 auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
3252 while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
3253 N = *N->user_begin();
3254 return N;
3255 };
3256
3257 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
 3258  // relocations target a movq or addq instruction: don't let the load shrink.
3259 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3260 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3261 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3262 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3263
 3264  // If this is (1) an AVX vector load with (2) multiple uses, and either (3)
 3265  // all of those uses are extracted directly into a store (so the extract +
 3266  // store can be store-folded), or (4) some use is a legal full-width
 3267  // instruction, then it's probably not worth splitting the load.
3268 EVT VT = Load->getValueType(0);
3269 if ((VT.is256BitVector() || VT.is512BitVector()) &&
3270 !SDValue(Load, 0).hasOneUse()) {
3271 bool FullWidthUse = false;
3272 bool AllExtractStores = true;
3273 for (SDUse &Use : Load->uses()) {
3274 // Skip uses of the chain value. Result 0 of the node is the load value.
3275 if (Use.getResNo() != 0)
3276 continue;
3277
3278 const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
3279
3280 // If this use is an extract + store, it's probably not worth splitting.
3281 if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3282 all_of(User->uses(), [&](const SDUse &U) {
3283 const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3284 return Inner->getOpcode() == ISD::STORE;
3285 }))
3286 continue;
3287
3288 AllExtractStores = false;
3289
 3290      // If any use is a full-width legal/target binop, then assume it's legal
 3291      // and won't be split.
3292 if (isBinOp(User->getOpcode()) &&
3293 (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
3294 User->getOpcode() > ISD::BUILTIN_OP_END))
3295 FullWidthUse = true;
3296 }
3297
3298 if (AllExtractStores)
3299 return false;
3300
 3301    // If we have a user that uses the full vector width, then this load is
 3302    // only worth splitting if the offset isn't 0 (to avoid an
 3303    // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
3304 if (FullWidthUse)
3305 return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
3306 }
3307
3308 return true;
3309}
3310
3311/// Returns true if it is beneficial to convert a load of a constant
3312/// to just the constant itself.
3314 Type *Ty) const {
3315 assert(Ty->isIntegerTy());
3316
3317 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3318 if (BitSize == 0 || BitSize > 64)
3319 return false;
3320 return true;
3321}
3322
3324 // If we are using XMM registers in the ABI and the condition of the select is
3325 // a floating-point compare and we have blendv or conditional move, then it is
3326 // cheaper to select instead of doing a cross-register move and creating a
3327 // load that depends on the compare result.
3328 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3329 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3330}
3331
3333 // TODO: It might be a win to ease or lift this restriction, but the generic
3334 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3335 if (VT.isVector() && Subtarget.hasAVX512())
3336 return false;
3337
3338 return true;
3339}
3340
3342 SDValue C) const {
3343 // TODO: We handle scalars using custom code, but generic combining could make
3344 // that unnecessary.
3345 APInt MulC;
3346 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3347 return false;
3348
 3349  // Find the type this will be legalized to. Otherwise we might prematurely
3350 // convert this to shl+add/sub and then still have to type legalize those ops.
3351 // Another choice would be to defer the decision for illegal types until
3352 // after type legalization. But constant splat vectors of i64 can't make it
3353 // through type legalization on 32-bit targets so we would need to special
3354 // case vXi64.
3355 while (getTypeAction(Context, VT) != TypeLegal)
3356 VT = getTypeToTransformTo(Context, VT);
3357
3358 // If vector multiply is legal, assume that's faster than shl + add/sub.
 3359  // Multiply is a complex op with higher latency and lower throughput in
 3360  // most implementations; sub-vXi32 vector multiplies are always fast,
 3361  // vXi32 must not have a slow PMULLD implementation, and anything larger
 3362  // (vXi64) is always going to be slow.
3363 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3364 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3365 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3366 return false;
3367
3368 // shl+add, shl+sub, shl+add+neg
3369 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3370 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3371}
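// Worked examples for the final check above (illustrative): MulC == 5 has
// MulC - 1 == 4, a power of two, so "mul x, 5" can become "(x << 2) + x";
// MulC == 7 has MulC + 1 == 8, giving "(x << 3) - x"; MulC == 22 matches
// none of the patterns, so the multiply is kept.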
3372
3374 unsigned Index) const {
3376 return false;
3377
3378 // Mask vectors support all subregister combinations and operations that
3379 // extract half of vector.
3380 if (ResVT.getVectorElementType() == MVT::i1)
3381 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3382 (Index == ResVT.getVectorNumElements()));
3383
3384 return (Index % ResVT.getVectorNumElements()) == 0;
3385}
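// Illustrative example of the index rule above (assuming the earlier
// legality check passes): extracting v4i32 from v8i32 at Index 4 is an
// aligned half and therefore cheap, while Index 2 is not a multiple of the
// result's element count and is not considered cheap.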
3386
3388 unsigned Opc = VecOp.getOpcode();
3389
3390 // Assume target opcodes can't be scalarized.
3391 // TODO - do we have any exceptions?
3392 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3393 return false;
3394
3395 // If the vector op is not supported, try to convert to scalar.
3396 EVT VecVT = VecOp.getValueType();
3398 return true;
3399
3400 // If the vector op is supported, but the scalar op is not, the transform may
3401 // not be worthwhile.
3402 EVT ScalarVT = VecVT.getScalarType();
3403 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3404}
3405
3407 bool) const {
3408 // TODO: Allow vectors?
3409 if (VT.isVector())
3410 return false;
3411 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3412}
3413
3415 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3416 // i32/i64 or can rely on BSF passthrough value.
3417 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3418 Subtarget.hasBitScanPassThrough() ||
3419 (!Ty->isVectorTy() &&
3420 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3421}
3422
3424 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3425 // passthrough value.
3426 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3427 Subtarget.hasBitScanPassThrough();
3428}
3429
3431 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3432 // expensive than a straight movsd. On the other hand, it's important to
3433 // shrink long double fp constant since fldt is very slow.
3434 return !Subtarget.hasSSE2() || VT == MVT::f80;
3435}
3436
3438 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3439 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3440}
3441
3443 const SelectionDAG &DAG,
3444 const MachineMemOperand &MMO) const {
3445 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3446 BitcastVT.getVectorElementType() == MVT::i1)
3447 return false;
3448
3449 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3450 return false;
3451
3452 // If both types are legal vectors, it's always ok to convert them.
3453 if (LoadVT.isVector() && BitcastVT.isVector() &&
3454 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3455 return true;
3456
3457 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3458}
3459
3461 const MachineFunction &MF) const {
 3462  // Do not merge to float value size (128 bits) if no implicit
3463 // float attribute is set.
3464 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3465
3466 if (NoFloat) {
3467 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3468 return (MemVT.getSizeInBits() <= MaxIntSize);
3469 }
3470 // Make sure we don't merge greater than our preferred vector
3471 // width.
3472 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3473 return false;
3474
3475 return true;
3476}
3477
3479 return Subtarget.hasFastLZCNT();
3480}
3481
3483 const Instruction &AndI) const {
3484 return true;
3485}
3486
3488 EVT VT = Y.getValueType();
3489
3490 if (VT.isVector())
3491 return false;
3492
3493 if (!Subtarget.hasBMI())
3494 return false;
3495
3496 // There are only 32-bit and 64-bit forms for 'andn'.
3497 if (VT != MVT::i32 && VT != MVT::i64)
3498 return false;
3499
3500 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3501}
3502
3504 EVT VT = Y.getValueType();
3505
3506 if (!VT.isVector())
3507 return hasAndNotCompare(Y);
3508
3509 // Vector.
3510
3511 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3512 return false;
3513
3514 if (VT == MVT::v4i32)
3515 return true;
3516
3517 return Subtarget.hasSSE2();
3518}
3519
3521 return X.getValueType().isScalarInteger(); // 'bt'
3522}
3523
3527 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3528 SelectionDAG &DAG) const {
 3529  // Does the baseline recommend not performing the fold by default?
3531 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3532 return false;
3533 // For scalars this transform is always beneficial.
3534 if (X.getValueType().isScalarInteger())
3535 return true;
 3536  // If all the shift amounts are identical, then the transform is beneficial even
3537 // with rudimentary SSE2 shifts.
3538 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3539 return true;
 3540  // If we have AVX2 with its powerful shift operations, then it's also good.
3541 if (Subtarget.hasAVX2())
3542 return true;
3543 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3544 return NewShiftOpcode == ISD::SHL;
3545}
3546
3548 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3549 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3550 if (!VT.isInteger())
3551 return ShiftOpc;
3552
3553 bool PreferRotate = false;
3554 if (VT.isVector()) {
 3555    // For vectors, if we have rotate instruction support, then it's definitely
 3556    // best. Otherwise it's not clear what's best, so just don't make changes.
3557 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3558 VT.getScalarType() == MVT::i64);
3559 } else {
 3560    // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
 3561    // rotate unless we have a zext mask+shr.
3562 PreferRotate = Subtarget.hasBMI2();
3563 if (!PreferRotate) {
3564 unsigned MaskBits =
3565 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3566 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3567 }
3568 }
3569
3570 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3571 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3572
3573 if (PreferRotate && MayTransformRotate)
3574 return ISD::ROTL;
3575
 3576    // For vectors we don't really get much benefit from swapping around
 3577    // constants. Maybe in the future we could check whether the DAG already
 3578    // has the flipped node.
3579 if (VT.isVector())
3580 return ShiftOpc;
3581
 3582    // See if it's beneficial to swap the shift type.
3583 if (ShiftOpc == ISD::SHL) {
3584 // If the current setup has imm64 mask, then inverse will have
3585 // at least imm32 mask (or be zext i32 -> i64).
3586 if (VT == MVT::i64)
3587 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3588 : ShiftOpc;
3589
 3590      // We can only benefit if the mask requires at least 7 bits. We
 3591      // don't want to replace shl by 1, 2 or 3, as those can be implemented
 3592      // with lea/add.
3593 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3594 }
3595
3596 if (VT == MVT::i64)
3597 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3598 // extremely efficient.
3599 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3600
3601 // Keep small shifts as shl so we can generate add/lea.
3602 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3603 }
3604
 3605  // We prefer rotate for vectors, or if we won't get a zext mask with SRL
 3606  // (PreferRotate will be set in the latter case).
3607 if (PreferRotate || !MayTransformRotate || VT.isVector())
3608 return ShiftOpc;
3609
3610 // Non-vector type and we have a zext mask with SRL.
3611 return ISD::SRL;
3612}
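// Illustrative example of the vector path above (a sketch): for a v8i32
// shift+and pattern on an AVX512 target, rotate instructions
// (vprold/vprolq) are available, so PreferRotate is set and, when the
// pattern may be expressed as a rotate, ISD::ROTL is returned instead of
// the original shift opcode.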
3613
3616 const Value *Lhs,
3617 const Value *Rhs) const {
3618 using namespace llvm::PatternMatch;
3619 int BaseCost = BrMergingBaseCostThresh.getValue();
3620 // With CCMP, branches can be merged in a more efficient way.
3621 if (BaseCost >= 0 && Subtarget.hasCCMP())
3622 BaseCost += BrMergingCcmpBias;
3623 // a == b && a == c is a fast pattern on x86.
3624 if (BaseCost >= 0 && Opc == Instruction::And &&
3627 BaseCost += 1;
3628 return {BaseCost, BrMergingLikelyBias.getValue(),
3629 BrMergingUnlikelyBias.getValue()};
3630}
3631
3633 return N->getOpcode() != ISD::FP_EXTEND;
3634}
3635
3637 const SDNode *N, CombineLevel Level) const {
3638 assert(((N->getOpcode() == ISD::SHL &&
3639 N->getOperand(0).getOpcode() == ISD::SRL) ||
3640 (N->getOpcode() == ISD::SRL &&
3641 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3642 "Expected shift-shift mask");
3643 // TODO: Should we always create i64 masks? Or only folded immediates?
3644 EVT VT = N->getValueType(0);
3645 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3646 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3647 // Only fold if the shift values are equal - so it folds to AND.
3648 // TODO - we should fold if either is a non-uniform vector but we don't do
3649 // the fold for non-splats yet.
3650 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3651 }
3653}
3654
3656 EVT VT = Y.getValueType();
3657
3658 // For vectors, we don't have a preference, but we probably want a mask.
3659 if (VT.isVector())
3660 return false;
3661
3662 unsigned MaxWidth = Subtarget.is64Bit() ? 64 : 32;
3663 return VT.getScalarSizeInBits() <= MaxWidth;
3664}
3665
3668 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3670 !Subtarget.isOSWindows())
3673 ExpansionFactor);
3674}
3675
3677 // Any legal vector type can be splatted more efficiently than
3678 // loading/spilling from memory.
3679 return isTypeLegal(VT);
3680}
3681
3683 MVT VT = MVT::getIntegerVT(NumBits);
3684 if (isTypeLegal(VT))
3685 return VT;
3686
3687 // PMOVMSKB can handle this.
3688 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3689 return MVT::v16i8;
3690
3691 // VPMOVMSKB can handle this.
3692 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3693 return MVT::v32i8;
3694
3695 // TODO: Allow 64-bit type for 32-bit target.
3696 // TODO: 512-bit types should be allowed, but make sure that those
3697 // cases are handled in combineVectorSizedSetCCEquality().
3698
3700}
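// Illustrative mapping for the cases above (a sketch): a 128-bit equality
// compare is answered with MVT::v16i8 so it can lower to pcmpeqb+pmovmskb,
// a 256-bit one with MVT::v32i8 for vpcmpeqb+vpmovmskb, and other sizes
// either use a plain legal integer type or report no fast type.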
3701
3702/// Val is the undef sentinel value or equal to the specified value.
3703static bool isUndefOrEqual(int Val, int CmpVal) {
3704 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3705}
3706
3707/// Return true if every element in Mask is the undef sentinel value or equal to
3708/// the specified value.
3709static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3710 return llvm::all_of(Mask, [CmpVal](int M) {
3711 return (M == SM_SentinelUndef) || (M == CmpVal);
3712 });
3713}
3714
3715/// Return true if every element in Mask, beginning from position Pos and ending
3716/// in Pos+Size is the undef sentinel value or equal to the specified value.
3717static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3718 unsigned Size) {
3719 return llvm::all_of(Mask.slice(Pos, Size),
3720 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3721}
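// Illustrative uses of the helpers above (a sketch; SM_SentinelUndef is the
// -1 sentinel):
//   isUndefOrEqual({-1, 3, 3, -1}, 3)             -> true
//   isUndefOrEqualInRange({0, -1, 3, 3}, 3, 2, 2) -> true (only elements
//                                                    2..3 are inspected)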
3722
3723/// Val is either the undef or zero sentinel value.
3724static bool isUndefOrZero(int Val) {
3725 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3726}
3727
3728/// Return true if every element in Mask, beginning from position Pos and ending
3729/// in Pos+Size is the undef sentinel value.
3730static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3731 return llvm::all_of(Mask.slice(Pos, Size),
3732 [](int M) { return M == SM_SentinelUndef; });
3733}
3734
3735/// Return true if the mask creates a vector whose lower half is undefined.
3737 unsigned NumElts = Mask.size();
3738 return isUndefInRange(Mask, 0, NumElts / 2);
3739}
3740
3741/// Return true if the mask creates a vector whose upper half is undefined.
3743 unsigned NumElts = Mask.size();
3744 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3745}
3746
3747/// Return true if Val falls within the specified range [Low, Hi).
3748static bool isInRange(int Val, int Low, int Hi) {
3749 return (Val >= Low && Val < Hi);
3750}
3751
3752/// Return true if the value of any element in Mask falls within the specified
3753/// range [Low, Hi).
3754static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3755 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3756}
3757
3758/// Return true if the value of any element in Mask is the zero sentinel value.
3759static bool isAnyZero(ArrayRef<int> Mask) {
3760 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3761}
3762
3763/// Return true if Val is undef or if its value falls within the
3764/// specified range [Low, Hi).
3765static bool isUndefOrInRange(int Val, int Low, int Hi) {
3766 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3767}
3768
3769/// Return true if every element in Mask is undef or if its value
3770/// falls within the specified range [Low, Hi).
3771static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3772 return llvm::all_of(
3773 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3774}
3775
3776/// Return true if Val is undef, zero or if its value falls within the
3777/// specified range [Low, Hi).
3778static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3779 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3780}
3781
3782/// Return true if every element in Mask is undef, zero or if its value
3783/// falls within the specified range [Low, Hi).
3784static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3785 return llvm::all_of(
3786 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3787}
3788
3789/// Return true if every element in Mask is an in-place blend/select mask or is
3790/// undef.
3792 unsigned NumElts = Mask.size();
3793 for (auto [I, M] : enumerate(Mask))
3794 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3795 return false;
3796 return true;
3797}
3798
3799/// Return true if every element in Mask, beginning
3800/// from position Pos and ending in Pos + Size, falls within the specified
3801/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3802static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3803 unsigned Size, int Low, int Step = 1) {
3804 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3805 if (!isUndefOrEqual(Mask[i], Low))
3806 return false;
3807 return true;
3808}
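// Illustrative example: isSequentialOrUndefInRange({0, -1, 2, 3}, 0, 4, 0)
// returns true, since the defined elements follow the sequence 0, 1, 2, 3
// with the default Step of 1.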
3809
3810/// Return true if every element in Mask, beginning
3811/// from position Pos and ending in Pos+Size, falls within the specified
3812/// sequential range [Low, Low+Size), or is undef or is zero.
3814 unsigned Size, int Low,
3815 int Step = 1) {
3816 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3817 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3818 return false;
3819 return true;
3820}
3821
3822/// Return true if every element in Mask, beginning
3823/// from position Pos and ending in Pos+Size is undef or is zero.
3824static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3825 unsigned Size) {
3826 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3827}
3828
3829/// Return true if every element of a single input is referenced by the shuffle
3830/// mask. i.e. it just permutes them all.
3832 unsigned NumElts = Mask.size();
3833 APInt DemandedElts = APInt::getZero(NumElts);
3834 for (int M : Mask)
3835 if (isInRange(M, 0, NumElts))
3836 DemandedElts.setBit(M);
3837 return DemandedElts.isAllOnes();
3838}
3839
3840/// Helper function to test whether a shuffle mask could be
3841/// simplified by widening the elements being shuffled.
3842///
3843/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3844/// leaves it in an unspecified state.
3845///
3846/// NOTE: This must handle normal vector shuffle masks and *target* vector
3847/// shuffle masks. The latter have the special property of a '-2' representing
3848/// a zero-ed lane of a vector.
3850 SmallVectorImpl<int> &WidenedMask) {
3851 WidenedMask.assign(Mask.size() / 2, 0);
3852 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3853 int M0 = Mask[i];
3854 int M1 = Mask[i + 1];
3855
3856 // If both elements are undef, its trivial.
3857 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3858 WidenedMask[i / 2] = SM_SentinelUndef;
3859 continue;
3860 }
3861
3862 // Check for an undef mask and a mask value properly aligned to fit with
3863 // a pair of values. If we find such a case, use the non-undef mask's value.
3864 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3865 WidenedMask[i / 2] = M1 / 2;
3866 continue;
3867 }
3868 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3869 WidenedMask[i / 2] = M0 / 2;
3870 continue;
3871 }
3872
3873 // When zeroing, we need to spread the zeroing across both lanes to widen.
3874 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3875 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3877 WidenedMask[i / 2] = SM_SentinelZero;
3878 continue;
3879 }
3880 return false;
3881 }
3882
3883 // Finally check if the two mask values are adjacent and aligned with
3884 // a pair.
3885 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3886 WidenedMask[i / 2] = M0 / 2;
3887 continue;
3888 }
3889
3890 // Otherwise we can't safely widen the elements used in this shuffle.
3891 return false;
3892 }
3893 assert(WidenedMask.size() == Mask.size() / 2 &&
3894 "Incorrect size of mask after widening the elements!");
3895
3896 return true;
3897}
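// Worked example (illustrative): the v4 mask <0, 1, 6, 7> widens to the v2
// mask <0, 3>, while <1, 0, 6, 7> cannot be widened because its first pair
// is neither undef, zeroable, nor an adjacent aligned pair.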
3898
3900 const APInt &Zeroable,
3901 bool V2IsZero,
3902 SmallVectorImpl<int> &WidenedMask) {
3903 // Create an alternative mask with info about zeroable elements.
3904 // Here we do not set undef elements as zeroable.
3905 SmallVector<int, 64> ZeroableMask(Mask);
3906 if (V2IsZero) {
3907 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3908 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3909 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3910 ZeroableMask[i] = SM_SentinelZero;
3911 }
3912 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3913}
3914
3915static bool canWidenShuffleElements(ArrayRef<int> Mask) {
3916  SmallVector<int, 32> WidenedMask;
3917 return canWidenShuffleElements(Mask, WidenedMask);
3918}
3919
3920// Attempt to narrow/widen shuffle mask until it matches the target number of
3921// elements.
3922static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3923 SmallVectorImpl<int> &ScaledMask) {
3924 unsigned NumSrcElts = Mask.size();
3925 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3926 "Illegal shuffle scale factor");
3927
3928 // Narrowing is guaranteed to work.
3929 if (NumDstElts >= NumSrcElts) {
3930 int Scale = NumDstElts / NumSrcElts;
3931 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3932 return true;
3933 }
3934
3935 // We have to repeat the widening until we reach the target size, but we can
3936 // split out the first widening as it sets up ScaledMask for us.
3937 if (canWidenShuffleElements(Mask, ScaledMask)) {
3938 while (ScaledMask.size() > NumDstElts) {
3939 SmallVector<int, 16> WidenedMask;
3940 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3941 return false;
3942 ScaledMask = std::move(WidenedMask);
3943 }
3944 return true;
3945 }
3946
3947 return false;
3948}
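// Illustrative example (editorial note, not part of the original source):
// scaling the 4-element mask { 2, 3, 0, 1 } to other widths gives
//   NumDstElts = 8 (narrowing): { 4, 5, 6, 7, 0, 1, 2, 3 }
//   NumDstElts = 2 (widening):  { 1, 0 }
// Widening succeeds here because the adjacent pairs (2,3) and (0,1) are
// aligned; a rotated mask such as { 1, 2, 3, 0 } cannot be widened.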
3949
3950static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3951 SmallVector<int, 32> ScaledMask;
3952 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3953}
3954
3955// Helper to grow the shuffle mask for a larger value type.
3956// NOTE: This differs from scaleShuffleElements, which keeps the same overall
3957// vector size.
3957static void growShuffleMask(ArrayRef<int> SrcMask,
3958 SmallVectorImpl<int> &DstMask,
3959 unsigned SrcSizeInBits, unsigned DstSizeInBits) {
3960  assert(DstMask.empty() && "Expected an empty shuffle mask");
3961 assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
3962 unsigned Scale = DstSizeInBits / SrcSizeInBits;
3963 unsigned NumSrcElts = SrcMask.size();
3964 DstMask.assign(SrcMask.begin(), SrcMask.end());
3965 for (int &M : DstMask) {
3966 if (M < 0)
3967 continue;
3968 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
3969 }
3970 DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
3971}
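// Illustrative example (editorial note, not part of the original source):
// growing the 128-bit, 4-element mask { 0, 4, 1, 5 } to a 256-bit mask gives
//   DstMask = { 0, 8, 1, 9, -1, -1, -1, -1 }
// Indices referring to the second operand (>= NumSrcElts) are rebased so they
// still point into the second, now wider, operand; the new upper elements are
// left undef.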
3972
3973/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3974bool X86::isZeroNode(SDValue Elt) {
3975  return isNullConstant(Elt) || isNullFPConstant(Elt);
3976}
3977
3978// Build a vector of constants.
3979// Use an UNDEF node if MaskElt == -1.
3980// Split 64-bit constants in the 32-bit mode.
3981static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
3982                              const SDLoc &dl, bool IsMask = false) {
3983
3984  SmallVector<SDValue, 32> Ops;
3985  bool Split = false;
3986
3987 MVT ConstVecVT = VT;
3988 unsigned NumElts = VT.getVectorNumElements();
3989 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3990 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3991 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3992 Split = true;
3993 }
3994
3995 MVT EltVT = ConstVecVT.getVectorElementType();
3996 for (unsigned i = 0; i < NumElts; ++i) {
3997 bool IsUndef = Values[i] < 0 && IsMask;
3998 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
3999 DAG.getConstant(Values[i], dl, EltVT);
4000 Ops.push_back(OpNode);
4001 if (Split)
4002 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4003 DAG.getConstant(0, dl, EltVT));
4004 }
4005 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4006 if (Split)
4007 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4008 return ConstsNode;
4009}
4010
4011static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
4012 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4013 assert(Bits.size() == Undefs.getBitWidth() &&
4014 "Unequal constant and undef arrays");
4015  SmallVector<SDValue, 32> Ops;
4016  bool Split = false;
4017
4018 MVT ConstVecVT = VT;
4019 unsigned NumElts = VT.getVectorNumElements();
4020 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4021 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4022 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4023 Split = true;
4024 }
4025
4026 MVT EltVT = ConstVecVT.getVectorElementType();
4027 MVT EltIntVT = EltVT.changeTypeToInteger();
4028 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4029 if (Undefs[i]) {
4030 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4031 continue;
4032 }
4033 const APInt &V = Bits[i];
4034 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4035 if (Split) {
4036 Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
4037 Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
4038 } else {
4039 Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
4040 }
4041 }
4042
4043 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4044 return DAG.getBitcast(VT, ConstsNode);
4045}
4046
4047static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
4048                              SelectionDAG &DAG, const SDLoc &dl) {
4049 APInt Undefs = APInt::getZero(Bits.size());
4050 return getConstVector(Bits, Undefs, VT, DAG, dl);
4051}
4052
4053/// Returns a vector of specified type with all zero elements.
4054static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4055 SelectionDAG &DAG, const SDLoc &dl) {
4056 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4057 VT.getVectorElementType() == MVT::i1) &&
4058 "Unexpected vector type");
4059
4060 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4061 // type. This ensures they get CSE'd. But if the integer type is not
4062 // available, use a floating-point +0.0 instead.
4063 SDValue Vec;
4064 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4065 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4066 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4067  } else if (VT.isFloatingPoint() &&
4068             TLI.isTypeLegal(VT.getVectorElementType())) {
4069    Vec = DAG.getConstantFP(+0.0, dl, VT);
4070 } else if (VT.getVectorElementType() == MVT::i1) {
4071 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4072 "Unexpected vector type");
4073 Vec = DAG.getConstant(0, dl, VT);
4074 } else {
4075 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4076 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4077 }
4078 return DAG.getBitcast(VT, Vec);
4079}
4080
4081// Helper to determine if the ops are extracted subvectors that all come from a
4082// single source. If we allow commute they don't have to be in order (Lo/Hi).
4083static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4084 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4085 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4086 LHS.getValueType() != RHS.getValueType() ||
4087 LHS.getOperand(0) != RHS.getOperand(0))
4088 return SDValue();
4089
4090 SDValue Src = LHS.getOperand(0);
4091 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4092 return SDValue();
4093
4094 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4095 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4096 RHS.getConstantOperandAPInt(1) == NumElts) ||
4097 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4098 LHS.getConstantOperandAPInt(1) == NumElts))
4099 return Src;
4100
4101 return SDValue();
4102}
4103
4104static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4105 const SDLoc &dl, unsigned vectorWidth) {
4106 EVT VT = Vec.getValueType();
4107 EVT ElVT = VT.getVectorElementType();
4108 unsigned ResultNumElts =
4109 (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
4110 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
4111
4112 assert(ResultVT.getSizeInBits() == vectorWidth &&
4113 "Illegal subvector extraction");
4114
4115 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4116 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4117 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4118
4119 // This is the index of the first element of the vectorWidth-bit chunk
4120 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4121 IdxVal &= ~(ElemsPerChunk - 1);
4122
4123 // If the input is a buildvector just emit a smaller one.
4124 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4125 return DAG.getBuildVector(ResultVT, dl,
4126 Vec->ops().slice(IdxVal, ElemsPerChunk));
4127
4128 // Check if we're extracting the upper undef of a widening pattern.
4129 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4130 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4131 isNullConstant(Vec.getOperand(2)))
4132 return DAG.getUNDEF(ResultVT);
4133
4134 return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
4135}
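// Illustrative example (editorial note, not part of the original source):
// extracting 128 bits from a v8i32 source with IdxVal = 5: ElemsPerChunk is 4,
// so the index is rounded down to 4 and the upper v4i32 half (elements 4..7)
// is returned, matching what the VEXTRACTF128/VEXTRACTI128 patterns expect.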
4136
4137/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4138/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4139/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4140/// instructions or a simple subregister reference. Idx is an index in the
4141/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4142/// lowering EXTRACT_VECTOR_ELT operations easier.
4143static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4144 SelectionDAG &DAG, const SDLoc &dl) {
4145  assert((Vec.getValueType().is256BitVector() ||
4146          Vec.getValueType().is512BitVector()) &&
4147 "Unexpected vector size!");
4148 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4149}
4150
4151/// Generate a DAG to grab 256-bits from a 512-bit vector.
4152static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4153 SelectionDAG &DAG, const SDLoc &dl) {
4154 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4155 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4156}
4157
4158static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4159 SelectionDAG &DAG, const SDLoc &dl,
4160 unsigned vectorWidth) {
4161 assert((vectorWidth == 128 || vectorWidth == 256) &&
4162 "Unsupported vector width");
4163  // Inserting UNDEF is a no-op: just return Result.
4164 if (Vec.isUndef())
4165 return Result;
4166
4167 // Insert the relevant vectorWidth bits.
4168 EVT VT = Vec.getValueType();
4169 unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
4170 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4171
4172 // This is the index of the first element of the vectorWidth-bit chunk
4173 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4174 IdxVal &= ~(ElemsPerChunk - 1);
4175 return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
4176}
4177
4178/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4179/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4180/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4181/// simple superregister reference. Idx is an index in the 128 bits
4182/// we want. It need not be aligned to a 128-bit boundary. That makes
4183/// lowering INSERT_VECTOR_ELT operations easier.
4184static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4185 SelectionDAG &DAG, const SDLoc &dl) {
4186 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4187 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4188}
4189
4190/// Widen a vector to a larger size with the same scalar type, with the new
4191/// elements either zero or undef.
4192static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4193 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4194 const SDLoc &dl) {
4195 EVT VecVT = Vec.getValueType();
4196  assert(VecVT.getFixedSizeInBits() <= VT.getFixedSizeInBits() &&
4197         VecVT.getScalarType() == VT.getScalarType() &&
4198 "Unsupported vector widening type");
4199 // If the upper 128-bits of a build vector are already undef/zero, then try to
4200 // widen from the lower 128-bits.
4201 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4202 unsigned NumSrcElts = VecVT.getVectorNumElements();
4203 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4204 if (all_of(Hi, [&](SDValue V) {
4205 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4206 }))
4207 Vec = extract128BitVector(Vec, 0, DAG, dl);
4208 }
4209 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4210 : DAG.getUNDEF(VT);
4211 return DAG.getInsertSubvector(dl, Res, Vec, 0);
4212}
4213
4214/// Widen a vector to a larger size with the same scalar type, with the new
4215/// elements either zero or undef.
4216static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4217 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4218 const SDLoc &dl, unsigned WideSizeInBits) {
4219 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4220 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4221 "Unsupported vector widening type");
4222 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4223 MVT SVT = Vec.getSimpleValueType().getScalarType();
4224 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4225 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4226}
4227
4228/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4229/// and bitcast with integer types.
4230static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4231 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4232 unsigned NumElts = VT.getVectorNumElements();
4233 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4234 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4235 return VT;
4236}
4237
4238/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4239/// bitcast with integer types.
4240static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4241 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4242 const SDLoc &dl) {
4243 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4244 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4245}
4246
4247// Helper function to collect subvector ops that are concatenated together,
4248// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
4249// The subvectors in Ops are guaranteed to be the same type.
4250static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
4251                             SelectionDAG &DAG) {
4252 assert(Ops.empty() && "Expected an empty ops vector");
4253
4254 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4255 Ops.append(N->op_begin(), N->op_end());
4256 return true;
4257 }
4258
4259 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4260 SDValue Src = N->getOperand(0);
4261 SDValue Sub = N->getOperand(1);
4262 const APInt &Idx = N->getConstantOperandAPInt(2);
4263 EVT VT = Src.getValueType();
4264 EVT SubVT = Sub.getValueType();
4265
4266 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4267 // insert_subvector(undef, x, lo)
4268 if (Idx == 0 && Src.isUndef()) {
4269 Ops.push_back(Sub);
4270 Ops.push_back(DAG.getUNDEF(SubVT));
4271 return true;
4272 }
4273 if (Idx == (VT.getVectorNumElements() / 2)) {
4274 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4275 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4276 Src.getOperand(1).getValueType() == SubVT &&
4277 isNullConstant(Src.getOperand(2))) {
4278 // Attempt to recurse into inner (matching) concats.
4279 SDValue Lo = Src.getOperand(1);
4280 SDValue Hi = Sub;
4281 SmallVector<SDValue, 2> LoOps, HiOps;
4282 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4283 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4284 LoOps.size() == HiOps.size()) {
4285 Ops.append(LoOps);
4286 Ops.append(HiOps);
4287 return true;
4288 }
4289 Ops.push_back(Lo);
4290 Ops.push_back(Hi);
4291 return true;
4292 }
4293 // insert_subvector(x, extract_subvector(x, lo), hi)
4294 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4295 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4296 Ops.append(2, Sub);
4297 return true;
4298 }
4299 // insert_subvector(undef, x, hi)
4300 if (Src.isUndef()) {
4301 Ops.push_back(DAG.getUNDEF(SubVT));
4302 Ops.push_back(Sub);
4303 return true;
4304 }
4305 }
4306 }
4307 }
4308
4309 if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4310 EVT VT = N->getValueType(0);
4311 SDValue Src = N->getOperand(0);
4312 uint64_t Idx = N->getConstantOperandVal(1);
4313
4314 // Collect all the subvectors from the source vector and slice off the
4315 // extraction.
4316    SmallVector<SDValue, 4> SrcOps;
4317    if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
4318 VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
4319 (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
4320 (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
4321 unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
4322 unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
4323 Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
4324 return true;
4325 }
4326 }
4327
4328 assert(Ops.empty() && "Expected an empty ops vector");
4329 return false;
4330}
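// Illustrative example (editorial note, not part of the original source): for
// a v8i32 node built as insert_subvector(insert_subvector(undef, X, 0), Y, 4)
// with v4i32 subvectors X and Y, collectConcatOps fills Ops with { X, Y };
// a plain insert_subvector(undef, X, 0) yields { X, undef }.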
4331
4332// Helper to check if \p V can be split into subvectors and the upper
4333// subvectors are all undef, in which case return the lower subvector.
4334static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL,
4335                                     SelectionDAG &DAG) {
4336 SmallVector<SDValue> SubOps;
4337 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4338 return SDValue();
4339
4340 unsigned NumSubOps = SubOps.size();
4341 unsigned HalfNumSubOps = NumSubOps / 2;
4342 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4343
4344 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4345 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4346 return SDValue();
4347
4348 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4349 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4350 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4351}
4352
4353// Helper to check if we can access all the constituent subvectors without any
4354// extract ops.
4355static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG) {
4356  SmallVector<SDValue> Ops;
4357  return collectConcatOps(V.getNode(), Ops, DAG);
4358}
4359
4360static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4361 const SDLoc &dl) {
4362 EVT VT = Op.getValueType();
4363 unsigned NumElems = VT.getVectorNumElements();
4364 unsigned SizeInBits = VT.getSizeInBits();
4365 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4366 "Can't split odd sized vector");
4367
4368  SmallVector<SDValue, 4> SubOps;
4369  if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
4370 assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
4371 unsigned HalfOps = SubOps.size() / 2;
4372 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
4373 SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
4374 SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
4375 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
4376 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
4377 return std::make_pair(Lo, Hi);
4378 }
4379
4380 // If this is a splat value (with no-undefs) then use the lower subvector,
4381 // which should be a free extraction.
4382 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4383 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4384 return std::make_pair(Lo, Lo);
4385
4386 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4387 return std::make_pair(Lo, Hi);
4388}
4389
4390/// Break an operation into 2 half sized ops and then concatenate the results.
4391static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
4392  unsigned NumOps = Op.getNumOperands();
4393 EVT VT = Op.getValueType();
4394
4395 // Extract the LHS Lo/Hi vectors
4396  SmallVector<SDValue, 4> LoOps(NumOps, SDValue());
4397  SmallVector<SDValue, 4> HiOps(NumOps, SDValue());
4398  for (unsigned I = 0; I != NumOps; ++I) {
4399 SDValue SrcOp = Op.getOperand(I);
4400 if (!SrcOp.getValueType().isVector()) {
4401 LoOps[I] = HiOps[I] = SrcOp;
4402 continue;
4403 }
4404 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4405 }
4406
4407 EVT LoVT, HiVT;
4408 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4409 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4410 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4411 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4412}
4413
4414/// Break an unary integer operation into 2 half sized ops and then
4415/// concatenate the result back.
4416static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
4417                                   const SDLoc &dl) {
4418 // Make sure we only try to split 256/512-bit types to avoid creating
4419 // narrow vectors.
4420 [[maybe_unused]] EVT VT = Op.getValueType();
4421 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4422 Op.getOperand(0).getValueType().is512BitVector()) &&
4423 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4424 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4425 VT.getVectorNumElements() &&
4426 "Unexpected VTs!");
4427 return splitVectorOp(Op, DAG, dl);
4428}
4429
4430/// Break a binary integer operation into 2 half sized ops and then
4431/// concatenate the result back.
4432static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
4433                                    const SDLoc &dl) {
4434 // Assert that all the types match.
4435 [[maybe_unused]] EVT VT = Op.getValueType();
4436 assert(Op.getOperand(0).getValueType() == VT &&
4437 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4438 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4439 return splitVectorOp(Op, DAG, dl);
4440}
4441
4442// Helper for splitting operands of an operation to legal target size and
4443// apply a function on each part.
4444// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4445// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4446// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4447// The argument Builder is a function that will be applied on each split part:
4448// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
4449template <typename F>
4450static SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4451                                const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4452 F Builder, bool CheckBWI = true,
4453 bool AllowAVX512 = true) {
4454 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4455 unsigned NumSubs = 1;
4456 if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) ||
4457 (!CheckBWI && Subtarget.useAVX512Regs()))) {
4458 if (VT.getSizeInBits() > 512) {
4459 NumSubs = VT.getSizeInBits() / 512;
4460 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4461 }
4462 } else if (Subtarget.hasAVX2()) {
4463 if (VT.getSizeInBits() > 256) {
4464 NumSubs = VT.getSizeInBits() / 256;
4465 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4466 }
4467 } else {
4468 if (VT.getSizeInBits() > 128) {
4469 NumSubs = VT.getSizeInBits() / 128;
4470 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4471 }
4472 }
4473
4474 if (NumSubs == 1)
4475 return Builder(DAG, DL, Ops);
4476
4477  SmallVector<SDValue, 4> Subs;
4478  for (unsigned i = 0; i != NumSubs; ++i) {
4479    SmallVector<SDValue, 2> SubOps;
4480    for (SDValue Op : Ops) {
4481 EVT OpVT = Op.getValueType();
4482 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4483 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4484 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4485 }
4486 Subs.push_back(Builder(DAG, DL, SubOps));
4487 }
4488 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4489}
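// Illustrative usage sketch (editorial note, not part of the original source;
// the lambda and the LHS/RHS operands are hypothetical): splitting a v32i16
// operation on an AVX2-only target into two v16i16 halves that are then
// re-concatenated:
//   auto Builder = [](SelectionDAG &DAG, const SDLoc &DL,
//                     ArrayRef<SDValue> Ops) {
//     return DAG.getNode(ISD::ADD, DL, Ops[0].getValueType(), Ops[0], Ops[1]);
//   };
//   SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i16,
//                                  {LHS, RHS}, Builder);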
4490
4491// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4492// targets.
4493static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4494                             ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4495                             const X86Subtarget &Subtarget) {
4496 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4497 MVT SVT = VT.getScalarType();
4498
4499 // If we have a 32/64 splatted constant, splat it to DstTy to
4500 // encourage a foldable broadcast'd operand.
4501 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4502 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4503 // AVX512 broadcasts 32/64-bit operands.
4504 // TODO: Support float once getAVX512Node is used by fp-ops.
4505 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4507 return SDValue();
4508 // If we're not widening, don't bother if we're not bitcasting.
4509 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4510 return SDValue();
4511    if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4512      APInt SplatValue, SplatUndef;
4513 unsigned SplatBitSize;
4514 bool HasAnyUndefs;
4515 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4516 HasAnyUndefs, OpEltSizeInBits) &&
4517 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4518 return DAG.getConstant(SplatValue, DL, DstVT);
4519 }
4520 return SDValue();
4521 };
4522
4523 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4524
4525 MVT DstVT = VT;
4526 if (Widen)
4527 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4528
4529 // Canonicalize src operands.
4530 SmallVector<SDValue> SrcOps(Ops);
4531 for (SDValue &Op : SrcOps) {
4532 MVT OpVT = Op.getSimpleValueType();
4533 // Just pass through scalar operands.
4534 if (!OpVT.isVector())
4535 continue;
4536 assert(OpVT == VT && "Vector type mismatch");
4537
4538 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4539 Op = BroadcastOp;
4540 continue;
4541 }
4542
4543 // Just widen the subvector by inserting into an undef wide vector.
4544 if (Widen)
4545 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4546 }
4547
4548 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4549
4550 // Perform the 512-bit op then extract the bottom subvector.
4551 if (Widen)
4552 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4553 return Res;
4554}
4555
4556/// Insert i1-subvector to i1-vector.
4557static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4558                                const X86Subtarget &Subtarget) {
4559
4560 SDLoc dl(Op);
4561 SDValue Vec = Op.getOperand(0);
4562 SDValue SubVec = Op.getOperand(1);
4563 SDValue Idx = Op.getOperand(2);
4564 unsigned IdxVal = Op.getConstantOperandVal(2);
4565
4566 // Inserting undef is a nop. We can just return the original vector.
4567 if (SubVec.isUndef())
4568 return Vec;
4569
4570 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4571 return Op;
4572
4573 MVT OpVT = Op.getSimpleValueType();
4574 unsigned NumElems = OpVT.getVectorNumElements();
4575 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4576
4577 // Extend to natively supported kshift.
4578 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4579
4580 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4581 // if necessary.
4582 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4583 // May need to promote to a legal type.
4584 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4585 DAG.getConstant(0, dl, WideOpVT),
4586 SubVec, Idx);
4587 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4588 }
4589
4590 MVT SubVecVT = SubVec.getSimpleValueType();
4591 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4592 assert(IdxVal + SubVecNumElems <= NumElems &&
4593 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4594 "Unexpected index value in INSERT_SUBVECTOR");
4595
4596 SDValue Undef = DAG.getUNDEF(WideOpVT);
4597
4598 if (IdxVal == 0) {
4599 // Zero lower bits of the Vec
4600 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4601 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4602 ZeroIdx);
4603 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4604 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4605 // Merge them together, SubVec should be zero extended.
4606 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4607 DAG.getConstant(0, dl, WideOpVT),
4608 SubVec, ZeroIdx);
4609 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4610 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4611 }
4612
4613 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4614 Undef, SubVec, ZeroIdx);
4615
4616 if (Vec.isUndef()) {
4617 assert(IdxVal != 0 && "Unexpected index");
4618 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4619 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4620 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4621 }
4622
4623  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4624    assert(IdxVal != 0 && "Unexpected index");
4625 // If upper elements of Vec are known undef, then just shift into place.
4626 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4627 [](SDValue V) { return V.isUndef(); })) {
4628 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4629 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4630 } else {
4631 NumElems = WideOpVT.getVectorNumElements();
4632 unsigned ShiftLeft = NumElems - SubVecNumElems;
4633 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4634 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4635 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4636 if (ShiftRight != 0)
4637 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4638 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4639 }
4640 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4641 }
4642
4643 // Simple case when we put subvector in the upper part
4644 if (IdxVal + SubVecNumElems == NumElems) {
4645 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4646 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4647 if (SubVecNumElems * 2 == NumElems) {
4648 // Special case, use legal zero extending insert_subvector. This allows
4649 // isel to optimize when bits are known zero.
4650 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4651 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4652 DAG.getConstant(0, dl, WideOpVT),
4653 Vec, ZeroIdx);
4654 } else {
4655 // Otherwise use explicit shifts to zero the bits.
4656 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4657 Undef, Vec, ZeroIdx);
4658 NumElems = WideOpVT.getVectorNumElements();
4659 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4660 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4661 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4662 }
4663 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4664 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4665 }
4666
4667 // Inserting into the middle is more complicated.
4668
4669 NumElems = WideOpVT.getVectorNumElements();
4670
4671 // Widen the vector if needed.
4672 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4673
4674 unsigned ShiftLeft = NumElems - SubVecNumElems;
4675 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4676
4677 // Do an optimization for the most frequently used types.
4678 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4679 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4680 Mask0.flipAllBits();
4681 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4682 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4683 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4684 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4685 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4686 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4687 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4688 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4689
4690 // Reduce to original width if needed.
4691 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4692 }
4693
4694 // Clear the upper bits of the subvector and move it to its insert position.
4695 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4696 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4697 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4698 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4699
4700 // Isolate the bits below the insertion point.
4701 unsigned LowShift = NumElems - IdxVal;
4702 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4703 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4704 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4705 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4706
4707 // Isolate the bits after the last inserted bit.
4708 unsigned HighShift = IdxVal + SubVecNumElems;
4709 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4710 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4711 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4712 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4713
4714 // Now OR all 3 pieces together.
4715 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4716 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4717
4718 // Reduce to original width if needed.
4719 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4720}
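// Illustrative example (editorial note, not part of the original source):
// inserting a v4i1 subvector into a v16i1 vector at IdxVal = 4. The subvector
// is widened to v16i1, KSHIFTL'd by 12 and then KSHIFTR'd by 8 so its bits
// land at positions 4..7 with everything else zero, bits 4..7 of the
// destination are cleared with an AND mask, and the two pieces are ORed
// together.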
4721
4722static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4723                                const SDLoc &dl) {
4724 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4725 EVT SubVT = V1.getValueType();
4726 EVT SubSVT = SubVT.getScalarType();
4727 unsigned SubNumElts = SubVT.getVectorNumElements();
4728 unsigned SubVectorWidth = SubVT.getSizeInBits();
4729 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4730 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4731 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4732}
4733
4734/// Returns a vector of specified type with all bits set.
4735/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4736/// Then bitcast to their original type, ensuring they get CSE'd.
4737static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4738 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4739 "Expected a 128/256/512-bit vector type");
4740 unsigned NumElts = VT.getSizeInBits() / 32;
4741 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4742 return DAG.getBitcast(VT, Vec);
4743}
4744
4745static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4746 SDValue In, SelectionDAG &DAG) {
4747 EVT InVT = In.getValueType();
4748 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4749
4750 // Canonicalize Opcode to general extension version.
4751 switch (Opcode) {
4752  case ISD::ANY_EXTEND:
4753  case ISD::ANY_EXTEND_VECTOR_INREG:
4754    Opcode = ISD::ANY_EXTEND;
4755    break;
4756  case ISD::SIGN_EXTEND:
4757  case ISD::SIGN_EXTEND_VECTOR_INREG:
4758    Opcode = ISD::SIGN_EXTEND;
4759    break;
4760  case ISD::ZERO_EXTEND:
4761  case ISD::ZERO_EXTEND_VECTOR_INREG:
4762    Opcode = ISD::ZERO_EXTEND;
4763    break;
4764 default:
4765 llvm_unreachable("Unknown extension opcode");
4766 }
4767
4768 // For 256-bit vectors, we only need the lower (128-bit) input half.
4769 // For 512-bit vectors, we only need the lower input half or quarter.
4770 if (InVT.getSizeInBits() > 128) {
4771 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4772 "Expected VTs to be the same size!");
4773 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4774 In = extractSubVector(In, 0, DAG, DL,
4775 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4776 InVT = In.getValueType();
4777 }
4778
4779 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4780 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4781
4782 return DAG.getNode(Opcode, DL, VT, In);
4783}
4784
4785// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4786static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4787                            SDValue Mask, SelectionDAG &DAG) {
4788 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4789 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4790 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4791}
4792
4793static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4794                                    bool Lo, bool Unary) {
4795 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4796 "Illegal vector type to unpack");
4797 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4798 int NumElts = VT.getVectorNumElements();
4799 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4800 for (int i = 0; i < NumElts; ++i) {
4801 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4802 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4803 Pos += (Unary ? 0 : NumElts * (i % 2));
4804 Pos += (Lo ? 0 : NumEltsInLane / 2);
4805 Mask.push_back(Pos);
4806 }
4807}
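// Illustrative example (editorial note, not part of the original source): for
// MVT::v8i32 the binary (Unary = false) unpack masks are
//   Lo: { 0, 8, 1, 9, 4, 12, 5, 13 }
//   Hi: { 2, 10, 3, 11, 6, 14, 7, 15 }
// matching VPUNPCKLDQ/VPUNPCKHDQ, which interleave elements within each
// 128-bit lane rather than across the whole vector.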
4808
4809/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4810/// imposed by AVX and specific to the unary pattern. Example:
4811/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4812/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4813static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4814                                    bool Lo) {
4815 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4816 int NumElts = VT.getVectorNumElements();
4817 for (int i = 0; i < NumElts; ++i) {
4818 int Pos = i / 2;
4819 Pos += (Lo ? 0 : NumElts / 2);
4820 Mask.push_back(Pos);
4821 }
4822}
4823
4824// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4825static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4826 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4827  if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
4828      (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4829    SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4830 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4831 int M = Mask[I];
4832 if (M < 0)
4833 continue;
4834 SDValue V = (M < NumElts) ? V1 : V2;
4835 if (V.isUndef())
4836 continue;
4837 Ops[I] = V.getOperand(M % NumElts);
4838 }
4839 return DAG.getBuildVector(VT, dl, Ops);
4840 }
4841
4842 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4843}
4844
4845/// Returns a vector_shuffle node for an unpackl operation.
4846static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4847 SDValue V1, SDValue V2) {
4848  SmallVector<int, 8> Mask;
4849  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4850 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4851}
4852
4853/// Returns a vector_shuffle node for an unpackh operation.
4854static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4855 SDValue V1, SDValue V2) {
4856  SmallVector<int, 8> Mask;
4857  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4858 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4859}
4860
4861/// Returns a node that packs the LHS + RHS nodes together at half width.
4862/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4863/// TODO: Add subvector splitting if/when we have a need for it.
4864static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4865 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4866 bool PackHiHalf = false) {
4867 MVT OpVT = LHS.getSimpleValueType();
4868 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4869 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4870 assert(OpVT == RHS.getSimpleValueType() &&
4871 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4872 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4873 "Unexpected PACK operand types");
4874 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4875 "Unexpected PACK result type");
4876
4877 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4878 if (EltSizeInBits == 32) {
4879 SmallVector<int> PackMask;
4880 int Offset = PackHiHalf ? 1 : 0;
4881 int NumElts = VT.getVectorNumElements();
4882 for (int I = 0; I != NumElts; I += 4) {
4883 PackMask.push_back(I + Offset);
4884 PackMask.push_back(I + Offset + 2);
4885 PackMask.push_back(I + Offset + NumElts);
4886 PackMask.push_back(I + Offset + NumElts + 2);
4887 }
4888 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4889 DAG.getBitcast(VT, RHS), PackMask);
4890 }
4891
4892 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4893 if (!PackHiHalf) {
4894 if (UsePackUS &&
4895 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4896 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4897 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4898
4899 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4900 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4901 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4902 }
4903
4904 // Fallback to sign/zero extending the requested half and pack.
4905 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4906 if (UsePackUS) {
4907 if (PackHiHalf) {
4908 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4909 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4910 } else {
4911 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4912 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4913 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4914 };
4915 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4916 };
4917
4918 if (!PackHiHalf) {
4919 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4920 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4921 }
4922 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4923 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4924 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4925}
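// Illustrative example (editorial note, not part of the original source):
// packing two v8i16 inputs into a v16i8 result with PackHiHalf = false. If
// both inputs are already zero-extended from 8 bits, PACKUS is emitted
// directly (PACKSS for the sign-extended case); otherwise each input is
// masked with 0x00FF, or shifted and sign-extended, before the pack so that
// saturation cannot change the result.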
4926
4927/// Return a vector_shuffle of the specified vector and a zero or undef vector.
4928/// This produces a shuffle where the low element of V2 is swizzled into the
4929/// zero/undef vector, landing at element Idx.
4930/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4931static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4932                                           bool IsZero,
4933 const X86Subtarget &Subtarget,
4934 SelectionDAG &DAG) {
4935 MVT VT = V2.getSimpleValueType();
4936 SDValue V1 = IsZero
4937 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4938 int NumElems = VT.getVectorNumElements();
4939 SmallVector<int, 16> MaskVec(NumElems);
4940 for (int i = 0; i != NumElems; ++i)
4941 // If this is the insertion idx, put the low elt of V2 here.
4942 MaskVec[i] = (i == Idx) ? NumElems : i;
4943 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4944}
4945
4946static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4947  if (Ptr.getOpcode() == X86ISD::Wrapper ||
4948      Ptr.getOpcode() == X86ISD::WrapperRIP)
4949    Ptr = Ptr.getOperand(0);
4950  return dyn_cast<ConstantPoolSDNode>(Ptr);
4951}
4952
4953// TODO: Add support for non-zero offsets.
4954static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4955  ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4956  if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4957 return nullptr;
4958 return CNode->getConstVal();
4959}
4960
4961static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4962  if (!Load || !ISD::isNormalLoad(Load))
4963 return nullptr;
4964 return getTargetConstantFromBasePtr(Load->getBasePtr());
4965}
4966
4967static const Constant *getTargetConstantFromNode(SDValue Op) {
4968  Op = peekThroughBitcasts(Op);
4969  return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4970}
4971
4972const Constant *
4973X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4974  assert(LD && "Unexpected null LoadSDNode");
4975 return getTargetConstantFromNode(LD);
4976}
4977
4979 // Do not fold (vselect not(C), X, 0s) to (vselect C, Os, X)
4980 SDValue Cond = N->getOperand(0);
4981 SDValue RHS = N->getOperand(2);
4982 EVT CondVT = Cond.getValueType();
4983 return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
4984 CondVT.getVectorElementType() == MVT::i1 &&
4985 ISD::isBuildVectorAllZeros(RHS.getNode());
4986}
4987
4988// Extract raw constant bits from constant pools.
4989static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4990 APInt &UndefElts,
4991 SmallVectorImpl<APInt> &EltBits,
4992 bool AllowWholeUndefs = true,
4993 bool AllowPartialUndefs = false) {
4994 assert(EltBits.empty() && "Expected an empty EltBits vector");
4995
4996  Op = peekThroughBitcasts(Op);
4997
4998 EVT VT = Op.getValueType();
4999 unsigned SizeInBits = VT.getSizeInBits();
5000 unsigned NumElts = SizeInBits / EltSizeInBits;
5001
5002 // Can't split constant.
5003 if ((SizeInBits % EltSizeInBits) != 0)
5004 return false;
5005
5006 // Bitcast a source array of element bits to the target size.
5007 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5008 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5009 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5010 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5011 "Constant bit sizes don't match");
5012
5013 // Don't split if we don't allow undef bits.
5014 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5015 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5016 return false;
5017
5018 // If we're already the right size, don't bother bitcasting.
5019 if (NumSrcElts == NumElts) {
5020 UndefElts = UndefSrcElts;
5021 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5022 return true;
5023 }
5024
5025 // Extract all the undef/constant element data and pack into single bitsets.
5026 APInt UndefBits(SizeInBits, 0);
5027 APInt MaskBits(SizeInBits, 0);
5028
5029 for (unsigned i = 0; i != NumSrcElts; ++i) {
5030 unsigned BitOffset = i * SrcEltSizeInBits;
5031 if (UndefSrcElts[i])
5032 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5033 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5034 }
5035
5036 // Split the undef/constant single bitset data into the target elements.
5037 UndefElts = APInt(NumElts, 0);
5038 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5039
5040 for (unsigned i = 0; i != NumElts; ++i) {
5041 unsigned BitOffset = i * EltSizeInBits;
5042 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5043
5044 // Only treat an element as UNDEF if all bits are UNDEF.
5045 if (UndefEltBits.isAllOnes()) {
5046 if (!AllowWholeUndefs)
5047 return false;
5048 UndefElts.setBit(i);
5049 continue;
5050 }
5051
5052 // If only some bits are UNDEF then treat them as zero (or bail if not
5053 // supported).
5054 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5055 return false;
5056
5057 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
5058 }
5059 return true;
5060 };
5061
5062 // Collect constant bits and insert into mask/undef bit masks.
5063 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5064 unsigned UndefBitIndex) {
5065 if (!Cst)
5066 return false;
5067 if (isa<UndefValue>(Cst)) {
5068 Undefs.setBit(UndefBitIndex);
5069 return true;
5070 }
5071 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5072 Mask = CInt->getValue();
5073 return true;
5074 }
5075 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5076 Mask = CFP->getValueAPF().bitcastToAPInt();
5077 return true;
5078 }
5079 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
5080 Type *Ty = CDS->getType();
5081 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
5082 Type *EltTy = CDS->getElementType();
5083 bool IsInteger = EltTy->isIntegerTy();
5084 bool IsFP =
5085 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
5086 if (!IsInteger && !IsFP)
5087 return false;
5088 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
5089 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
5090 if (IsInteger)
5091 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
5092 else
5093 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5094 I * EltBits);
5095 return true;
5096 }
5097 return false;
5098 };
5099
5100 // Handle UNDEFs.
5101 if (Op.isUndef()) {
5102 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5103 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5104 return CastBitData(UndefSrcElts, SrcEltBits);
5105 }
5106
5107 // Extract scalar constant bits.
5108 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5109 APInt UndefSrcElts = APInt::getZero(1);
5110 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5111 return CastBitData(UndefSrcElts, SrcEltBits);
5112 }
5113 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5114 APInt UndefSrcElts = APInt::getZero(1);
5115 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5116 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5117 return CastBitData(UndefSrcElts, SrcEltBits);
5118 }
5119
5120 // Extract constant bits from build vector.
5121 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5122 BitVector Undefs;
5123 SmallVector<APInt> SrcEltBits;
5124 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5125 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5126 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5127 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5128 if (Undefs[I])
5129 UndefSrcElts.setBit(I);
5130 return CastBitData(UndefSrcElts, SrcEltBits);
5131 }
5132 }
5133
5134 // Extract constant bits from constant pool vector.
5135 if (auto *Cst = getTargetConstantFromNode(Op)) {
5136 Type *CstTy = Cst->getType();
5137 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5138 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5139 return false;
5140
5141 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5142 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5143 if ((SizeInBits % SrcEltSizeInBits) != 0)
5144 return false;
5145
5146 APInt UndefSrcElts(NumSrcElts, 0);
5147 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5148 for (unsigned i = 0; i != NumSrcElts; ++i)
5149 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5150 UndefSrcElts, i))
5151 return false;
5152
5153 return CastBitData(UndefSrcElts, SrcEltBits);
5154 }
5155
5156 // Extract constant bits from a broadcasted constant pool scalar.
5157 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5158 EltSizeInBits <= VT.getScalarSizeInBits()) {
5159 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5160 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5161 return false;
5162
5163 SDValue Ptr = MemIntr->getBasePtr();
5164    if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5165      unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5166 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5167
5168 APInt UndefSrcElts(NumSrcElts, 0);
5169 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5170 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5171 if (UndefSrcElts[0])
5172 UndefSrcElts.setBits(0, NumSrcElts);
5173 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5174 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5175 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5176 return CastBitData(UndefSrcElts, SrcEltBits);
5177 }
5178 }
5179 }
5180
5181 // Extract constant bits from a subvector broadcast.
5182 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5183 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5184 SDValue Ptr = MemIntr->getBasePtr();
5185 // The source constant may be larger than the subvector broadcast,
5186 // ensure we extract the correct subvector constants.
5187 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5188 Type *CstTy = Cst->getType();
5189 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5190 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5191 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5192 (SizeInBits % SubVecSizeInBits) != 0)
5193 return false;
5194 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5195 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5196 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5197 APInt UndefSubElts(NumSubElts, 0);
5198 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5199 APInt(CstEltSizeInBits, 0));
5200 for (unsigned i = 0; i != NumSubElts; ++i) {
5201 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5202 UndefSubElts, i))
5203 return false;
5204 for (unsigned j = 1; j != NumSubVecs; ++j)
5205 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5206 }
5207 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5208 UndefSubElts);
5209 return CastBitData(UndefSubElts, SubEltBits);
5210 }
5211 }
5212
5213 // Extract a rematerialized scalar constant insertion.
5214 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5215 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5216 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5217 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5218 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5219
5220 APInt UndefSrcElts(NumSrcElts, 0);
5221 SmallVector<APInt, 64> SrcEltBits;
5222 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5223 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5224 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5225 return CastBitData(UndefSrcElts, SrcEltBits);
5226 }
5227
5228 // Insert constant bits from a base and sub vector sources.
5229 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5230    // If we bitcast to larger elements we might lose track of undefs - don't
5231    // allow any to be safe.
5232 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5233 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5234
5235 APInt UndefSrcElts, UndefSubElts;
5236 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5237 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5238 UndefSubElts, EltSubBits,
5239 AllowWholeUndefs && AllowUndefs,
5240 AllowPartialUndefs && AllowUndefs) &&
5241 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5242 UndefSrcElts, EltSrcBits,
5243 AllowWholeUndefs && AllowUndefs,
5244 AllowPartialUndefs && AllowUndefs)) {
5245 unsigned BaseIdx = Op.getConstantOperandVal(2);
5246 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5247 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5248 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5249 return CastBitData(UndefSrcElts, EltSrcBits);
5250 }
5251 }
5252
5253 // Extract constant bits from a subvector's source.
5254 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5255 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
5256 EltBits, AllowWholeUndefs,
5257 AllowPartialUndefs)) {
5258 EVT SrcVT = Op.getOperand(0).getValueType();
5259 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
5260 unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
5261 unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
5262 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5263 assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
5264 (VT.getSizeInBits() % EltSizeInBits) == 0 &&
5265 (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
5266
5267 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5268 if ((BaseIdx + NumSubElts) != NumSrcElts)
5269 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5270 if (BaseIdx != 0)
5271 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5272 return true;
5273 }
5274
5275 // Extract constant bits from shuffle node sources.
5276 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5277 // TODO - support shuffle through bitcasts.
5278 if (EltSizeInBits != VT.getScalarSizeInBits())
5279 return false;
5280
5281 ArrayRef<int> Mask = SVN->getMask();
5282 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5283 llvm::any_of(Mask, [](int M) { return M < 0; }))
5284 return false;
5285
5286 APInt UndefElts0, UndefElts1;
5287 SmallVector<APInt, 32> EltBits0, EltBits1;
5288 if (isAnyInRange(Mask, 0, NumElts) &&
5289 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5290 UndefElts0, EltBits0, AllowWholeUndefs,
5291 AllowPartialUndefs))
5292 return false;
5293 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5294 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5295 UndefElts1, EltBits1, AllowWholeUndefs,
5296 AllowPartialUndefs))
5297 return false;
5298
5299 UndefElts = APInt::getZero(NumElts);
5300 for (int i = 0; i != (int)NumElts; ++i) {
5301 int M = Mask[i];
5302 if (M < 0) {
5303 UndefElts.setBit(i);
5304 EltBits.push_back(APInt::getZero(EltSizeInBits));
5305 } else if (M < (int)NumElts) {
5306 if (UndefElts0[M])
5307 UndefElts.setBit(i);
5308 EltBits.push_back(EltBits0[M]);
5309 } else {
5310 if (UndefElts1[M - NumElts])
5311 UndefElts.setBit(i);
5312 EltBits.push_back(EltBits1[M - NumElts]);
5313 }
5314 }
5315 return true;
5316 }
5317
5318 return false;
5319}
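// Illustrative example (editorial note, not part of the original source):
// querying a v4i32 build vector <0x11223344, undef, 0x55667788, 0> with
// EltSizeInBits = 64 repacks the four 32-bit sources into two 64-bit
// elements. The first wide element is only partially undef, so the call fails
// unless AllowPartialUndefs is set, in which case the undef bits read as zero
// and EltBits becomes { 0x0000000011223344, 0x0000000055667788 }.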
5320
5321namespace llvm {
5322namespace X86 {
5323bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5324 APInt UndefElts;
5325 SmallVector<APInt, 16> EltBits;
5326  if (getTargetConstantBitsFromNode(
5327          Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5328 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5329 int SplatIndex = -1;
5330 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5331 if (UndefElts[i])
5332 continue;
5333 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5334 SplatIndex = -1;
5335 break;
5336 }
5337 SplatIndex = i;
5338 }
5339 if (0 <= SplatIndex) {
5340 SplatVal = EltBits[SplatIndex];
5341 return true;
5342 }
5343 }
5344
5345 return false;
5346}
5347
5348int getRoundingModeX86(unsigned RM) {
5349 switch (static_cast<::llvm::RoundingMode>(RM)) {
5350 // clang-format off
5351 case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest; break;
5352 case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward; break;
5353 case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward; break;
5354 case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero; break;
5355 default:
5356 return X86::rmInvalid; // Invalid rounding mode
5357 }
5358}
5359
5360} // namespace X86
5361} // namespace llvm
5362
5363static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5364                                        unsigned MaskEltSizeInBits,
5365                                        SmallVectorImpl<uint64_t> &RawMask,
5366                                        APInt &UndefElts) {
5367 // Extract the raw target constant bits.
5368 SmallVector<APInt, 64> EltBits;
5369 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5370 EltBits, /* AllowWholeUndefs */ true,
5371 /* AllowPartialUndefs */ false))
5372 return false;
5373
5374 // Insert the extracted elements into the mask.
5375 for (const APInt &Elt : EltBits)
5376 RawMask.push_back(Elt.getZExtValue());
5377
5378 return true;
5379}
5380
5381static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts,
5382 bool AllowUndefs) {
5383 APInt UndefElts;
5384 SmallVector<APInt, 64> EltBits;
5385 if (!getTargetConstantBitsFromNode(V, EltSizeInBits, UndefElts, EltBits,
5386 /*AllowWholeUndefs*/ AllowUndefs,
5387 /*AllowPartialUndefs*/ false))
5388 return false;
5389
5390 bool IsPow2OrUndef = true;
5391 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5392 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5393 return IsPow2OrUndef;
5394}
5395
5396// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5397 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5398 // TODO: don't always ignore oneuse constraints.
5399 V = peekThroughBitcasts(V);
5400 EVT VT = V.getValueType();
5401
5402 // Match not(xor X, -1) -> X.
5403 if (V.getOpcode() == ISD::XOR &&
5404 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5405 isAllOnesConstant(V.getOperand(1))))
5406 return V.getOperand(0);
5407
5408 // Match not(extract_subvector(not(X)) -> extract_subvector(X).
5409 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5410 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5411 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5412 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5413 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5414 V.getOperand(1));
5415 }
5416 }
5417
5418 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5419 if (V.getOpcode() == X86ISD::PCMPGT &&
5420 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5421 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5422 V.getOperand(0).hasOneUse()) {
5423 APInt UndefElts;
5424 SmallVector<APInt> EltBits;
5425 if (getTargetConstantBitsFromNode(V.getOperand(0),
5426 V.getScalarValueSizeInBits(), UndefElts,
5427 EltBits) &&
5428 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5429 // Don't fold min_signed_value -> (min_signed_value - 1)
5430 bool MinSigned = false;
5431 for (APInt &Elt : EltBits) {
5432 MinSigned |= Elt.isMinSignedValue();
5433 Elt -= 1;
5434 }
5435 if (!MinSigned) {
5436 SDLoc DL(V);
5437 MVT VT = V.getSimpleValueType();
5438 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5439 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5440 }
5441 }
5442 }
5443
5444 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5445 SmallVector<SDValue, 2> CatOps;
5446 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5447 for (SDValue &CatOp : CatOps) {
5448 SDValue NotCat = IsNOT(CatOp, DAG);
5449 if (!NotCat)
5450 return SDValue();
5451 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5452 }
5453 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5454 }
5455
5456 // Match not(or(not(X),not(Y))) -> and(X, Y).
5457 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5458 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5459 // TODO: Handle cases with single NOT operand -> ANDNP
5460 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5461 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5462 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5463 DAG.getBitcast(VT, Op1));
5464 }
5465
5466 return SDValue();
5467}
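// Note on the PCMPGT fold above: for signed compares, not(C > X) == (X >= C)
// == (X > C - 1), which is why each constant element is decremented and the
// operands are swapped; the fold is skipped if any element is INT_MIN, since
// C - 1 would wrap.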
5468
5469/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5470/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5471/// Note: This ignores saturation, so inputs must be checked first.
5472 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5473 bool Unary, unsigned NumStages = 1) {
5474 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5475 unsigned NumElts = VT.getVectorNumElements();
5476 unsigned NumLanes = VT.getSizeInBits() / 128;
5477 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5478 unsigned Offset = Unary ? 0 : NumElts;
5479 unsigned Repetitions = 1u << (NumStages - 1);
5480 unsigned Increment = 1u << NumStages;
5481 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5482
5483 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5484 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5485 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5486 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5487 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5488 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5489 }
5490 }
5491}
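// Illustrative example: for VT = v16i8, Unary = false and NumStages = 1 this
// produces <0,2,4,...,14,16,18,...,30>, i.e. the even-indexed bytes of the
// first input followed by the even-indexed bytes of the second, per 128-bit
// lane - matching the PACKSSWB/PACKUSWB truncation layout.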
5492
5493// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5494static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5495 APInt &DemandedLHS, APInt &DemandedRHS) {
5496 int NumLanes = VT.getSizeInBits() / 128;
5497 int NumElts = DemandedElts.getBitWidth();
5498 int NumInnerElts = NumElts / 2;
5499 int NumEltsPerLane = NumElts / NumLanes;
5500 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5501
5502 DemandedLHS = APInt::getZero(NumInnerElts);
5503 DemandedRHS = APInt::getZero(NumInnerElts);
5504
5505 // Map DemandedElts to the packed operands.
5506 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5507 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5508 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5509 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5510 if (DemandedElts[OuterIdx])
5511 DemandedLHS.setBit(InnerIdx);
5512 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5513 DemandedRHS.setBit(InnerIdx);
5514 }
5515 }
5516}
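// Illustrative example: for VT = v16i8 (a PACK of two v8i16 inputs), demanding
// output byte 3 sets bit 3 of DemandedLHS, while demanding output byte 11 sets
// bit 3 of DemandedRHS.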
5517
5518// Split the demanded elts of a HADD/HSUB node between its operands.
5519static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5520 APInt &DemandedLHS, APInt &DemandedRHS) {
5521 getHorizDemandedEltsForFirstOperand(VT.getSizeInBits(), DemandedElts,
5522 DemandedLHS, DemandedRHS);
5523 DemandedLHS |= DemandedLHS << 1;
5524 DemandedRHS |= DemandedRHS << 1;
5525}
5526
5527/// Calculates the shuffle mask corresponding to the target-specific opcode.
5528/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5529/// operands in \p Ops, and returns true.
5530/// Sets \p IsUnary to true if only one source is used. Note that this will set
5531/// IsUnary for shuffles which use a single input multiple times, and in those
5532/// cases it will adjust the mask to only have indices within that single input.
5533/// It is an error to call this with non-empty Mask/Ops vectors.
5534static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5535 SmallVectorImpl<SDValue> &Ops,
5536 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5537 if (!isTargetShuffle(N.getOpcode()))
5538 return false;
5539
5540 MVT VT = N.getSimpleValueType();
5541 unsigned NumElems = VT.getVectorNumElements();
5542 unsigned MaskEltSize = VT.getScalarSizeInBits();
5543 SmallVector<uint64_t, 32> RawMask;
5544 APInt RawUndefs;
5545 uint64_t ImmN;
5546
5547 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5548 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5549
5550 IsUnary = false;
5551 bool IsFakeUnary = false;
5552 switch (N.getOpcode()) {
5553 case X86ISD::BLENDI:
5554 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5555 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5556 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5557 DecodeBLENDMask(NumElems, ImmN, Mask);
5558 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5559 break;
5560 case X86ISD::SHUFP:
5561 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5562 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5563 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5564 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5565 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5566 break;
5567 case X86ISD::INSERTPS:
5568 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5569 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5570 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5571 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5572 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5573 break;
5574 case X86ISD::EXTRQI:
5575 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5576 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5577 isa<ConstantSDNode>(N.getOperand(2))) {
5578 int BitLen = N.getConstantOperandVal(1);
5579 int BitIdx = N.getConstantOperandVal(2);
5580 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5581 IsUnary = true;
5582 }
5583 break;
5584 case X86ISD::INSERTQI:
5585 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5586 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5587 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5588 isa<ConstantSDNode>(N.getOperand(3))) {
5589 int BitLen = N.getConstantOperandVal(2);
5590 int BitIdx = N.getConstantOperandVal(3);
5591 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5592 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5593 }
5594 break;
5595 case X86ISD::UNPCKH:
5596 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5597 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5598 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5599 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5600 break;
5601 case X86ISD::UNPCKL:
5602 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5603 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5604 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5605 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5606 break;
5607 case X86ISD::MOVHLPS:
5608 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5609 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5610 DecodeMOVHLPSMask(NumElems, Mask);
5611 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5612 break;
5613 case X86ISD::MOVLHPS:
5614 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5615 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5616 DecodeMOVLHPSMask(NumElems, Mask);
5617 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5618 break;
5619 case X86ISD::VALIGN:
5620 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5621 "Only 32-bit and 64-bit elements are supported!");
5622 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5623 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5624 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5625 DecodeVALIGNMask(NumElems, ImmN, Mask);
5626 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5627 Ops.push_back(N.getOperand(1));
5628 Ops.push_back(N.getOperand(0));
5629 break;
5630 case X86ISD::PALIGNR:
5631 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5632 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5633 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5634 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5635 DecodePALIGNRMask(NumElems, ImmN, Mask);
5636 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5637 Ops.push_back(N.getOperand(1));
5638 Ops.push_back(N.getOperand(0));
5639 break;
5640 case X86ISD::VSHLDQ:
5641 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5642 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5643 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5644 DecodePSLLDQMask(NumElems, ImmN, Mask);
5645 IsUnary = true;
5646 break;
5647 case X86ISD::VSRLDQ:
5648 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5649 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5650 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5651 DecodePSRLDQMask(NumElems, ImmN, Mask);
5652 IsUnary = true;
5653 break;
5654 case X86ISD::PSHUFD:
5655 case X86ISD::VPERMILPI:
5656 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5657 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5658 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5659 IsUnary = true;
5660 break;
5661 case X86ISD::PSHUFHW:
5662 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5663 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5664 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5665 IsUnary = true;
5666 break;
5667 case X86ISD::PSHUFLW:
5668 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5669 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5670 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5671 IsUnary = true;
5672 break;
5673 case X86ISD::VZEXT_MOVL:
5674 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5675 DecodeZeroMoveLowMask(NumElems, Mask);
5676 IsUnary = true;
5677 break;
5678 case X86ISD::VBROADCAST:
5679 // We only decode broadcasts of same-sized vectors; peeking through to
5680 // extracted subvectors is likely to cause hasOneUse issues with
5681 // SimplifyDemandedBits etc.
5682 if (N.getOperand(0).getValueType() == VT) {
5683 DecodeVectorBroadcast(NumElems, Mask);
5684 IsUnary = true;
5685 break;
5686 }
5687 return false;
5688 case X86ISD::VPERMILPV: {
5689 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5690 IsUnary = true;
5691 SDValue MaskNode = N.getOperand(1);
5692 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5693 RawUndefs)) {
5694 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5695 break;
5696 }
5697 return false;
5698 }
5699 case X86ISD::PSHUFB: {
5700 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5701 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5702 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5703 IsUnary = true;
5704 SDValue MaskNode = N.getOperand(1);
5705 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5706 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5707 break;
5708 }
5709 return false;
5710 }
5711 case X86ISD::VPERMI:
5712 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5713 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5714 DecodeVPERMMask(NumElems, ImmN, Mask);
5715 IsUnary = true;
5716 break;
5717 case X86ISD::MOVSS:
5718 case X86ISD::MOVSD:
5719 case X86ISD::MOVSH:
5720 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5721 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5722 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5723 break;
5724 case X86ISD::VPERM2X128:
5725 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5726 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5727 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5728 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5729 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5730 break;
5731 case X86ISD::SHUF128:
5732 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5733 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5734 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5735 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5736 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5737 break;
5738 case X86ISD::MOVSLDUP:
5739 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5740 DecodeMOVSLDUPMask(NumElems, Mask);
5741 IsUnary = true;
5742 break;
5743 case X86ISD::MOVSHDUP:
5744 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5745 DecodeMOVSHDUPMask(NumElems, Mask);
5746 IsUnary = true;
5747 break;
5748 case X86ISD::MOVDDUP:
5749 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5750 DecodeMOVDDUPMask(NumElems, Mask);
5751 IsUnary = true;
5752 break;
5753 case X86ISD::VPERMIL2: {
5754 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5755 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5756 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5757 SDValue MaskNode = N.getOperand(2);
5758 SDValue CtrlNode = N.getOperand(3);
5759 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5760 unsigned CtrlImm = CtrlOp->getZExtValue();
5761 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5762 RawUndefs)) {
5763 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5764 Mask);
5765 break;
5766 }
5767 }
5768 return false;
5769 }
5770 case X86ISD::VPPERM: {
5771 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5772 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5773 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5774 SDValue MaskNode = N.getOperand(2);
5775 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5776 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5777 break;
5778 }
5779 return false;
5780 }
5781 case X86ISD::VPERMV: {
5782 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5783 IsUnary = true;
5784 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5785 Ops.push_back(N.getOperand(1));
5786 SDValue MaskNode = N.getOperand(0);
5787 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5788 RawUndefs)) {
5789 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5790 break;
5791 }
5792 return false;
5793 }
5794 case X86ISD::VPERMV3: {
5795 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5796 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5797 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5798 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5799 Ops.push_back(N.getOperand(0));
5800 Ops.push_back(N.getOperand(2));
5801 SDValue MaskNode = N.getOperand(1);
5802 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5803 RawUndefs)) {
5804 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5805 break;
5806 }
5807 return false;
5808 }
5809 default:
5810 llvm_unreachable("unknown target shuffle node");
5811 }
5812
5813 // Empty mask indicates the decode failed.
5814 if (Mask.empty())
5815 return false;
5816
5817 // Check if we're getting a shuffle mask with zero'd elements.
5818 if (!AllowSentinelZero && isAnyZero(Mask))
5819 return false;
5820
5821 // If we have a fake unary shuffle, the shuffle mask is spread across two
5822 // inputs that are actually the same node. Re-map the mask to always point
5823 // into the first input.
5824 if (IsFakeUnary)
5825 for (int &M : Mask)
5826 if (M >= (int)Mask.size())
5827 M -= Mask.size();
5828
5829 // If we didn't already add operands in the opcode-specific code, default to
5830 // adding 1 or 2 operands starting at 0.
5831 if (Ops.empty()) {
5832 Ops.push_back(N.getOperand(0));
5833 if (!IsUnary || IsFakeUnary)
5834 Ops.push_back(N.getOperand(1));
5835 }
5836
5837 return true;
5838}
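// Illustrative example: for an X86ISD::UNPCKL v4i32 node this returns
// Mask = <0,4,1,5> with Ops = {Op0, Op1}; if both operands are the same node
// the mask is remapped to <0,0,1,1> and IsUnary is set.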
5839
5840 // Wrapper for getTargetShuffleMask with IsUnary.
5841static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5842 SmallVectorImpl<SDValue> &Ops,
5843 SmallVectorImpl<int> &Mask) {
5844 bool IsUnary;
5845 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5846}
5847
5848/// Compute whether each element of a shuffle is zeroable.
5849///
5850/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5851/// Either it is an undef element in the shuffle mask, the element of the input
5852/// referenced is undef, or the element of the input referenced is known to be
5853/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5854/// as many lanes with this technique as possible to simplify the remaining
5855/// shuffle.
5856 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5857 SDValue V1, SDValue V2,
5858 APInt &KnownUndef, APInt &KnownZero) {
5859 int Size = Mask.size();
5860 KnownUndef = KnownZero = APInt::getZero(Size);
5861
5862 V1 = peekThroughBitcasts(V1);
5863 V2 = peekThroughBitcasts(V2);
5864
5865 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5866 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5867
5868 int VectorSizeInBits = V1.getValueSizeInBits();
5869 int ScalarSizeInBits = VectorSizeInBits / Size;
5870 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5871
5872 for (int i = 0; i < Size; ++i) {
5873 int M = Mask[i];
5874 // Handle the easy cases.
5875 if (M < 0) {
5876 KnownUndef.setBit(i);
5877 continue;
5878 }
5879 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5880 KnownZero.setBit(i);
5881 continue;
5882 }
5883
5884 // Determine shuffle input and normalize the mask.
5885 SDValue V = M < Size ? V1 : V2;
5886 M %= Size;
5887
5888 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5889 if (V.getOpcode() != ISD::BUILD_VECTOR)
5890 continue;
5891
5892 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
5893 // the (larger) source element must be UNDEF/ZERO.
5894 if ((Size % V.getNumOperands()) == 0) {
5895 int Scale = Size / V->getNumOperands();
5896 SDValue Op = V.getOperand(M / Scale);
5897 if (Op.isUndef())
5898 KnownUndef.setBit(i);
5899 if (X86::isZeroNode(Op))
5900 KnownZero.setBit(i);
5901 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5902 APInt Val = Cst->getAPIntValue();
5903 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5904 if (Val == 0)
5905 KnownZero.setBit(i);
5906 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5907 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5908 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5909 if (Val == 0)
5910 KnownZero.setBit(i);
5911 }
5912 continue;
5913 }
5914
5915 // If the BUILD_VECTOR has more elements, then all the (smaller) source
5916 // elements must be UNDEF or ZERO.
5917 if ((V.getNumOperands() % Size) == 0) {
5918 int Scale = V->getNumOperands() / Size;
5919 bool AllUndef = true;
5920 bool AllZero = true;
5921 for (int j = 0; j < Scale; ++j) {
5922 SDValue Op = V.getOperand((M * Scale) + j);
5923 AllUndef &= Op.isUndef();
5924 AllZero &= X86::isZeroNode(Op);
5925 }
5926 if (AllUndef)
5927 KnownUndef.setBit(i);
5928 if (AllZero)
5929 KnownZero.setBit(i);
5930 continue;
5931 }
5932 }
5933}
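// Illustrative example: shuffling V1 with an all-zeros V2 using the v4i32 mask
// <0, -1, 4, 6> marks element 1 as KnownUndef and elements 2 and 3 as
// KnownZero.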
5934
5935/// Decode a target shuffle mask and inputs and see if any values are
5936/// known to be undef or zero from their inputs.
5937/// Returns true if the target shuffle mask was decoded.
5938/// FIXME: Merge this with computeZeroableShuffleElements?
5939 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5940 SmallVectorImpl<SDValue> &Ops,
5941 APInt &KnownUndef, APInt &KnownZero) {
5942 bool IsUnary;
5943 if (!isTargetShuffle(N.getOpcode()))
5944 return false;
5945
5946 MVT VT = N.getSimpleValueType();
5947 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5948 return false;
5949
5950 int Size = Mask.size();
5951 SDValue V1 = Ops[0];
5952 SDValue V2 = IsUnary ? V1 : Ops[1];
5953 KnownUndef = KnownZero = APInt::getZero(Size);
5954
5955 V1 = peekThroughBitcasts(V1);
5956 V2 = peekThroughBitcasts(V2);
5957
5958 assert((VT.getSizeInBits() % Size) == 0 &&
5959 "Illegal split of shuffle value type");
5960 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5961
5962 // Extract known constant input data.
5963 APInt UndefSrcElts[2];
5964 SmallVector<APInt, 32> SrcEltBits[2];
5965 bool IsSrcConstant[2] = {
5966 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5967 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5968 /*AllowPartialUndefs*/ false),
5969 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5970 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5971 /*AllowPartialUndefs*/ false)};
5972
5973 for (int i = 0; i < Size; ++i) {
5974 int M = Mask[i];
5975
5976 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5977 if (M < 0) {
5978 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5979 if (SM_SentinelUndef == M)
5980 KnownUndef.setBit(i);
5981 if (SM_SentinelZero == M)
5982 KnownZero.setBit(i);
5983 continue;
5984 }
5985
5986 // Determine shuffle input and normalize the mask.
5987 unsigned SrcIdx = M / Size;
5988 SDValue V = M < Size ? V1 : V2;
5989 M %= Size;
5990
5991 // We are referencing an UNDEF input.
5992 if (V.isUndef()) {
5993 KnownUndef.setBit(i);
5994 continue;
5995 }
5996
5997 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5998 // TODO: We currently only set UNDEF for integer types - floats use the same
5999 // registers as vectors and many of the scalar folded loads rely on the
6000 // SCALAR_TO_VECTOR pattern.
6001 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6002 (Size % V.getValueType().getVectorNumElements()) == 0) {
6003 int Scale = Size / V.getValueType().getVectorNumElements();
6004 int Idx = M / Scale;
6005 if (Idx != 0 && !VT.isFloatingPoint())
6006 KnownUndef.setBit(i);
6007 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6008 KnownZero.setBit(i);
6009 continue;
6010 }
6011
6012 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
6013 // base vectors.
6014 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
6015 SDValue Vec = V.getOperand(0);
6016 int NumVecElts = Vec.getValueType().getVectorNumElements();
6017 if (Vec.isUndef() && Size == NumVecElts) {
6018 int Idx = V.getConstantOperandVal(2);
6019 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
6020 if (M < Idx || (Idx + NumSubElts) <= M)
6021 KnownUndef.setBit(i);
6022 }
6023 continue;
6024 }
6025
6026 // Attempt to extract from the source's constant bits.
6027 if (IsSrcConstant[SrcIdx]) {
6028 if (UndefSrcElts[SrcIdx][M])
6029 KnownUndef.setBit(i);
6030 else if (SrcEltBits[SrcIdx][M] == 0)
6031 KnownZero.setBit(i);
6032 }
6033 }
6034
6035 assert(VT.getVectorNumElements() == (unsigned)Size &&
6036 "Different mask size from vector size!");
6037 return true;
6038}
6039
6040// Replace target shuffle mask elements with known undef/zero sentinels.
6041 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
6042 const APInt &KnownUndef,
6043 const APInt &KnownZero,
6044 bool ResolveKnownZeros = true) {
6045 unsigned NumElts = Mask.size();
6046 assert(KnownUndef.getBitWidth() == NumElts &&
6047 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
6048
6049 for (unsigned i = 0; i != NumElts; ++i) {
6050 if (KnownUndef[i])
6051 Mask[i] = SM_SentinelUndef;
6052 else if (ResolveKnownZeros && KnownZero[i])
6053 Mask[i] = SM_SentinelZero;
6054 }
6055}
6056
6057// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
6058 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
6059 APInt &KnownUndef,
6060 APInt &KnownZero) {
6061 unsigned NumElts = Mask.size();
6062 KnownUndef = KnownZero = APInt::getZero(NumElts);
6063
6064 for (unsigned i = 0; i != NumElts; ++i) {
6065 int M = Mask[i];
6066 if (SM_SentinelUndef == M)
6067 KnownUndef.setBit(i);
6068 if (SM_SentinelZero == M)
6069 KnownZero.setBit(i);
6070 }
6071}
6072
6073// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
6074 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
6075 SDValue Cond, bool IsBLENDV = false) {
6076 EVT CondVT = Cond.getValueType();
6077 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
6078 unsigned NumElts = CondVT.getVectorNumElements();
6079
6080 APInt UndefElts;
6081 SmallVector<APInt, 32> EltBits;
6082 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
6083 /*AllowWholeUndefs*/ true,
6084 /*AllowPartialUndefs*/ false))
6085 return false;
6086
6087 Mask.resize(NumElts, SM_SentinelUndef);
6088
6089 for (int i = 0; i != (int)NumElts; ++i) {
6090 Mask[i] = i;
6091 // Arbitrarily choose from the 2nd operand if the select condition element
6092 // is undef.
6093 // TODO: Can we do better by matching patterns such as even/odd?
6094 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
6095 (IsBLENDV && EltBits[i].isNonNegative()))
6096 Mask[i] += NumElts;
6097 }
6098
6099 return true;
6100}
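// Illustrative example: a v4i32 VSELECT whose condition constant is
// <-1, 0, -1, 0> yields the blend mask <0, 5, 2, 7>, selecting from the first
// operand where the condition is all-ones and from the second where it is
// zero.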
6101
6102// Forward declaration (for getFauxShuffleMask recursive check).
6103static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6104 SmallVectorImpl<SDValue> &Inputs,
6105 SmallVectorImpl<int> &Mask,
6106 const SelectionDAG &DAG, unsigned Depth,
6107 bool ResolveKnownElts);
6108
6109// Attempt to decode ops that could be represented as a shuffle mask.
6110// The decoded shuffle mask may contain a different number of elements to the
6111// destination value type.
6112// TODO: Merge into getTargetShuffleInputs()
6113static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6114 SmallVectorImpl<int> &Mask,
6115 SmallVectorImpl<SDValue> &Ops,
6116 const SelectionDAG &DAG, unsigned Depth,
6117 bool ResolveKnownElts) {
6118 Mask.clear();
6119 Ops.clear();
6120
6121 MVT VT = N.getSimpleValueType();
6122 unsigned NumElts = VT.getVectorNumElements();
6123 unsigned NumSizeInBits = VT.getSizeInBits();
6124 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6125 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6126 return false;
6127 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6128 unsigned NumSizeInBytes = NumSizeInBits / 8;
6129 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6130
6131 unsigned Opcode = N.getOpcode();
6132 switch (Opcode) {
6133 case ISD::VECTOR_SHUFFLE: {
6134 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6135 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6136 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6137 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6138 Ops.push_back(N.getOperand(0));
6139 Ops.push_back(N.getOperand(1));
6140 return true;
6141 }
6142 return false;
6143 }
6144 case ISD::AND:
6145 case X86ISD::ANDNP: {
6146 // Attempt to decode as a per-byte mask.
6147 APInt UndefElts;
6148 SmallVector<APInt, 32> EltBits;
6149 SDValue N0 = N.getOperand(0);
6150 SDValue N1 = N.getOperand(1);
6151 bool IsAndN = (X86ISD::ANDNP == Opcode);
6152 uint64_t ZeroMask = IsAndN ? 255 : 0;
6153 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6154 /*AllowWholeUndefs*/ false,
6155 /*AllowPartialUndefs*/ false))
6156 return false;
6157 // We can't assume an undef src element gives an undef dst - the other src
6158 // might be zero.
6159 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6160 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6161 const APInt &ByteBits = EltBits[i];
6162 if (ByteBits != 0 && ByteBits != 255)
6163 return false;
6164 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6165 }
6166 Ops.push_back(IsAndN ? N1 : N0);
6167 return true;
6168 }
6169 case ISD::OR: {
6170 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6171 // is a valid shuffle index.
6172 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6173 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6174 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6175 return false;
6176
6177 SmallVector<int, 64> SrcMask0, SrcMask1;
6178 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6179 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
6180 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
6181 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6182 Depth + 1, true) ||
6183 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6184 Depth + 1, true))
6185 return false;
6186
6187 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6188 SmallVector<int, 64> Mask0, Mask1;
6189 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6190 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6191 for (int i = 0; i != (int)MaskSize; ++i) {
6192 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6193 // loops converting between OR and BLEND shuffles due to
6194 // canWidenShuffleElements merging away undef elements, meaning we
6195 // fail to recognise the OR as the undef element isn't known zero.
6196 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6197 Mask.push_back(SM_SentinelZero);
6198 else if (Mask1[i] == SM_SentinelZero)
6199 Mask.push_back(i);
6200 else if (Mask0[i] == SM_SentinelZero)
6201 Mask.push_back(i + MaskSize);
6202 else
6203 return false;
6204 }
6205 Ops.push_back(N.getOperand(0));
6206 Ops.push_back(N.getOperand(1));
6207 return true;
6208 }
6209 case ISD::CONCAT_VECTORS: {
6210 // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6211 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6212 if (NumBitsPerElt == 64) {
6213 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6214 for (unsigned M = 0; M != NumSubElts; ++M)
6215 Mask.push_back((I * NumElts) + M);
6216 Ops.push_back(N.getOperand(I));
6217 }
6218 return true;
6219 }
6220 return false;
6221 }
6222 case ISD::INSERT_SUBVECTOR: {
6223 SDValue Src = N.getOperand(0);
6224 SDValue Sub = N.getOperand(1);
6225 EVT SubVT = Sub.getValueType();
6226 unsigned NumSubElts = SubVT.getVectorNumElements();
6227 uint64_t InsertIdx = N.getConstantOperandVal(2);
6228 // Subvector isn't demanded - just return the base vector.
6229 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6230 Mask.resize(NumElts);
6231 std::iota(Mask.begin(), Mask.end(), 0);
6232 Ops.push_back(Src);
6233 return true;
6234 }
6235 // Handle CONCAT(SUB0, SUB1).
6236 // Limit to vXi64/splat cases to make the most of cross lane shuffles.
6237 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6238 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6239 Src.getOperand(0).isUndef() &&
6240 Src.getOperand(1).getValueType() == SubVT &&
6241 Src.getConstantOperandVal(2) == 0 &&
6242 (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6243 SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
6244 Mask.resize(NumElts);
6245 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6246 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6247 Ops.push_back(Src.getOperand(1));
6248 Ops.push_back(Sub);
6249 return true;
6250 }
6251 if (!N->isOnlyUserOf(Sub.getNode()))
6252 return false;
6253
6254 SmallVector<int, 64> SubMask;
6255 SmallVector<SDValue, 2> SubInputs;
6256 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
6257 EVT SubSrcVT = SubSrc.getValueType();
6258 if (!SubSrcVT.isVector())
6259 return false;
6260
6261 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6262 if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6263 SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6264 uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
6265 SDValue SubSrcSrc = SubSrc.getOperand(0);
6266 unsigned NumSubSrcSrcElts =
6267 SubSrcSrc.getValueType().getVectorNumElements();
6268 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6269 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6270 "Subvector valuetype mismatch");
6271 InsertIdx *= (MaxElts / NumElts);
6272 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6273 NumSubElts *= (MaxElts / NumElts);
6274 bool SrcIsUndef = Src.isUndef();
6275 for (int i = 0; i != (int)MaxElts; ++i)
6276 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6277 for (int i = 0; i != (int)NumSubElts; ++i)
6278 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6279 if (!SrcIsUndef)
6280 Ops.push_back(Src);
6281 Ops.push_back(SubSrcSrc);
6282 return true;
6283 }
6284
6285 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6286 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6287 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6288 Depth + 1, ResolveKnownElts))
6289 return false;
6290
6291 // Subvector shuffle inputs must not be larger than the subvector.
6292 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6293 return SubVT.getFixedSizeInBits() <
6294 SubInput.getValueSizeInBits().getFixedValue();
6295 }))
6296 return false;
6297
6298 if (SubMask.size() != NumSubElts) {
6299 assert(((SubMask.size() % NumSubElts) == 0 ||
6300 (NumSubElts % SubMask.size()) == 0) &&
6301 "Illegal submask scale");
6302 if ((NumSubElts % SubMask.size()) == 0) {
6303 int Scale = NumSubElts / SubMask.size();
6304 SmallVector<int, 64> ScaledSubMask;
6305 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6306 SubMask = ScaledSubMask;
6307 } else {
6308 int Scale = SubMask.size() / NumSubElts;
6309 NumSubElts = SubMask.size();
6310 NumElts *= Scale;
6311 InsertIdx *= Scale;
6312 }
6313 }
6314 Ops.push_back(Src);
6315 Ops.append(SubInputs.begin(), SubInputs.end());
6316 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6317 Mask.append(NumElts, SM_SentinelZero);
6318 else
6319 for (int i = 0; i != (int)NumElts; ++i)
6320 Mask.push_back(i);
6321 for (int i = 0; i != (int)NumSubElts; ++i) {
6322 int M = SubMask[i];
6323 if (0 <= M) {
6324 int InputIdx = M / NumSubElts;
6325 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6326 }
6327 Mask[i + InsertIdx] = M;
6328 }
6329 return true;
6330 }
6331 case X86ISD::PINSRB:
6332 case X86ISD::PINSRW:
6333 case ISD::SCALAR_TO_VECTOR:
6334 case ISD::INSERT_VECTOR_ELT: {
6335 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6336 // vector, for matching src/dst vector types.
6337 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6338
6339 unsigned DstIdx = 0;
6340 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6341 // Check we have an in-range constant insertion index.
6342 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6343 N.getConstantOperandAPInt(2).uge(NumElts))
6344 return false;
6345 DstIdx = N.getConstantOperandVal(2);
6346
6347 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6348 if (X86::isZeroNode(Scl)) {
6349 Ops.push_back(N.getOperand(0));
6350 for (unsigned i = 0; i != NumElts; ++i)
6351 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6352 return true;
6353 }
6354 }
6355
6356 // Peek through trunc/aext/zext/bitcast.
6357 // TODO: aext shouldn't require SM_SentinelZero padding.
6358 // TODO: handle shift of scalars.
6359 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6360 while (Scl.getOpcode() == ISD::TRUNCATE ||
6361 Scl.getOpcode() == ISD::ANY_EXTEND ||
6362 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6363 (Scl.getOpcode() == ISD::BITCAST &&
6366 Scl = Scl.getOperand(0);
6367 MinBitsPerElt =
6368 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6369 }
6370 if ((MinBitsPerElt % 8) != 0)
6371 return false;
6372
6373 // Attempt to find the source vector the scalar was extracted from.
6374 SDValue SrcExtract;
6375 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6376 Scl.getOpcode() == X86ISD::PEXTRW ||
6377 Scl.getOpcode() == X86ISD::PEXTRB) &&
6378 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6379 SrcExtract = Scl;
6380 }
6381 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6382 return false;
6383
6384 SDValue SrcVec = SrcExtract.getOperand(0);
6385 EVT SrcVT = SrcVec.getValueType();
6386 if (!SrcVT.getScalarType().isByteSized())
6387 return false;
6388 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6389 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6390 unsigned DstByte = DstIdx * NumBytesPerElt;
6391 MinBitsPerElt =
6392 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6393
6394 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6395 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6396 Ops.push_back(SrcVec);
6397 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6398 } else {
6399 Ops.push_back(SrcVec);
6400 Ops.push_back(N.getOperand(0));
6401 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6402 Mask.push_back(NumSizeInBytes + i);
6403 }
6404
6405 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6406 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6407 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6408 Mask[DstByte + i] = SrcByte + i;
6409 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6410 Mask[DstByte + i] = SM_SentinelZero;
6411 return true;
6412 }
6413 case X86ISD::PACKSS:
6414 case X86ISD::PACKUS: {
6415 SDValue N0 = N.getOperand(0);
6416 SDValue N1 = N.getOperand(1);
6417 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6418 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6419 "Unexpected input value type");
6420
6421 APInt EltsLHS, EltsRHS;
6422 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6423
6424 // If we know input saturation won't happen (or we don't care for particular
6425 // lanes), we can treat this as a truncation shuffle.
6426 bool Offset0 = false, Offset1 = false;
6427 if (Opcode == X86ISD::PACKSS) {
6428 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6429 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6430 (!(N1.isUndef() || EltsRHS.isZero()) &&
6431 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6432 return false;
6433 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6434 // PACKSS then it was likely being used for sign-extension for a
6435 // truncation, so just peek through and adjust the mask accordingly.
6436 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6437 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6438 Offset0 = true;
6439 N0 = N0.getOperand(0);
6440 }
6441 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6442 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6443 Offset1 = true;
6444 N1 = N1.getOperand(0);
6445 }
6446 } else {
6447 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6448 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6449 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6450 (!(N1.isUndef() || EltsRHS.isZero()) &&
6451 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6452 return false;
6453 }
6454
6455 bool IsUnary = (N0 == N1);
6456
6457 Ops.push_back(N0);
6458 if (!IsUnary)
6459 Ops.push_back(N1);
6460
6461 createPackShuffleMask(VT, Mask, IsUnary);
6462
6463 if (Offset0 || Offset1) {
6464 for (int &M : Mask)
6465 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6466 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6467 ++M;
6468 }
6469 return true;
6470 }
6471 case ISD::VSELECT:
6472 case X86ISD::BLENDV: {
6473 SDValue Cond = N.getOperand(0);
6474 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6475 Ops.push_back(N.getOperand(1));
6476 Ops.push_back(N.getOperand(2));
6477 return true;
6478 }
6479 return false;
6480 }
6481 case X86ISD::VTRUNC: {
6482 SDValue Src = N.getOperand(0);
6483 EVT SrcVT = Src.getValueType();
6484 if (SrcVT.getSizeInBits() != NumSizeInBits)
6485 return false;
6486 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6487 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6488 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6489 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6490 for (unsigned i = 0; i != NumSrcElts; ++i)
6491 Mask.push_back(i * Scale);
6492 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6493 Ops.push_back(Src);
6494 return true;
6495 }
6496 case ISD::SHL:
6497 case ISD::SRL: {
6498 APInt UndefElts;
6499 SmallVector<APInt, 32> EltBits;
6500 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6501 UndefElts, EltBits,
6502 /*AllowWholeUndefs*/ true,
6503 /*AllowPartialUndefs*/ false))
6504 return false;
6505
6506 // We can only decode 'whole byte' bit shifts as shuffles.
6507 for (unsigned I = 0; I != NumElts; ++I)
6508 if (DemandedElts[I] && !UndefElts[I] &&
6509 (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6510 return false;
6511
6512 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6513 Ops.push_back(N.getOperand(0));
6514
6515 for (unsigned I = 0; I != NumElts; ++I) {
6516 if (!DemandedElts[I] || UndefElts[I])
6517 continue;
6518 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6519 unsigned Lo = I * NumBytesPerElt;
6520 unsigned Hi = Lo + NumBytesPerElt;
6521 // Clear mask to all zeros and insert the shifted byte indices.
6522 std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6523 if (ISD::SHL == Opcode)
6524 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6525 else
6526 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6527 Lo + ByteShift);
6528 }
6529 return true;
6530 }
6531 case X86ISD::VSHLI:
6532 case X86ISD::VSRLI: {
6533 uint64_t ShiftVal = N.getConstantOperandVal(1);
6534 // Out of range bit shifts are guaranteed to be zero.
6535 if (NumBitsPerElt <= ShiftVal) {
6536 Mask.append(NumElts, SM_SentinelZero);
6537 return true;
6538 }
6539
6540 // We can only decode 'whole byte' bit shifts as shuffles.
6541 if ((ShiftVal % 8) != 0)
6542 break;
6543
6544 uint64_t ByteShift = ShiftVal / 8;
6545 Ops.push_back(N.getOperand(0));
6546
6547 // Clear mask to all zeros and insert the shifted byte indices.
6548 Mask.append(NumSizeInBytes, SM_SentinelZero);
6549
6550 if (X86ISD::VSHLI == Opcode) {
6551 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6552 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6553 Mask[i + j] = i + j - ByteShift;
6554 } else {
6555 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6556 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6557 Mask[i + j - ByteShift] = i + j;
6558 }
6559 return true;
6560 }
6561 case X86ISD::VROTLI:
6562 case X86ISD::VROTRI: {
6563 // We can only decode 'whole byte' bit rotates as shuffles.
6564 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6565 if ((RotateVal % 8) != 0)
6566 return false;
6567 Ops.push_back(N.getOperand(0));
6568 int Offset = RotateVal / 8;
6569 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6570 for (int i = 0; i != (int)NumElts; ++i) {
6571 int BaseIdx = i * NumBytesPerElt;
6572 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6573 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6574 }
6575 }
6576 return true;
6577 }
6578 case X86ISD::VBROADCAST: {
6579 SDValue Src = N.getOperand(0);
6580 if (!Src.getSimpleValueType().isVector()) {
6581 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6582 !isNullConstant(Src.getOperand(1)) ||
6583 Src.getOperand(0).getValueType().getScalarType() !=
6584 VT.getScalarType())
6585 return false;
6586 Src = Src.getOperand(0);
6587 }
6588 Ops.push_back(Src);
6589 Mask.append(NumElts, 0);
6590 return true;
6591 }
6592 case ISD::SIGN_EXTEND_VECTOR_INREG: {
6593 SDValue Src = N.getOperand(0);
6594 EVT SrcVT = Src.getValueType();
6595 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6596
6597 // Extended source must be a simple vector.
6598 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6599 (NumBitsPerSrcElt % 8) != 0)
6600 return false;
6601
6602 // We can only handle all-signbits extensions.
6603 APInt DemandedSrcElts =
6604 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6605 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6606 return false;
6607
6608 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6609 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6610 for (unsigned I = 0; I != NumElts; ++I)
6611 Mask.append(Scale, I);
6612 Ops.push_back(Src);
6613 return true;
6614 }
6615 case ISD::ZERO_EXTEND:
6616 case ISD::ANY_EXTEND:
6617 case ISD::ZERO_EXTEND_VECTOR_INREG:
6618 case ISD::ANY_EXTEND_VECTOR_INREG: {
6619 SDValue Src = N.getOperand(0);
6620 EVT SrcVT = Src.getValueType();
6621
6622 // Extended source must be a simple vector.
6623 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6624 (SrcVT.getScalarSizeInBits() % 8) != 0)
6625 return false;
6626
6627 bool IsAnyExtend =
6628 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6629 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6630 IsAnyExtend, Mask);
6631 Ops.push_back(Src);
6632 return true;
6633 }
6634 }
6635
6636 return false;
6637}
6638
6639/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
6640 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6641 SmallVectorImpl<int> &Mask) {
6642 int MaskWidth = Mask.size();
6643 SmallVector<SDValue, 16> UsedInputs;
6644 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6645 int lo = UsedInputs.size() * MaskWidth;
6646 int hi = lo + MaskWidth;
6647
6648 // Strip UNDEF input usage.
6649 if (Inputs[i].isUndef())
6650 for (int &M : Mask)
6651 if ((lo <= M) && (M < hi))
6652 M = SM_SentinelUndef;
6653
6654 // Check for unused inputs.
6655 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6656 for (int &M : Mask)
6657 if (lo <= M)
6658 M -= MaskWidth;
6659 continue;
6660 }
6661
6662 // Check for repeated inputs.
6663 bool IsRepeat = false;
6664 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6665 if (UsedInputs[j] != Inputs[i])
6666 continue;
6667 for (int &M : Mask)
6668 if (lo <= M)
6669 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6670 IsRepeat = true;
6671 break;
6672 }
6673 if (IsRepeat)
6674 continue;
6675
6676 UsedInputs.push_back(Inputs[i]);
6677 }
6678 Inputs = UsedInputs;
6679}
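// Illustrative example: with a mask width of 4, Inputs = {A, A} and
// Mask = <0,1,4,5>, the repeated input is dropped and the mask becomes
// <0,1,0,1> over the single remaining input A.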
6680
6681/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6682/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6683/// Returns true if the target shuffle mask was decoded.
6684static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6685 SmallVectorImpl<SDValue> &Inputs,
6686 SmallVectorImpl<int> &Mask,
6687 APInt &KnownUndef, APInt &KnownZero,
6688 const SelectionDAG &DAG, unsigned Depth,
6689 bool ResolveKnownElts) {
6690 if (Depth >= SelectionDAG::MaxRecursionDepth)
6691 return false; // Limit search depth.
6692
6693 EVT VT = Op.getValueType();
6694 if (!VT.isSimple() || !VT.isVector())
6695 return false;
6696
6697 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6698 if (ResolveKnownElts)
6699 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6700 return true;
6701 }
6702 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6703 ResolveKnownElts)) {
6704 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6705 return true;
6706 }
6707 return false;
6708}
6709
6710static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6711 SmallVectorImpl<SDValue> &Inputs,
6712 SmallVectorImpl<int> &Mask,
6713 const SelectionDAG &DAG, unsigned Depth,
6714 bool ResolveKnownElts) {
6715 APInt KnownUndef, KnownZero;
6716 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6717 KnownZero, DAG, Depth, ResolveKnownElts);
6718}
6719
6720 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6721 SmallVectorImpl<int> &Mask,
6722 const SelectionDAG &DAG, unsigned Depth = 0,
6723 bool ResolveKnownElts = true) {
6724 EVT VT = Op.getValueType();
6725 if (!VT.isSimple() || !VT.isVector())
6726 return false;
6727
6728 unsigned NumElts = Op.getValueType().getVectorNumElements();
6729 APInt DemandedElts = APInt::getAllOnes(NumElts);
6730 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6731 ResolveKnownElts);
6732}
6733
6734// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6735static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6736 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6737 SelectionDAG &DAG) {
6738 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6739 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6740 "Unknown broadcast load type");
6741
6742 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6743 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6744 return SDValue();
6745
6746 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6747 TypeSize::getFixed(Offset), DL);
6748 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6749 SDValue Ops[] = {Mem->getChain(), Ptr};
6750 SDValue BcstLd = DAG.getMemIntrinsicNode(
6751 Opcode, DL, Tys, Ops, MemVT,
6752 DAG.getMachineFunction().getMachineMemOperand(
6753 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6754 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6755 return BcstLd;
6756}
6757
6758/// Returns the scalar element that will make up the i'th
6759/// element of the result of the vector shuffle.
6760static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6761 SelectionDAG &DAG, unsigned Depth) {
6762 if (Depth >= SelectionDAG::MaxRecursionDepth)
6763 return SDValue(); // Limit search depth.
6764
6765 EVT VT = Op.getValueType();
6766 unsigned Opcode = Op.getOpcode();
6767 unsigned NumElems = VT.getVectorNumElements();
6768
6769 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6770 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6771 int Elt = SV->getMaskElt(Index);
6772
6773 if (Elt < 0)
6774 return DAG.getUNDEF(VT.getVectorElementType());
6775
6776 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6777 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6778 }
6779
6780 // Recurse into target specific vector shuffles to find scalars.
6781 if (isTargetShuffle(Opcode)) {
6782 MVT ShufVT = VT.getSimpleVT();
6783 MVT ShufSVT = ShufVT.getVectorElementType();
6784 int NumElems = (int)ShufVT.getVectorNumElements();
6785 SmallVector<int, 16> ShuffleMask;
6786 SmallVector<SDValue, 16> ShuffleOps;
6787 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6788 return SDValue();
6789
6790 int Elt = ShuffleMask[Index];
6791 if (Elt == SM_SentinelZero)
6792 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6793 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6794 if (Elt == SM_SentinelUndef)
6795 return DAG.getUNDEF(ShufSVT);
6796
6797 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6798 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6799 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6800 }
6801
6802 // Recurse into insert_subvector base/sub vector to find scalars.
6803 if (Opcode == ISD::INSERT_SUBVECTOR) {
6804 SDValue Vec = Op.getOperand(0);
6805 SDValue Sub = Op.getOperand(1);
6806 uint64_t SubIdx = Op.getConstantOperandVal(2);
6807 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6808
6809 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6810 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6811 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6812 }
6813
6814 // Recurse into concat_vectors sub vector to find scalars.
6815 if (Opcode == ISD::CONCAT_VECTORS) {
6816 EVT SubVT = Op.getOperand(0).getValueType();
6817 unsigned NumSubElts = SubVT.getVectorNumElements();
6818 uint64_t SubIdx = Index / NumSubElts;
6819 uint64_t SubElt = Index % NumSubElts;
6820 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6821 }
6822
6823 // Recurse into extract_subvector src vector to find scalars.
6824 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6825 SDValue Src = Op.getOperand(0);
6826 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6827 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6828 }
6829
6830 // We only peek through bitcasts of the same vector width.
6831 if (Opcode == ISD::BITCAST) {
6832 SDValue Src = Op.getOperand(0);
6833 EVT SrcVT = Src.getValueType();
6834 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6835 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6836 return SDValue();
6837 }
6838
6839 // Actual nodes that may contain scalar elements
6840
6841 // For insert_vector_elt - either return the index matching scalar or recurse
6842 // into the base vector.
6843 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6844 isa<ConstantSDNode>(Op.getOperand(2))) {
6845 if (Op.getConstantOperandAPInt(2) == Index)
6846 return Op.getOperand(1);
6847 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6848 }
6849
6850 if (Opcode == ISD::SCALAR_TO_VECTOR)
6851 return (Index == 0) ? Op.getOperand(0)
6852 : DAG.getUNDEF(VT.getVectorElementType());
6853
6854 if (Opcode == ISD::BUILD_VECTOR)
6855 return Op.getOperand(Index);
6856
6857 return SDValue();
6858}
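// Illustrative example: for a v4i32 shuffle with mask <2,5,1,7>, Index 1
// resolves to element 1 of the second shuffle operand, and the search then
// recurses into that operand.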
6859
6860// Use PINSRB/PINSRW/PINSRD to create a build vector.
6861 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6862 const APInt &NonZeroMask,
6863 unsigned NumNonZero, unsigned NumZero,
6864 SelectionDAG &DAG,
6865 const X86Subtarget &Subtarget) {
6866 MVT VT = Op.getSimpleValueType();
6867 unsigned NumElts = VT.getVectorNumElements();
6868 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6869 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6870 "Illegal vector insertion");
6871
6872 SDValue V;
6873 bool First = true;
6874
6875 for (unsigned i = 0; i < NumElts; ++i) {
6876 bool IsNonZero = NonZeroMask[i];
6877 if (!IsNonZero)
6878 continue;
6879
6880 // If the build vector contains zeros or our first insertion is not the
6881 // first index, then insert into a zero vector to break any register
6882 // dependency; else use SCALAR_TO_VECTOR.
6883 if (First) {
6884 First = false;
6885 if (NumZero || 0 != i)
6886 V = getZeroVector(VT, Subtarget, DAG, DL);
6887 else {
6888 assert(0 == i && "Expected insertion into zero-index");
6889 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6890 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6891 V = DAG.getBitcast(VT, V);
6892 continue;
6893 }
6894 }
6895 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6896 DAG.getVectorIdxConstant(i, DL));
6897 }
6898
6899 return V;
6900}
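// Illustrative example: building a v4i32 with non-zero operands at indices 0
// and 2 and zeros elsewhere starts from a zero vector and emits two
// INSERT_VECTOR_ELT nodes (PINSRD on SSE4.1) for elements 0 and 2.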
6901
6902/// Custom lower build_vector of v16i8.
6903 static SDValue LowerBuildVectorV16I8(SDValue Op, const SDLoc &DL,
6904 const APInt &NonZeroMask,
6905 unsigned NumNonZero, unsigned NumZero,
6906 SelectionDAG &DAG,
6907 const X86Subtarget &Subtarget) {
6908 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6909 return SDValue();
6910
6911 // SSE4.1 - use PINSRB to insert each byte directly.
6912 if (Subtarget.hasSSE41())
6913 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6914 DAG, Subtarget);
6915
6916 SDValue V;
6917
6918 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6919 // If both of the two lowest 16-bit elements are non-zero, then convert to MOVD.
6920 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6921 !NonZeroMask.extractBits(2, 2).isZero()) {
6922 for (unsigned I = 0; I != 4; ++I) {
6923 if (!NonZeroMask[I])
6924 continue;
6925 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6926 if (I != 0)
6927 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6928 DAG.getConstant(I * 8, DL, MVT::i8));
6929 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6930 }
6931 assert(V && "Failed to fold v16i8 vector to zero");
6932 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6933 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6934 V = DAG.getBitcast(MVT::v8i16, V);
6935 }
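  // For illustration: each remaining pair of bytes b0 (even index i) and b1
  // (index i+1) is merged into the 16-bit value (b1 << 8) | b0 and inserted
  // into word slot i/2 with a single PINSRW.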
6936 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6937 bool ThisIsNonZero = NonZeroMask[i];
6938 bool NextIsNonZero = NonZeroMask[i + 1];
6939 if (!ThisIsNonZero && !NextIsNonZero)
6940 continue;
6941
6942 SDValue Elt;
6943 if (ThisIsNonZero) {
6944 if (NumZero || NextIsNonZero)
6945 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6946 else
6947 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6948 }
6949
6950 if (NextIsNonZero) {
6951 SDValue NextElt = Op.getOperand(i + 1);
6952 if (i == 0 && NumZero)
6953 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6954 else
6955 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6956 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6957 DAG.getConstant(8, DL, MVT::i8));
6958 if (ThisIsNonZero)
6959 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6960 else
6961 Elt = NextElt;
6962 }
6963
6964 // If our first insertion is not the first index or zeros are needed, then
6965 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6966 // elements undefined).
6967 if (!V) {
6968 if (i != 0 || NumZero)
6969 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6970 else {
6971 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6972 V = DAG.getBitcast(MVT::v8i16, V);
6973 continue;
6974 }
6975 }
6976 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6977 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6978 DAG.getVectorIdxConstant(i / 2, DL));
6979 }
6980
6981 return DAG.getBitcast(MVT::v16i8, V);
6982}
6983
6984/// Custom lower build_vector of v8i16.
6985static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6986                                     const APInt &NonZeroMask,
6987 unsigned NumNonZero, unsigned NumZero,
6988 SelectionDAG &DAG,
6989 const X86Subtarget &Subtarget) {
6990 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6991 return SDValue();
6992
6993  // Use PINSRW to insert each element directly.
6994 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6995 Subtarget);
6996}
6997
6998/// Custom lower build_vector of v4i32 or v4f32.
6999static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
7000                                     SelectionDAG &DAG,
7001 const X86Subtarget &Subtarget) {
7002 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
7003 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
7004 // Because we're creating a less complicated build vector here, we may enable
7005 // further folding of the MOVDDUP via shuffle transforms.
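  // For illustration: (build_vector a, b, a, b) is rebuilt here as
  // (build_vector a, b, undef, undef), bitcast to v2f64, duplicated with
  // MOVDDUP, and bitcast back to the original type.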
7006 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
7007 Op.getOperand(0) == Op.getOperand(2) &&
7008 Op.getOperand(1) == Op.getOperand(3) &&
7009 Op.getOperand(0) != Op.getOperand(1)) {
7010 MVT VT = Op.getSimpleValueType();
7011 MVT EltVT = VT.getVectorElementType();
7012 // Create a new build vector with the first 2 elements followed by undef
7013 // padding, bitcast to v2f64, duplicate, and bitcast back.
7014 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7015 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7016 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7017 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7018 return DAG.getBitcast(VT, Dup);
7019 }
7020
7021 // Find all zeroable elements.
7022 std::bitset<4> Zeroable, Undefs;
7023 for (int i = 0; i < 4; ++i) {
7024 SDValue Elt = Op.getOperand(i);
7025 Undefs[i] = Elt.isUndef();
7026 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7027 }
7028 assert(Zeroable.size() - Zeroable.count() > 1 &&
7029 "We expect at least two non-zero elements!");
7030
7031 // We only know how to deal with build_vector nodes where elements are either
7032 // zeroable or extract_vector_elt with constant index.
7033 SDValue FirstNonZero;
7034 unsigned FirstNonZeroIdx;
7035 for (unsigned i = 0; i < 4; ++i) {
7036 if (Zeroable[i])
7037 continue;
7038 SDValue Elt = Op.getOperand(i);
7039 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7040        !isa<ConstantSDNode>(Elt.getOperand(1)))
7041      return SDValue();
7042 // Make sure that this node is extracting from a 128-bit vector.
7043 MVT VT = Elt.getOperand(0).getSimpleValueType();
7044 if (!VT.is128BitVector())
7045 return SDValue();
7046 if (!FirstNonZero.getNode()) {
7047 FirstNonZero = Elt;
7048 FirstNonZeroIdx = i;
7049 }
7050 }
7051
7052 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7053 SDValue V1 = FirstNonZero.getOperand(0);
7054 MVT VT = V1.getSimpleValueType();
7055
7056 // See if this build_vector can be lowered as a blend with zero.
7057 SDValue Elt;
7058 unsigned EltMaskIdx, EltIdx;
7059 int Mask[4];
7060 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7061 if (Zeroable[EltIdx]) {
7062 // The zero vector will be on the right hand side.
7063 Mask[EltIdx] = EltIdx+4;
7064 continue;
7065 }
7066
7067 Elt = Op->getOperand(EltIdx);
7068 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
7069 EltMaskIdx = Elt.getConstantOperandVal(1);
7070 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7071 break;
7072 Mask[EltIdx] = EltIdx;
7073 }
7074
7075 if (EltIdx == 4) {
7076 // Let the shuffle legalizer deal with blend operations.
7077 SDValue VZeroOrUndef = (Zeroable == Undefs)
7078 ? DAG.getUNDEF(VT)
7079 : getZeroVector(VT, Subtarget, DAG, DL);
7080 if (V1.getSimpleValueType() != VT)
7081 V1 = DAG.getBitcast(VT, V1);
7082 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7083 }
7084
7085 // See if we can lower this build_vector to a INSERTPS.
7086 if (!Subtarget.hasSSE41())
7087 return SDValue();
7088
7089 SDValue V2 = Elt.getOperand(0);
7090 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7091 V1 = SDValue();
7092
7093 bool CanFold = true;
7094 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7095 if (Zeroable[i])
7096 continue;
7097
7098 SDValue Current = Op->getOperand(i);
7099 SDValue SrcVector = Current->getOperand(0);
7100 if (!V1.getNode())
7101 V1 = SrcVector;
7102 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7103 }
7104
7105 if (!CanFold)
7106 return SDValue();
7107
7108 assert(V1.getNode() && "Expected at least two non-zero elements!");
7109 if (V1.getSimpleValueType() != MVT::v4f32)
7110 V1 = DAG.getBitcast(MVT::v4f32, V1);
7111 if (V2.getSimpleValueType() != MVT::v4f32)
7112 V2 = DAG.getBitcast(MVT::v4f32, V2);
7113
7114 // Ok, we can emit an INSERTPS instruction.
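  // The INSERTPS immediate encodes: bits [7:6] = source element (CountS),
  // bits [5:4] = destination element (CountD), bits [3:0] = zero mask.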
7115 unsigned ZMask = Zeroable.to_ulong();
7116
7117 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7118 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7119 SDValue Result =
7120 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7121 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
7122 return DAG.getBitcast(VT, Result);
7123}
7124
7125/// Return a vector logical shift node.
7126static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7127 SelectionDAG &DAG, const TargetLowering &TLI,
7128 const SDLoc &dl) {
7129 assert(VT.is128BitVector() && "Unknown type for VShift");
7130 MVT ShVT = MVT::v16i8;
7131 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7132 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7133 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7134 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7135 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7136}
7137
7138static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
7139                                      SelectionDAG &DAG) {
7140
7141 // Check if the scalar load can be widened into a vector load. And if
7142 // the address is "base + cst" see if the cst can be "absorbed" into
7143 // the shuffle mask.
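  // For illustration: a 4-byte load from a 16-byte aligned stack slot at
  // offset 8 can be widened to a v4i32 load of the whole slot followed by a
  // splat shuffle of element 2 (assuming a 128-bit result type).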
7144  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7145    SDValue Ptr = LD->getBasePtr();
7146 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7147 return SDValue();
7148 EVT PVT = LD->getValueType(0);
7149 if (PVT != MVT::i32 && PVT != MVT::f32)
7150 return SDValue();
7151
7152 int FI = -1;
7153 int64_t Offset = 0;
7154    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7155      FI = FINode->getIndex();
7156 Offset = 0;
7157 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7158 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7159 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7160 Offset = Ptr.getConstantOperandVal(1);
7161 Ptr = Ptr.getOperand(0);
7162 } else {
7163 return SDValue();
7164 }
7165
7166 // FIXME: 256-bit vector instructions don't require a strict alignment,
7167 // improve this code to support it better.
7168 Align RequiredAlign(VT.getSizeInBits() / 8);
7169 SDValue Chain = LD->getChain();
7170 // Make sure the stack object alignment is at least 16 or 32.
7171    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7172    MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7173 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7174 if (MFI.isFixedObjectIndex(FI)) {
7175 // Can't change the alignment. FIXME: It's possible to compute
7176 // the exact stack offset and reference FI + adjust offset instead.
7177        // If someone *really* cares about this, that's the way to implement it.
7178 return SDValue();
7179 } else {
7180 MFI.setObjectAlignment(FI, RequiredAlign);
7181 }
7182 }
7183
7184    // (Offset % 16 or 32) must be a multiple of 4. The address is then
7185 // Ptr + (Offset & ~15).
7186 if (Offset < 0)
7187 return SDValue();
7188 if ((Offset % RequiredAlign.value()) & 3)
7189 return SDValue();
7190 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7191 if (StartOffset) {
7192 SDLoc DL(Ptr);
7193 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7194 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7195 }
7196
7197 int EltNo = (Offset - StartOffset) >> 2;
7198 unsigned NumElems = VT.getVectorNumElements();
7199
7200 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7201 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7202 LD->getPointerInfo().getWithOffset(StartOffset));
7203
7204 SmallVector<int, 8> Mask(NumElems, EltNo);
7205
7206 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7207 }
7208
7209 return SDValue();
7210}
7211
7212// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
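// For illustration: (trunc (srl (load i32 p), 16)) resolves to the i32 load
// with an accumulated ByteOffset of 2.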
7213static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7214 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7215 auto *BaseLd = cast<LoadSDNode>(Elt);
7216 if (!BaseLd->isSimple())
7217 return false;
7218 Ld = BaseLd;
7219 ByteOffset = 0;
7220 return true;
7221 }
7222
7223 switch (Elt.getOpcode()) {
7224 case ISD::BITCAST:
7225 case ISD::TRUNCATE:
7226  case ISD::SCALAR_TO_VECTOR:
7227    return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7228 case ISD::SRL:
7229 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7230 uint64_t Amt = AmtC->getZExtValue();
7231 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7232 ByteOffset += Amt / 8;
7233 return true;
7234 }
7235 }
7236 break;
7237  case ISD::EXTRACT_VECTOR_ELT:
7238    if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7239 SDValue Src = Elt.getOperand(0);
7240 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7241 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7242 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7243 findEltLoadSrc(Src, Ld, ByteOffset)) {
7244 uint64_t Idx = IdxC->getZExtValue();
7245 ByteOffset += Idx * (SrcSizeInBits / 8);
7246 return true;
7247 }
7248 }
7249 break;
7250 }
7251
7252 return false;
7253}
7254
7255/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7256/// elements can be replaced by a single large load which has the same value as
7257/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7258///
7259/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7260static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
7261                                        const SDLoc &DL, SelectionDAG &DAG,
7262 const X86Subtarget &Subtarget,
7263 bool IsAfterLegalize) {
7264 if ((VT.getScalarSizeInBits() % 8) != 0)
7265 return SDValue();
7266
7267 unsigned NumElems = Elts.size();
7268
7269 int LastLoadedElt = -1;
7270 APInt LoadMask = APInt::getZero(NumElems);
7271 APInt ZeroMask = APInt::getZero(NumElems);
7272 APInt UndefMask = APInt::getZero(NumElems);
7273
7274 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7275 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7276
7277 // For each element in the initializer, see if we've found a load, zero or an
7278 // undef.
7279 for (unsigned i = 0; i < NumElems; ++i) {
7280 SDValue Elt = peekThroughBitcasts(Elts[i]);
7281 if (!Elt.getNode())
7282 return SDValue();
7283 if (Elt.isUndef()) {
7284 UndefMask.setBit(i);
7285 continue;
7286 }
7287    if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
7288      ZeroMask.setBit(i);
7289 continue;
7290 }
7291
7292 // Each loaded element must be the correct fractional portion of the
7293 // requested vector load.
7294 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7295 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7296 return SDValue();
7297
7298 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7299 return SDValue();
7300 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7301 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7302 return SDValue();
7303
7304 LoadMask.setBit(i);
7305 LastLoadedElt = i;
7306 }
7307 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7308 NumElems &&
7309 "Incomplete element masks");
7310
7311 // Handle Special Cases - all undef or undef/zero.
7312 if (UndefMask.popcount() == NumElems)
7313 return DAG.getUNDEF(VT);
7314 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7315 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7316 : DAG.getConstantFP(0.0, DL, VT);
7317
7318 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7319 int FirstLoadedElt = LoadMask.countr_zero();
7320 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7321 EVT EltBaseVT = EltBase.getValueType();
7322 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7323 "Register/Memory size mismatch");
7324 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7325 assert(LDBase && "Did not find base load for merging consecutive loads");
7326 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7327 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7328 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7329 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7330 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7331
7332 // TODO: Support offsetting the base load.
7333 if (ByteOffsets[FirstLoadedElt] != 0)
7334 return SDValue();
7335
7336 // Check to see if the element's load is consecutive to the base load
7337 // or offset from a previous (already checked) load.
7338 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7339 LoadSDNode *Ld = Loads[EltIdx];
7340 int64_t ByteOffset = ByteOffsets[EltIdx];
7341 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7342 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7343 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7344 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7345 }
7346 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7347 EltIdx - FirstLoadedElt);
7348 };
7349
7350  // Consecutive loads can contain UNDEFs but not ZERO elements.
7351  // Consecutive loads with UNDEF and ZERO elements require an
7352  // additional shuffle stage to clear the ZERO elements.
7353 bool IsConsecutiveLoad = true;
7354 bool IsConsecutiveLoadWithZeros = true;
7355 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7356 if (LoadMask[i]) {
7357 if (!CheckConsecutiveLoad(LDBase, i)) {
7358 IsConsecutiveLoad = false;
7359 IsConsecutiveLoadWithZeros = false;
7360 break;
7361 }
7362 } else if (ZeroMask[i]) {
7363 IsConsecutiveLoad = false;
7364 }
7365 }
7366
7367 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7368 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7369 assert(LDBase->isSimple() &&
7370 "Cannot merge volatile or atomic loads.");
7371 SDValue NewLd =
7372 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7373 LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
7374 for (auto *LD : Loads)
7375 if (LD)
7376 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7377 return NewLd;
7378 };
7379
7380 // Check if the base load is entirely dereferenceable.
7381 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7382 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7383
7384 // LOAD - all consecutive load/undefs (must start/end with a load or be
7385 // entirely dereferenceable). If we have found an entire vector of loads and
7386 // undefs, then return a large load of the entire vector width starting at the
7387 // base pointer. If the vector contains zeros, then attempt to shuffle those
7388 // elements.
7389 if (FirstLoadedElt == 0 &&
7390 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7391 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7392 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7393 return SDValue();
7394
7395 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7396 // will lower to regular temporal loads and use the cache.
7397 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7398 VT.is256BitVector() && !Subtarget.hasInt256())
7399 return SDValue();
7400
7401 if (NumElems == 1)
7402 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7403
7404 if (!ZeroMask)
7405 return CreateLoad(VT, LDBase);
7406
7407 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7408 // vector and a zero vector to clear out the zero elements.
7409 if (!IsAfterLegalize && VT.isVector()) {
7410 unsigned NumMaskElts = VT.getVectorNumElements();
7411 if ((NumMaskElts % NumElems) == 0) {
7412 unsigned Scale = NumMaskElts / NumElems;
7413 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7414 for (unsigned i = 0; i < NumElems; ++i) {
7415 if (UndefMask[i])
7416 continue;
7417 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7418 for (unsigned j = 0; j != Scale; ++j)
7419 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7420 }
7421 SDValue V = CreateLoad(VT, LDBase);
7422 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7423 : DAG.getConstantFP(0.0, DL, VT);
7424 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7425 }
7426 }
7427 }
7428
7429 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7430 if (VT.is256BitVector() || VT.is512BitVector()) {
7431 unsigned HalfNumElems = NumElems / 2;
7432 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7433 EVT HalfVT =
7434 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7435 SDValue HalfLD =
7436 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7437 DAG, Subtarget, IsAfterLegalize);
7438 if (HalfLD)
7439 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7440 HalfLD, DAG.getVectorIdxConstant(0, DL));
7441 }
7442 }
7443
7444 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7445 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7446 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7447 LoadSizeInBits == 64) &&
7448 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7449 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7450 : MVT::getIntegerVT(LoadSizeInBits);
7451 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7452 // Allow v4f32 on SSE1 only targets.
7453 // FIXME: Add more isel patterns so we can just use VT directly.
7454 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7455 VecVT = MVT::v4f32;
7456 if (TLI.isTypeLegal(VecVT)) {
7457 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7458 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7459 SDValue ResNode = DAG.getMemIntrinsicNode(
7460 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7461          LDBase->getBaseAlign(), MachineMemOperand::MOLoad);
7462      for (auto *LD : Loads)
7463 if (LD)
7464 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7465 return DAG.getBitcast(VT, ResNode);
7466 }
7467 }
7468
7469 // BROADCAST - match the smallest possible repetition pattern, load that
7470 // scalar/subvector element and then broadcast to the entire vector.
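  // For illustration: v8i32 <a, b, a, b, a, b, a, b> built from loads of a and
  // b matches a repetition with SubElems == 2, so the 64-bit pair {a, b} is
  // loaded once and broadcast to fill the whole vector.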
7471 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7472 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7473 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7474 unsigned RepeatSize = SubElems * BaseSizeInBits;
7475 unsigned ScalarSize = std::min(RepeatSize, 64u);
7476 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7477 continue;
7478
7479      // Don't attempt a 1:N subvector broadcast - it should be caught by
7480      // combineConcatVectorOps, otherwise it will cause infinite loops.
7481 if (RepeatSize > ScalarSize && SubElems == 1)
7482 continue;
7483
7484 bool Match = true;
7485 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7486 for (unsigned i = 0; i != NumElems && Match; ++i) {
7487 if (!LoadMask[i])
7488 continue;
7489 SDValue Elt = peekThroughBitcasts(Elts[i]);
7490 if (RepeatedLoads[i % SubElems].isUndef())
7491 RepeatedLoads[i % SubElems] = Elt;
7492 else
7493 Match &= (RepeatedLoads[i % SubElems] == Elt);
7494 }
7495
7496 // We must have loads at both ends of the repetition.
7497 Match &= !RepeatedLoads.front().isUndef();
7498 Match &= !RepeatedLoads.back().isUndef();
7499 if (!Match)
7500 continue;
7501
7502 EVT RepeatVT =
7503 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7504 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7505 : EVT::getFloatingPointVT(ScalarSize);
7506 if (RepeatSize > ScalarSize)
7507 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7508 RepeatSize / ScalarSize);
7509 EVT BroadcastVT =
7510 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7511 VT.getSizeInBits() / ScalarSize);
7512 if (TLI.isTypeLegal(BroadcastVT)) {
7513 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7514 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7515 SDValue Broadcast = RepeatLoad;
7516 if (RepeatSize > ScalarSize) {
7517 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7518 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7519 } else {
7520 if (!Subtarget.hasAVX2() &&
7521              !X86::mayFoldLoadIntoBroadcastFromMem(
7522                  RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7523 Subtarget,
7524 /*AssumeSingleUse=*/true))
7525 return SDValue();
7526 Broadcast =
7527 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7528 }
7529 return DAG.getBitcast(VT, Broadcast);
7530 }
7531 }
7532 }
7533 }
7534
7535 return SDValue();
7536}
7537
7538// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
7539// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7540// are consecutive, non-overlapping, and in the right order.
7541static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7542                                         SelectionDAG &DAG,
7543 const X86Subtarget &Subtarget,
7544 bool IsAfterLegalize) {
7545  SmallVector<SDValue, 64> Elts;
7546  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7547 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7548 Elts.push_back(Elt);
7549 continue;
7550 }
7551 return SDValue();
7552 }
7553 assert(Elts.size() == VT.getVectorNumElements());
7554 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7555 IsAfterLegalize);
7556}
7557
7558static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7559                                   const APInt &Undefs, LLVMContext &C) {
7560 unsigned ScalarSize = VT.getScalarSizeInBits();
7561 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7562
7563 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7564 if (VT.isFloatingPoint()) {
7565 if (ScalarSize == 16)
7566 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7567 if (ScalarSize == 32)
7568 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7569 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7570 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7571 }
7572 return Constant::getIntegerValue(Ty, Val);
7573 };
7574
7575 SmallVector<Constant *, 32> ConstantVec;
7576 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7577 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7578 : getConstantScalar(Bits[I]));
7579
7580 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7581}
7582
7583static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7584 unsigned SplatBitSize, LLVMContext &C) {
7585 unsigned ScalarSize = VT.getScalarSizeInBits();
7586
7587 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7588 if (VT.isFloatingPoint()) {
7589 if (ScalarSize == 16)
7590 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7591 if (ScalarSize == 32)
7592 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7593 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7594 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7595 }
7596 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7597 };
7598
7599 if (ScalarSize == SplatBitSize)
7600 return getConstantScalar(SplatValue);
7601
7602 unsigned NumElm = SplatBitSize / ScalarSize;
7603 SmallVector<Constant *, 32> ConstantVec;
7604 for (unsigned I = 0; I != NumElm; ++I) {
7605 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7606 ConstantVec.push_back(getConstantScalar(Val));
7607 }
7608 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7609}
7610
7611static bool isFoldableUseOfShuffle(SDNode *N) {
7612  for (auto *U : N->users()) {
7613 unsigned Opc = U->getOpcode();
7614 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7615 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7616 return false;
7617 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7618 return false;
7619 if (isTargetShuffle(Opc))
7620 return true;
7621 if (Opc == ISD::BITCAST) // Ignore bitcasts
7622 return isFoldableUseOfShuffle(U);
7623 if (N->hasOneUse()) {
7624      // TODO: There may be some general way to know if an SDNode can
7625      // be folded. For now we only know whether an MI is foldable.
7626 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7627 return false;
7628 return true;
7629 }
7630 }
7631 return false;
7632}
7633
7634// If the node has a single use by a VSELECT then AVX512 targets may be able
7635// to fold it as a predicated instruction.
7636static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) {
7637 unsigned SizeInBits = V.getValueSizeInBits();
7638 if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
7639 (SizeInBits >= 128 && Subtarget.hasVLX())) {
7640 if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
7641 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
7642 return true;
7643 }
7644 }
7645 return false;
7646}
7647
7648/// Attempt to use the vbroadcast instruction to generate a splat value
7649/// from a splat BUILD_VECTOR which uses:
7650/// a. A single scalar load, or a constant.
7651/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7652///
7653/// The VBROADCAST node is returned when a pattern is found,
7654/// or SDValue() otherwise.
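/// For illustration: a v8f32 build_vector that splats a single loaded float
/// becomes a VBROADCAST_LOAD of that float, while a repeated constant pattern
/// such as <0,1,0,1,...> is loaded from the constant pool as a wider scalar
/// and then broadcast.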
7655static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7656                                           const SDLoc &dl,
7657 const X86Subtarget &Subtarget,
7658 SelectionDAG &DAG) {
7659 // VBROADCAST requires AVX.
7660 // TODO: Splats could be generated for non-AVX CPUs using SSE
7661 // instructions, but there's less potential gain for only 128-bit vectors.
7662 if (!Subtarget.hasAVX())
7663 return SDValue();
7664
7665 MVT VT = BVOp->getSimpleValueType(0);
7666 unsigned NumElts = VT.getVectorNumElements();
7667 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7668 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7669 "Unsupported vector type for broadcast.");
7670
7671 // See if the build vector is a repeating sequence of scalars (inc. splat).
7672 SDValue Ld;
7673 BitVector UndefElements;
7674 SmallVector<SDValue, 16> Sequence;
7675 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7676 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7677 if (Sequence.size() == 1)
7678 Ld = Sequence[0];
7679 }
7680
7681 // Attempt to use VBROADCASTM
7682 // From this pattern:
7683 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7684 // b. t1 = (build_vector t0 t0)
7685 //
7686 // Create (VBROADCASTM v2i1 X)
7687 if (!Sequence.empty() && Subtarget.hasCDI()) {
7688 // If not a splat, are the upper sequence values zeroable?
7689 unsigned SeqLen = Sequence.size();
7690 bool UpperZeroOrUndef =
7691 SeqLen == 1 ||
7692 llvm::all_of(ArrayRef(Sequence).drop_front(),
7693 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7694 SDValue Op0 = Sequence[0];
7695 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7696 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7697 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7698 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7699 ? Op0.getOperand(0)
7700 : Op0.getOperand(0).getOperand(0);
7701 MVT MaskVT = BOperand.getSimpleValueType();
7702 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7703 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7704 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7705 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7706 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7707 unsigned Scale = 512 / VT.getSizeInBits();
7708 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7709 }
7710 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7711 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7712 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7713 return DAG.getBitcast(VT, Bcst);
7714 }
7715 }
7716 }
7717
7718 unsigned NumUndefElts = UndefElements.count();
7719 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7720 APInt SplatValue, Undef;
7721 unsigned SplatBitSize;
7722 bool HasUndef;
7723 // Check if this is a repeated constant pattern suitable for broadcasting.
7724 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7725 SplatBitSize > VT.getScalarSizeInBits() &&
7726 SplatBitSize < VT.getSizeInBits()) {
7727 // Avoid replacing with broadcast when it's a use of a shuffle
7728 // instruction to preserve the present custom lowering of shuffles.
7729 if (isFoldableUseOfShuffle(BVOp))
7730 return SDValue();
7731 // replace BUILD_VECTOR with broadcast of the repeated constants.
7732 LLVMContext *Ctx = DAG.getContext();
7733 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7734 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7735 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7736 // Load the constant scalar/subvector and broadcast it.
7737 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7738 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7739 SDValue CP = DAG.getConstantPool(C, PVT);
7740 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7741
7742 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7743 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7744 SDValue Ops[] = {DAG.getEntryNode(), CP};
7745 MachinePointerInfo MPI =
7746            MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7747        SDValue Brdcst =
7748            DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7749                                    MPI, Alignment, MachineMemOperand::MOLoad);
7750 return DAG.getBitcast(VT, Brdcst);
7751 }
7752 if (SplatBitSize > 64) {
7753 // Load the vector of constants and broadcast it.
7754 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7755 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7756 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7757 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7758 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7759 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7760 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7761 MachinePointerInfo MPI =
7762            MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7763        return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7764                                       Ops, VVT, MPI, Alignment,
7765                                       MachineMemOperand::MOLoad);
7766 }
7767 }
7768
7769 // If we are moving a scalar into a vector (Ld must be set and all elements
7770 // but 1 are undef) and that operation is not obviously supported by
7771 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7772 // That's better than general shuffling and may eliminate a load to GPR and
7773 // move from scalar to vector register.
7774 if (!Ld || NumElts - NumUndefElts != 1)
7775 return SDValue();
7776 unsigned ScalarSize = Ld.getValueSizeInBits();
7777 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7778 return SDValue();
7779 }
7780
7781 bool ConstSplatVal =
7782 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7783 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7784
7785 // TODO: Handle broadcasts of non-constant sequences.
7786
7787 // Make sure that all of the users of a non-constant load are from the
7788 // BUILD_VECTOR node.
7789 // FIXME: Is the use count needed for non-constant, non-load case?
7790 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7791 return SDValue();
7792
7793 unsigned ScalarSize = Ld.getValueSizeInBits();
7794 bool IsGE256 = (VT.getSizeInBits() >= 256);
7795
7796 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7797 // instruction to save 8 or more bytes of constant pool data.
7798 // TODO: If multiple splats are generated to load the same constant,
7799 // it may be detrimental to overall size. There needs to be a way to detect
7800 // that condition to know if this is truly a size win.
7801 bool OptForSize = DAG.shouldOptForSize();
7802
7803 // Handle broadcasting a single constant scalar from the constant pool
7804 // into a vector.
7805 // On Sandybridge (no AVX2), it is still better to load a constant vector
7806 // from the constant pool and not to broadcast it from a scalar.
7807 // But override that restriction when optimizing for size.
7808 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7809 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7810 EVT CVT = Ld.getValueType();
7811 assert(!CVT.isVector() && "Must not broadcast a vector type");
7812
7813 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7814 // For size optimization, also splat v2f64 and v2i64, and for size opt
7815 // with AVX2, also splat i8 and i16.
7816 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7817 if (ScalarSize == 32 ||
7818 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7819 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7820 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7821 const Constant *C = nullptr;
7822      if (auto *CI = dyn_cast<ConstantSDNode>(Ld))
7823        C = CI->getConstantIntValue();
7824      else if (auto *CF = dyn_cast<ConstantFPSDNode>(Ld))
7825        C = CF->getConstantFPValue();
7826
7827 assert(C && "Invalid constant type");
7828
7829 SDValue CP =
7830          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7831      Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7832
7833 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7834 SDValue Ops[] = {DAG.getEntryNode(), CP};
7835 MachinePointerInfo MPI =
7836          MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7837      return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7838 MPI, Alignment, MachineMemOperand::MOLoad);
7839 }
7840 }
7841
7842 // Handle AVX2 in-register broadcasts.
7843 if (!IsLoad && Subtarget.hasInt256() &&
7844 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7845 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7846
7847 // The scalar source must be a normal load.
7848 if (!IsLoad)
7849 return SDValue();
7850
7851 // Make sure the non-chain result is only used by this build vector.
7852 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7853 return SDValue();
7854
7855 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7856 (Subtarget.hasVLX() && ScalarSize == 64)) {
7857 auto *LN = cast<LoadSDNode>(Ld);
7858 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7859 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7860 SDValue BCast =
7861        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7862                                LN->getMemoryVT(), LN->getMemOperand());
7863 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7864 return BCast;
7865 }
7866
7867  // The integer check is needed for the 64-bit into 128-bit case, so it
7868  // doesn't match double, since there is no vbroadcastsd xmm instruction.
7869 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7870 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7871 auto *LN = cast<LoadSDNode>(Ld);
7872 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7873 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7874 SDValue BCast =
7875        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7876                                LN->getMemoryVT(), LN->getMemOperand());
7877 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7878 return BCast;
7879 }
7880
7881 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7882 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7883
7884 // Unsupported broadcast.
7885 return SDValue();
7886}
7887
7888/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7889/// underlying vector and index.
7890///
7891/// Modifies \p ExtractedFromVec to the real vector and returns the real
7892/// index.
7893static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7894 SDValue ExtIdx) {
7895 int Idx = ExtIdx->getAsZExtVal();
7896 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7897 return Idx;
7898
7899 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7900 // lowered this:
7901 // (extract_vector_elt (v8f32 %1), Constant<6>)
7902 // to:
7903 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7904 // (extract_subvector (v8f32 %0), Constant<4>),
7905 // undef)
7906 // Constant<0>)
7907 // In this case the vector is the extract_subvector expression and the index
7908 // is 2, as specified by the shuffle.
7909 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7910 SDValue ShuffleVec = SVOp->getOperand(0);
7911 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7912 assert(ShuffleVecVT.getVectorElementType() ==
7913 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7914
7915 int ShuffleIdx = SVOp->getMaskElt(Idx);
7916 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7917 ExtractedFromVec = ShuffleVec;
7918 return ShuffleIdx;
7919 }
7920 return Idx;
7921}
7922
7923static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7924                                      SelectionDAG &DAG) {
7925 MVT VT = Op.getSimpleValueType();
7926
7927 // Skip if insert_vec_elt is not supported.
7928 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7929  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7930    return SDValue();
7931
7932 unsigned NumElems = Op.getNumOperands();
7933 SDValue VecIn1;
7934 SDValue VecIn2;
7935 SmallVector<unsigned, 4> InsertIndices;
7936 SmallVector<int, 8> Mask(NumElems, -1);
7937
7938 for (unsigned i = 0; i != NumElems; ++i) {
7939 unsigned Opc = Op.getOperand(i).getOpcode();
7940
7941 if (Opc == ISD::UNDEF)
7942 continue;
7943
7944    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7945      // Quit if more than 1 element needs inserting.
7946 if (InsertIndices.size() > 1)
7947 return SDValue();
7948
7949 InsertIndices.push_back(i);
7950 continue;
7951 }
7952
7953 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7954 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7955
7956 // Quit if non-constant index.
7957 if (!isa<ConstantSDNode>(ExtIdx))
7958 return SDValue();
7959 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7960
7961 // Quit if extracted from vector of different type.
7962 if (ExtractedFromVec.getValueType() != VT)
7963 return SDValue();
7964
7965 if (!VecIn1.getNode())
7966 VecIn1 = ExtractedFromVec;
7967 else if (VecIn1 != ExtractedFromVec) {
7968 if (!VecIn2.getNode())
7969 VecIn2 = ExtractedFromVec;
7970 else if (VecIn2 != ExtractedFromVec)
7971 // Quit if more than 2 vectors to shuffle
7972 return SDValue();
7973 }
7974
7975 if (ExtractedFromVec == VecIn1)
7976 Mask[i] = Idx;
7977 else if (ExtractedFromVec == VecIn2)
7978 Mask[i] = Idx + NumElems;
7979 }
7980
7981 if (!VecIn1.getNode())
7982 return SDValue();
7983
7984 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7985 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7986
7987 for (unsigned Idx : InsertIndices)
7988 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7989 DAG.getVectorIdxConstant(Idx, DL));
7990
7991 return NV;
7992}
7993
7994// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7996 const X86Subtarget &Subtarget) {
7997 MVT VT = Op.getSimpleValueType();
7998 MVT IVT =
7999 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
8000  SmallVector<SDValue, 32> NewOps;
8001  for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
8002 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
8003 Op.getOperand(I)));
8004 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
8005 return DAG.getBitcast(VT, Res);
8006}
8007
8008// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
8009static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
8010                                     SelectionDAG &DAG,
8011 const X86Subtarget &Subtarget) {
8012
8013 MVT VT = Op.getSimpleValueType();
8014 assert((VT.getVectorElementType() == MVT::i1) &&
8015 "Unexpected type in LowerBUILD_VECTORvXi1!");
8016 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8017 ISD::isBuildVectorAllOnes(Op.getNode()))
8018 return Op;
8019
8020 uint64_t Immediate = 0;
8021 SmallVector<unsigned, 16> NonConstIdx;
8022 bool IsSplat = true;
8023 bool HasConstElts = false;
8024 int SplatIdx = -1;
8025 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8026 SDValue In = Op.getOperand(idx);
8027 if (In.isUndef())
8028 continue;
8029 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
8030 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8031 HasConstElts = true;
8032 } else {
8033 NonConstIdx.push_back(idx);
8034 }
8035 if (SplatIdx < 0)
8036 SplatIdx = idx;
8037 else if (In != Op.getOperand(SplatIdx))
8038 IsSplat = false;
8039 }
8040
8041  // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
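  // For illustration: a v16i1 splat of condition %c becomes
  //   (v16i1 (bitcast (select i8 %c, i16 -1, i16 0)))
  // so the select can be lowered with a scalar cmov.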
8042 if (IsSplat) {
8043 // The build_vector allows the scalar element to be larger than the vector
8044 // element type. We need to mask it to use as a condition unless we know
8045 // the upper bits are zero.
8046 // FIXME: Use computeKnownBits instead of checking specific opcode?
8047 SDValue Cond = Op.getOperand(SplatIdx);
8048 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8049 if (Cond.getOpcode() != ISD::SETCC)
8050 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8051 DAG.getConstant(1, dl, MVT::i8));
8052
8053 // Perform the select in the scalar domain so we can use cmov.
8054 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8055 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8056 DAG.getAllOnesConstant(dl, MVT::i32),
8057 DAG.getConstant(0, dl, MVT::i32));
8058 Select = DAG.getBitcast(MVT::v32i1, Select);
8059 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8060 } else {
8061 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8062 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8063 DAG.getAllOnesConstant(dl, ImmVT),
8064 DAG.getConstant(0, dl, ImmVT));
8065 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8066 Select = DAG.getBitcast(VecVT, Select);
8067 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8068 DAG.getVectorIdxConstant(0, dl));
8069 }
8070 }
8071
8072 // insert elements one by one
8073 SDValue DstVec;
8074 if (HasConstElts) {
8075 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8076 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8077 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8078 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8079 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8080 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8081 } else {
8082 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8083 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8084 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8085 DstVec = DAG.getBitcast(VecVT, Imm);
8086 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8087 DAG.getVectorIdxConstant(0, dl));
8088 }
8089 } else
8090 DstVec = DAG.getUNDEF(VT);
8091
8092 for (unsigned InsertIdx : NonConstIdx) {
8093 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8094 Op.getOperand(InsertIdx),
8095 DAG.getVectorIdxConstant(InsertIdx, dl));
8096 }
8097 return DstVec;
8098}
8099
8100LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
8101 switch (Opcode) {
8102 case X86ISD::PACKSS:
8103 case X86ISD::PACKUS:
8104 case X86ISD::FHADD:
8105 case X86ISD::FHSUB:
8106 case X86ISD::HADD:
8107 case X86ISD::HSUB:
8108 return true;
8109 }
8110 return false;
8111}
8112
8113/// This is a helper function of LowerToHorizontalOp().
8114/// This function checks that the build_vector \p N in input implements a
8115/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8116/// may not match the layout of an x86 256-bit horizontal instruction.
8117/// In other words, if this returns true, then some extraction/insertion will
8118/// be required to produce a valid horizontal instruction.
8119///
8120/// Parameter \p Opcode defines the kind of horizontal operation to match.
8121/// For example, if \p Opcode is equal to ISD::ADD, then this function
8122/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8123/// is equal to ISD::SUB, then this function checks if this is a horizontal
8124/// arithmetic sub.
8125///
8126/// This function only analyzes elements of \p N whose indices are
8127/// in range [BaseIdx, LastIdx).
8128///
8129/// TODO: This function was originally used to match both real and fake partial
8130/// horizontal operations, but the index-matching logic is incorrect for that.
8131/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8132/// code because it is only used for partial h-op matching now?
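/// For illustration: with \p Opcode == ISD::ADD and [BaseIdx, LastIdx)
/// covering one 128-bit half, a sequence of elements such as
///   (add (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
///   (add (extract_vector_elt A, 2), (extract_vector_elt A, 3)), ...
/// matches, with A returned through \p V0.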
8133static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8134 const SDLoc &DL, SelectionDAG &DAG,
8135 unsigned BaseIdx, unsigned LastIdx,
8136 SDValue &V0, SDValue &V1) {
8137 EVT VT = N->getValueType(0);
8138 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8139 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8140 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8141 "Invalid Vector in input!");
8142
8143 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8144 bool CanFold = true;
8145 unsigned ExpectedVExtractIdx = BaseIdx;
8146 unsigned NumElts = LastIdx - BaseIdx;
8147 V0 = DAG.getUNDEF(VT);
8148 V1 = DAG.getUNDEF(VT);
8149
8150 // Check if N implements a horizontal binop.
8151 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8152 SDValue Op = N->getOperand(i + BaseIdx);
8153
8154 // Skip UNDEFs.
8155 if (Op->isUndef()) {
8156 // Update the expected vector extract index.
8157 if (i * 2 == NumElts)
8158 ExpectedVExtractIdx = BaseIdx;
8159 ExpectedVExtractIdx += 2;
8160 continue;
8161 }
8162
8163 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8164
8165 if (!CanFold)
8166 break;
8167
8168 SDValue Op0 = Op.getOperand(0);
8169 SDValue Op1 = Op.getOperand(1);
8170
8171 // Try to match the following pattern:
8172 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8173 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8174               Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8175               Op0.getOperand(0) == Op1.getOperand(0) &&
8176               isa<ConstantSDNode>(Op0.getOperand(1)) &&
8177               isa<ConstantSDNode>(Op1.getOperand(1)));
8178 if (!CanFold)
8179 break;
8180
8181 unsigned I0 = Op0.getConstantOperandVal(1);
8182 unsigned I1 = Op1.getConstantOperandVal(1);
8183
8184 if (i * 2 < NumElts) {
8185 if (V0.isUndef()) {
8186 V0 = Op0.getOperand(0);
8187 if (V0.getValueType() != VT)
8188 return false;
8189 }
8190 } else {
8191 if (V1.isUndef()) {
8192 V1 = Op0.getOperand(0);
8193 if (V1.getValueType() != VT)
8194 return false;
8195 }
8196 if (i * 2 == NumElts)
8197 ExpectedVExtractIdx = BaseIdx;
8198 }
8199
8200 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8201 if (I0 == ExpectedVExtractIdx)
8202 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8203 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8204 // Try to match the following dag sequence:
8205 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8206 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8207 } else
8208 CanFold = false;
8209
8210 ExpectedVExtractIdx += 2;
8211 }
8212
8213 return CanFold;
8214}
8215
8216/// Emit a sequence of two 128-bit horizontal add/sub followed by
8217/// a concat_vector.
8218///
8219/// This is a helper function of LowerToHorizontalOp().
8220/// This function expects two 256-bit vectors called V0 and V1.
8221/// At first, each vector is split into two separate 128-bit vectors.
8222/// Then, the resulting 128-bit vectors are used to implement two
8223/// horizontal binary operations.
8224///
8225/// The kind of horizontal binary operation is defined by \p X86Opcode.
8226///
8227/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as inputs
8228/// to the two new horizontal binops.
8229/// When Mode is set, the first horizontal binop dag node would take as input
8230/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8231/// horizontal binop dag node would take as input the lower 128-bit of V1
8232/// and the upper 128-bit of V1.
8233/// Example:
8234/// HADD V0_LO, V0_HI
8235/// HADD V1_LO, V1_HI
8236///
8237/// Otherwise, the first horizontal binop dag node takes as input the lower
8238/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8239/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8240/// Example:
8241/// HADD V0_LO, V1_LO
8242/// HADD V0_HI, V1_HI
8243///
8244/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8245/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8246/// the upper 128-bits of the result.
8247static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8248 const SDLoc &DL, SelectionDAG &DAG,
8249 unsigned X86Opcode, bool Mode,
8250 bool isUndefLO, bool isUndefHI) {
8251 MVT VT = V0.getSimpleValueType();
8252 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8253 "Invalid nodes in input!");
8254
8255 unsigned NumElts = VT.getVectorNumElements();
8256 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8257 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8258 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8259 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8260 MVT NewVT = V0_LO.getSimpleValueType();
8261
8262 SDValue LO = DAG.getUNDEF(NewVT);
8263 SDValue HI = DAG.getUNDEF(NewVT);
8264
8265 if (Mode) {
8266 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8267 if (!isUndefLO && !V0->isUndef())
8268 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8269 if (!isUndefHI && !V1->isUndef())
8270 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8271 } else {
8272 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8273 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8274 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8275
8276 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8277 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8278 }
8279
8280 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8281}
8282
8283/// Returns true iff \p BV builds a vector with the result equivalent to
8284/// the result of ADDSUB/SUBADD operation.
8285/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8286/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8287/// \p Opnd0 and \p Opnd1.
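/// For illustration: the v4f32 build_vector
///   ((fsub (extract A,0), (extract B,0)), (fadd (extract A,1), (extract B,1)),
///    (fsub (extract A,2), (extract B,2)), (fadd (extract A,3), (extract B,3)))
/// matches ADDSUB with \p Opnd0 = A, \p Opnd1 = B and IsSubAdd == false.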
8288static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
8289                             const X86Subtarget &Subtarget, SelectionDAG &DAG,
8290 SDValue &Opnd0, SDValue &Opnd1,
8291 unsigned &NumExtracts, bool &IsSubAdd,
8292 bool &HasAllowContract) {
8293 using namespace SDPatternMatch;
8294
8295 MVT VT = BV->getSimpleValueType(0);
8296 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8297 return false;
8298
8299 unsigned NumElts = VT.getVectorNumElements();
8300 SDValue InVec0 = DAG.getUNDEF(VT);
8301 SDValue InVec1 = DAG.getUNDEF(VT);
8302
8303 NumExtracts = 0;
8304 HasAllowContract = NumElts != 0;
8305
8306 // Odd-numbered elements in the input build vector are obtained from
8307 // adding/subtracting two integer/float elements.
8308 // Even-numbered elements in the input build vector are obtained from
8309 // subtracting/adding two integer/float elements.
8310 unsigned Opc[2] = {0, 0};
8311 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8312 SDValue Op = BV->getOperand(i);
8313
8314 // Skip 'undef' values.
8315 unsigned Opcode = Op.getOpcode();
8316 if (Opcode == ISD::UNDEF)
8317 continue;
8318
8319 // Early exit if we found an unexpected opcode.
8320 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8321 return false;
8322
8323 SDValue Op0 = Op.getOperand(0);
8324 SDValue Op1 = Op.getOperand(1);
8325
8326 // Try to match the following pattern:
8327 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8328 // Early exit if we cannot match that sequence.
8329 if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
8330 !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
8331 return false;
8332
8333 // We found a valid add/sub node, make sure its the same opcode as previous
8334 // elements for this parity.
8335 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8336 return false;
8337 Opc[i % 2] = Opcode;
8338
8339 // Update InVec0 and InVec1.
8340 if (InVec0.isUndef())
8341 InVec0 = Op0.getOperand(0);
8342 if (InVec1.isUndef())
8343 InVec1 = Op1.getOperand(0);
8344
8345    // Make sure that the operands of each add/sub node always
8346    // come from the same pair of vectors.
8347 if (InVec0 != Op0.getOperand(0)) {
8348 if (Opcode == ISD::FSUB)
8349 return false;
8350
8351 // FADD is commutable. Try to commute the operands
8352 // and then test again.
8353 std::swap(Op0, Op1);
8354 if (InVec0 != Op0.getOperand(0))
8355 return false;
8356 }
8357
8358 if (InVec1 != Op1.getOperand(0))
8359 return false;
8360
8361 // Increment the number of extractions done.
8362 ++NumExtracts;
8363 HasAllowContract &= Op->getFlags().hasAllowContract();
8364 }
8365
8366 // Ensure we have found an opcode for both parities and that they are
8367 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8368 // inputs are undef.
8369 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8370 InVec0.isUndef() || InVec1.isUndef())
8371 return false;
8372
8373 IsSubAdd = Opc[0] == ISD::FADD;
8374
8375 Opnd0 = InVec0;
8376 Opnd1 = InVec1;
8377 return true;
8378}
8379
8380/// Returns true if it is possible to fold MUL and an idiom that has already been
8381/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8382/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8383/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8384///
8385/// Prior to calling this function it should be known that there is some
8386/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8387/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8388/// before replacement of such SDNode with ADDSUB operation. Thus the number
8389/// of \p Opnd0 uses is expected to be equal to 2.
8390/// For example, this function may be called for the following IR:
8391/// %AB = fmul fast <2 x double> %A, %B
8392/// %Sub = fsub fast <2 x double> %AB, %C
8393/// %Add = fadd fast <2 x double> %AB, %C
8394/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8395/// <2 x i32> <i32 0, i32 3>
8396/// There is a def for %Addsub here, which potentially can be replaced by
8397/// X86ISD::ADDSUB operation:
8398/// %Addsub = X86ISD::ADDSUB %AB, %C
8399/// and such ADDSUB can further be replaced with FMADDSUB:
8400/// %Addsub = FMADDSUB %A, %B, %C.
8401///
8402/// The main reason why this method is called before the replacement of the
8403/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8404/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8405/// FMADDSUB is.
8406static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8407 SelectionDAG &DAG, SDValue &Opnd0,
8408 SDValue &Opnd1, SDValue &Opnd2,
8409 unsigned ExpectedUses,
8410 bool AllowSubAddOrAddSubContract) {
8411 if (Opnd0.getOpcode() != ISD::FMUL ||
8412 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8413 return false;
8414
8415 // FIXME: These checks must match the similar ones in
8416 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8417 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8418 // or MUL + ADDSUB to FMADDSUB.
8419 const TargetOptions &Options = DAG.getTarget().Options;
8420 bool AllowFusion =
8421 Options.AllowFPOpFusion == FPOpFusion::Fast ||
8422 (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
8423 if (!AllowFusion)
8424 return false;
8425
8426 Opnd2 = Opnd1;
8427 Opnd1 = Opnd0.getOperand(1);
8428 Opnd0 = Opnd0.getOperand(0);
8429
8430 return true;
8431}
8432
8433/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8434 /// 'fmsubadd' operation according to X86ISD::ADDSUB or X86ISD::FMADDSUB or
8435/// X86ISD::FMSUBADD node.
8436 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8437 const SDLoc &DL,
8438 const X86Subtarget &Subtarget,
8439 SelectionDAG &DAG) {
8440 SDValue Opnd0, Opnd1;
8441 unsigned NumExtracts;
8442 bool IsSubAdd;
8443 bool HasAllowContract;
8444 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8445 HasAllowContract))
8446 return SDValue();
8447
8448 MVT VT = BV->getSimpleValueType(0);
8449
8450 // Try to generate X86ISD::FMADDSUB node here.
8451 SDValue Opnd2;
8452 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
8453 HasAllowContract)) {
8454 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8455 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8456 }
8457
8458 // We only support ADDSUB.
8459 if (IsSubAdd)
8460 return SDValue();
8461
8462 // There are no known X86 targets with 512-bit ADDSUB instructions!
8463 // Convert to blend(fsub,fadd).
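// e.g. (illustrative) for VT = v16f32 the mask built below is
// <0,17,2,19,4,21,6,23,8,25,10,27,12,29,14,31>, i.e. the FSUB result in the
// even lanes and the FADD result in the odd lanes, which matches the ADDSUB
// semantics without needing a 512-bit ADDSUB instruction.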
8464 if (VT.is512BitVector()) {
8465 SmallVector<int> Mask;
8466 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8467 Mask.push_back(I);
8468 Mask.push_back(I + E + 1);
8469 }
8470 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8471 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8472 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8473 }
8474
8475 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8476}
8477
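// Match a build_vector whose elements are binops of adjacent extracted
// elements, e.g. (illustrative, v4f32):
//   t0 = fadd (extractelt A, 0), (extractelt A, 1)
//   t1 = fadd (extractelt A, 2), (extractelt A, 3)
//   t2 = fadd (extractelt B, 0), (extractelt B, 1)
//   t3 = fadd (extractelt B, 2), (extractelt B, 3)
//   build_vector t0, t1, t2, t3 --> X86ISD::FHADD A, B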
8478 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8479 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8480 // Initialize outputs to known values.
8481 MVT VT = BV->getSimpleValueType(0);
8482 HOpcode = ISD::DELETED_NODE;
8483 V0 = DAG.getUNDEF(VT);
8484 V1 = DAG.getUNDEF(VT);
8485
8486 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8487 // half of the result is calculated independently from the 128-bit halves of
8488 // the inputs, so that makes the index-checking logic below more complicated.
8489 unsigned NumElts = VT.getVectorNumElements();
8490 unsigned GenericOpcode = ISD::DELETED_NODE;
8491 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8492 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8493 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8494 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8495 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8496 // Ignore undef elements.
8497 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8498 if (Op.isUndef())
8499 continue;
8500
8501 // If there's an opcode mismatch, we're done.
8502 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8503 return false;
8504
8505 // Initialize horizontal opcode.
8506 if (HOpcode == ISD::DELETED_NODE) {
8507 GenericOpcode = Op.getOpcode();
8508 switch (GenericOpcode) {
8509 // clang-format off
8510 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8511 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8512 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8513 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8514 default: return false;
8515 // clang-format on
8516 }
8517 }
8518
8519 SDValue Op0 = Op.getOperand(0);
8520 SDValue Op1 = Op.getOperand(1);
8521 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8522 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8523 Op0.getOperand(0) != Op1.getOperand(0) ||
8524 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8525 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8526 return false;
8527
8528 // The source vector is chosen based on which 64-bit half of the
8529 // destination vector is being calculated.
8530 if (j < NumEltsIn64Bits) {
8531 if (V0.isUndef())
8532 V0 = Op0.getOperand(0);
8533 } else {
8534 if (V1.isUndef())
8535 V1 = Op0.getOperand(0);
8536 }
8537
8538 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8539 if (SourceVec != Op0.getOperand(0))
8540 return false;
8541
8542 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8543 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8544 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8545 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8546 (j % NumEltsIn64Bits) * 2;
8547 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8548 continue;
8549
8550 // If this is not a commutative op, this does not match.
8551 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8552 return false;
8553
8554 // Addition is commutative, so try swapping the extract indexes.
8555 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8556 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8557 continue;
8558
8559 // Extract indexes do not match horizontal requirement.
8560 return false;
8561 }
8562 }
8563 // We matched. Opcode and operands are returned by reference as arguments.
8564 return true;
8565}
8566
8567 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8568 const SDLoc &DL, SelectionDAG &DAG,
8569 unsigned HOpcode, SDValue V0, SDValue V1) {
8570 // If either input vector is not the same size as the build vector,
8571 // extract/insert the low bits to the correct size.
8572 // This is free (examples: zmm --> xmm, xmm --> ymm).
8573 MVT VT = BV->getSimpleValueType(0);
8574 unsigned Width = VT.getSizeInBits();
8575 if (V0.getValueSizeInBits() > Width)
8576 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8577 else if (V0.getValueSizeInBits() < Width)
8578 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8579
8580 if (V1.getValueSizeInBits() > Width)
8581 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8582 else if (V1.getValueSizeInBits() < Width)
8583 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8584
8585 unsigned NumElts = VT.getVectorNumElements();
8586 APInt DemandedElts = APInt::getAllOnes(NumElts);
8587 for (unsigned i = 0; i != NumElts; ++i)
8588 if (BV->getOperand(i).isUndef())
8589 DemandedElts.clearBit(i);
8590
8591 // If we don't need the upper xmm, then perform as a xmm hop.
8592 unsigned HalfNumElts = NumElts / 2;
8593 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8594 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8595 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8596 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8597 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8598 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8599 }
8600
8601 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8602}
8603
8604/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
8605 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8606 const X86Subtarget &Subtarget,
8607 SelectionDAG &DAG) {
8608 // We need at least 2 non-undef elements to make this worthwhile by default.
8609 unsigned NumNonUndefs =
8610 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8611 if (NumNonUndefs < 2)
8612 return SDValue();
8613
8614 // There are 4 sets of horizontal math operations distinguished by type:
8615 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8616 // subtarget feature. Try to match those "native" patterns first.
8617 MVT VT = BV->getSimpleValueType(0);
8618 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8619 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8620 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8621 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8622 unsigned HOpcode;
8623 SDValue V0, V1;
8624 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8625 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8626 }
8627
8628 // Try harder to match 256-bit ops by using extract/concat.
8629 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8630 return SDValue();
8631
8632 // Count the number of UNDEF operands in the input build_vector.
8633 unsigned NumElts = VT.getVectorNumElements();
8634 unsigned Half = NumElts / 2;
8635 unsigned NumUndefsLO = 0;
8636 unsigned NumUndefsHI = 0;
8637 for (unsigned i = 0, e = Half; i != e; ++i)
8638 if (BV->getOperand(i)->isUndef())
8639 NumUndefsLO++;
8640
8641 for (unsigned i = Half, e = NumElts; i != e; ++i)
8642 if (BV->getOperand(i)->isUndef())
8643 NumUndefsHI++;
8644
8645 SDValue InVec0, InVec1;
8646 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8647 SDValue InVec2, InVec3;
8648 unsigned X86Opcode;
8649 bool CanFold = true;
8650
8651 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8652 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8653 InVec3) &&
8654 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8655 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8656 X86Opcode = X86ISD::HADD;
8657 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8658 InVec1) &&
8659 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8660 InVec3) &&
8661 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8662 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8663 X86Opcode = X86ISD::HSUB;
8664 else
8665 CanFold = false;
8666
8667 if (CanFold) {
8668 // Do not try to expand this build_vector into a pair of horizontal
8669 // add/sub if we can emit a pair of scalar add/sub.
8670 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8671 return SDValue();
8672
8673 // Convert this build_vector into a pair of horizontal binops followed by
8674 // a concat vector. We must adjust the outputs from the partial horizontal
8675 // matching calls above to account for undefined vector halves.
8676 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8677 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8678 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8679 bool isUndefLO = NumUndefsLO == Half;
8680 bool isUndefHI = NumUndefsHI == Half;
8681 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8682 isUndefHI);
8683 }
8684 }
8685
8686 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8687 VT == MVT::v16i16) {
8688 unsigned X86Opcode;
8689 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8690 InVec1))
8691 X86Opcode = X86ISD::HADD;
8692 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8693 InVec1))
8694 X86Opcode = X86ISD::HSUB;
8695 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8696 InVec1))
8697 X86Opcode = X86ISD::FHADD;
8698 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8699 InVec1))
8700 X86Opcode = X86ISD::FHSUB;
8701 else
8702 return SDValue();
8703
8704 // Don't try to expand this build_vector into a pair of horizontal add/sub
8705 // if we can simply emit a pair of scalar add/sub.
8706 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8707 return SDValue();
8708
8709 // Convert this build_vector into two horizontal add/sub followed by
8710 // a concat vector.
8711 bool isUndefLO = NumUndefsLO == Half;
8712 bool isUndefHI = NumUndefsHI == Half;
8713 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8714 isUndefLO, isUndefHI);
8715 }
8716
8717 return SDValue();
8718}
8719
8720static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8721 SelectionDAG &DAG);
8722
8723/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8724/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8725/// just apply the bit to the vectors.
8726 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
8727/// from this, but enough scalar bit operations are created from the later
8728/// legalization + scalarization stages to need basic support.
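/// For example (illustrative):
///   build_vector (shl %a, 5), (shl %b, 5), (shl %c, 5), (shl %d, 5)
///     --> shl (build_vector %a, %b, %c, %d), (build_vector 5, 5, 5, 5)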
8729 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8730 const X86Subtarget &Subtarget,
8731 SelectionDAG &DAG) {
8732 MVT VT = Op->getSimpleValueType(0);
8733 unsigned NumElems = VT.getVectorNumElements();
8734 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8735
8736 // Check that all elements have the same opcode.
8737 // TODO: Should we allow UNDEFS and if so how many?
8738 unsigned Opcode = Op->getOperand(0).getOpcode();
8739 for (unsigned i = 1; i < NumElems; ++i)
8740 if (Opcode != Op->getOperand(i).getOpcode())
8741 return SDValue();
8742
8743 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8744 bool IsShift = false;
8745 switch (Opcode) {
8746 default:
8747 return SDValue();
8748 case ISD::SHL:
8749 case ISD::SRL:
8750 case ISD::SRA:
8751 IsShift = true;
8752 break;
8753 case ISD::AND:
8754 case ISD::XOR:
8755 case ISD::OR:
8756 // Don't do this if the buildvector is a splat - we'd replace one
8757 // constant with an entire vector.
8758 if (Op->getSplatValue())
8759 return SDValue();
8760 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8761 return SDValue();
8762 break;
8763 }
8764
8765 SmallVector<SDValue, 4> LHSElts, RHSElts;
8766 for (SDValue Elt : Op->ops()) {
8767 SDValue LHS = Elt.getOperand(0);
8768 SDValue RHS = Elt.getOperand(1);
8769
8770 // We expect the canonicalized RHS operand to be the constant.
8771 if (!isa<ConstantSDNode>(RHS))
8772 return SDValue();
8773
8774 // Extend shift amounts.
8775 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8776 if (!IsShift)
8777 return SDValue();
8778 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8779 }
8780
8781 LHSElts.push_back(LHS);
8782 RHSElts.push_back(RHS);
8783 }
8784
8785 // Limit to shifts by uniform immediates.
8786 // TODO: Only accept vXi8/vXi64 special cases?
8787 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8788 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8789 return SDValue();
8790
8791 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8792 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8793 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8794
8795 if (!IsShift)
8796 return Res;
8797
8798 // Immediately lower the shift to ensure the constant build vector doesn't
8799 // get converted to a constant pool before the shift is lowered.
8800 return LowerShift(Res, Subtarget, DAG);
8801}
8802
8803static bool isShuffleFoldableLoad(SDValue);
8804
8805/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
8806/// representing a blend.
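/// For example (illustrative, v4f64): build_vector %a, %b, %a, %b
///   --> shuffle (splat %a), (splat %b), <0, 5, 2, 7>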
8807 static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
8808 X86Subtarget const &Subtarget,
8809 SelectionDAG &DAG) {
8810 MVT VT = BVOp->getSimpleValueType(0u);
8811
8812 if (VT != MVT::v4f64)
8813 return SDValue();
8814
8815 // Collect unique operands.
8816 auto UniqueOps = SmallSet<SDValue, 16u>();
8817 for (SDValue Op : BVOp->ops()) {
8818 if (isIntOrFPConstant(Op) || Op.isUndef())
8819 return SDValue();
8820 UniqueOps.insert(Op);
8821 }
8822
8823 // Candidate BUILD_VECTOR must have 2 unique operands.
8824 if (UniqueOps.size() != 2u)
8825 return SDValue();
8826
8827 SDValue Op0 = BVOp->getOperand(0u);
8828 UniqueOps.erase(Op0);
8829 SDValue Op1 = *UniqueOps.begin();
8830
8831 if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
8832 isShuffleFoldableLoad(Op1)) {
8833 // Create shuffle mask.
8834 auto const NumElems = VT.getVectorNumElements();
8835 SmallVector<int, 16u> Mask(NumElems);
8836 for (auto I = 0u; I < NumElems; ++I) {
8837 SDValue Op = BVOp->getOperand(I);
8838 Mask[I] = Op == Op0 ? I : I + NumElems;
8839 }
8840 // Create shuffle of splats.
8841 SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
8842 SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
8843 return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
8844 }
8845
8846 return SDValue();
8847}
8848
8849/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8850/// functionality to do this, so it's all zeros, all ones, or some derivation
8851/// that is cheap to calculate.
8852 static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8853 SelectionDAG &DAG,
8854 const X86Subtarget &Subtarget) {
8855 MVT VT = Op.getSimpleValueType();
8856
8857 // Vectors containing all zeros can be matched by pxor and xorps.
8858 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8859 return Op;
8860
8861 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8862 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8863 // vpcmpeqd on 256-bit vectors.
8864 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8865 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8866 return Op;
8867
8868 return getOnesVector(VT, DAG, DL);
8869 }
8870
8871 return SDValue();
8872}
8873
8874/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8875/// from a vector of source values and a vector of extraction indices.
8876/// The vectors might be manipulated to match the type of the permute op.
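/// e.g. (illustrative) a v4i32 permute on an SSSE3-only target is performed
/// as a v16i8 PSHUFB after the indices are scaled by 4 (see ScaleIndices).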
8877static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8878 const SDLoc &DL, SelectionDAG &DAG,
8879 const X86Subtarget &Subtarget) {
8880 MVT ShuffleVT = VT;
8881 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8882 unsigned NumElts = VT.getVectorNumElements();
8883 unsigned SizeInBits = VT.getSizeInBits();
8884
8885 // Adjust IndicesVec to match VT size.
8886 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8887 "Illegal variable permute mask size");
8888 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8889 // Narrow/widen the indices vector to the correct size.
8890 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8891 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8892 NumElts * VT.getScalarSizeInBits());
8893 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8894 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8895 SDLoc(IndicesVec), SizeInBits);
8896 // Zero-extend the index elements within the vector.
8897 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8898 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8899 IndicesVT, IndicesVec);
8900 }
8901 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8902
8903 // Handle a SrcVec whose size doesn't match the VT size.
8904 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8905 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8906 // Handle larger SrcVec by treating it as a larger permute.
8907 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8908 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8909 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8910 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8911 Subtarget, DAG, SDLoc(IndicesVec));
8912 SDValue NewSrcVec =
8913 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8914 if (NewSrcVec)
8915 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8916 return SDValue();
8917 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8918 // Widen smaller SrcVec to match VT.
8919 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8920 } else
8921 return SDValue();
8922 }
8923
8924 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8925 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8926 EVT SrcVT = Idx.getValueType();
8927 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8928 uint64_t IndexScale = 0;
8929 uint64_t IndexOffset = 0;
8930
8931 // If we're scaling a smaller permute op, then we need to repeat the
8932 // indices, scaling and offsetting them as well.
8933 // e.g. v4i32 -> v16i8 (Scale = 4)
8934 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8935 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
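// e.g. (illustrative) a v4i32 index value of 2 becomes the v16i8 byte
// indices <8,9,10,11> after the multiply/add below (2*4+0 .. 2*4+3).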
8936 for (uint64_t i = 0; i != Scale; ++i) {
8937 IndexScale |= Scale << (i * NumDstBits);
8938 IndexOffset |= i << (i * NumDstBits);
8939 }
8940
8941 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8942 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8943 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8944 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8945 return Idx;
8946 };
8947
8948 unsigned Opcode = 0;
8949 switch (VT.SimpleTy) {
8950 default:
8951 break;
8952 case MVT::v16i8:
8953 if (Subtarget.hasSSSE3())
8954 Opcode = X86ISD::PSHUFB;
8955 break;
8956 case MVT::v8i16:
8957 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8958 Opcode = X86ISD::VPERMV;
8959 else if (Subtarget.hasSSSE3()) {
8960 Opcode = X86ISD::PSHUFB;
8961 ShuffleVT = MVT::v16i8;
8962 }
8963 break;
8964 case MVT::v4f32:
8965 case MVT::v4i32:
8966 if (Subtarget.hasAVX()) {
8967 Opcode = X86ISD::VPERMILPV;
8968 ShuffleVT = MVT::v4f32;
8969 } else if (Subtarget.hasSSSE3()) {
8970 Opcode = X86ISD::PSHUFB;
8971 ShuffleVT = MVT::v16i8;
8972 }
8973 break;
8974 case MVT::v2f64:
8975 case MVT::v2i64:
8976 if (Subtarget.hasAVX()) {
8977 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8978 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8979 Opcode = X86ISD::VPERMILPV;
8980 ShuffleVT = MVT::v2f64;
8981 } else if (Subtarget.hasSSE41()) {
8982 // SSE41 can compare v2i64 - select between indices 0 and 1.
8983 return DAG.getSelectCC(
8984 DL, IndicesVec,
8985 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8986 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8987 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8988 ISD::CondCode::SETEQ);
8989 }
8990 break;
8991 case MVT::v32i8:
8992 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8993 Opcode = X86ISD::VPERMV;
8994 else if (Subtarget.hasXOP()) {
8995 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8996 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8997 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8998 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8999 return DAG.getNode(
9000 ISD::CONCAT_VECTORS, DL, MVT::v32i8,
9001 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
9002 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
9003 } else if (Subtarget.hasAVX()) {
9004 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
9005 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
9006 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9007 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9008 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
9009 ArrayRef<SDValue> Ops) {
9010 // Permute Lo and Hi and then select based on index range.
9011 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
9012 // care about bit[7] as it's just an index vector.
9013 SDValue Idx = Ops[2];
9014 EVT VT = Idx.getValueType();
9015 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9016 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9017 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9018 ISD::CondCode::SETGT);
9019 };
9020 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9021 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9022 PSHUFBBuilder);
9023 }
9024 break;
9025 case MVT::v16i16:
9026 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9027 Opcode = X86ISD::VPERMV;
9028 else if (Subtarget.hasAVX()) {
9029 // Scale to v32i8 and perform as v32i8.
9030 IndicesVec = ScaleIndices(IndicesVec, 2);
9031 return DAG.getBitcast(
9032 VT, createVariablePermute(
9033 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9034 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9035 }
9036 break;
9037 case MVT::v8f32:
9038 case MVT::v8i32:
9039 if (Subtarget.hasAVX2())
9040 Opcode = X86ISD::VPERMV;
9041 else if (Subtarget.hasAVX()) {
9042 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9043 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9044 {0, 1, 2, 3, 0, 1, 2, 3});
9045 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9046 {4, 5, 6, 7, 4, 5, 6, 7});
9047 if (Subtarget.hasXOP())
9048 return DAG.getBitcast(
9049 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9050 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9051 // Permute Lo and Hi and then select based on index range.
9052 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9053 SDValue Res = DAG.getSelectCC(
9054 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9055 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9056 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9057 ISD::CondCode::SETGT);
9058 return DAG.getBitcast(VT, Res);
9059 }
9060 break;
9061 case MVT::v4i64:
9062 case MVT::v4f64:
9063 if (Subtarget.hasAVX512()) {
9064 if (!Subtarget.hasVLX()) {
9065 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9066 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9067 SDLoc(SrcVec));
9068 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9069 DAG, SDLoc(IndicesVec));
9070 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9071 DAG, Subtarget);
9072 return extract256BitVector(Res, 0, DAG, DL);
9073 }
9074 Opcode = X86ISD::VPERMV;
9075 } else if (Subtarget.hasAVX()) {
9076 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9077 SDValue LoLo =
9078 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9079 SDValue HiHi =
9080 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9081 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9082 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9083 if (Subtarget.hasXOP())
9084 return DAG.getBitcast(
9085 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9086 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9087 // Permute Lo and Hi and then select based on index range.
9088 // This works as VPERMILPD only uses index bit[1] to permute elements.
9089 SDValue Res = DAG.getSelectCC(
9090 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9091 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9092 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9093 ISD::CondCode::SETGT);
9094 return DAG.getBitcast(VT, Res);
9095 }
9096 break;
9097 case MVT::v64i8:
9098 if (Subtarget.hasVBMI())
9099 Opcode = X86ISD::VPERMV;
9100 break;
9101 case MVT::v32i16:
9102 if (Subtarget.hasBWI())
9103 Opcode = X86ISD::VPERMV;
9104 break;
9105 case MVT::v16f32:
9106 case MVT::v16i32:
9107 case MVT::v8f64:
9108 case MVT::v8i64:
9109 if (Subtarget.hasAVX512())
9110 Opcode = X86ISD::VPERMV;
9111 break;
9112 }
9113 if (!Opcode)
9114 return SDValue();
9115
9116 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9117 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9118 "Illegal variable permute shuffle type");
9119
9120 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9121 if (Scale > 1)
9122 IndicesVec = ScaleIndices(IndicesVec, Scale);
9123
9124 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9125 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9126
9127 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9128 SDValue Res = Opcode == X86ISD::VPERMV
9129 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9130 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9131 return DAG.getBitcast(VT, Res);
9132}
9133
9134// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9135// reasoned to be a permutation of a vector by indices in a non-constant vector.
9136// (build_vector (extract_elt V, (extract_elt I, 0)),
9137// (extract_elt V, (extract_elt I, 1)),
9138// ...
9139// ->
9140// (vpermv I, V)
9141//
9142// TODO: Handle undefs
9143// TODO: Utilize pshufb and zero mask blending to support more efficient
9144// construction of vectors with constant-0 elements.
9145static SDValue
9146 LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
9147 SelectionDAG &DAG,
9148 const X86Subtarget &Subtarget) {
9149 SDValue SrcVec, IndicesVec;
9150
9151 auto PeekThroughFreeze = [](SDValue N) {
9152 if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
9153 return N->getOperand(0);
9154 return N;
9155 };
9156 // Check for a match of the permute source vector and permute index elements.
9157 // This is done by checking that the i-th build_vector operand is of the form:
9158 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9159 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9160 SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
9161 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9162 return SDValue();
9163
9164 // If this is the first extract encountered in V, set the source vector,
9165 // otherwise verify the extract is from the previously defined source
9166 // vector.
9167 if (!SrcVec)
9168 SrcVec = Op.getOperand(0);
9169 else if (SrcVec != Op.getOperand(0))
9170 return SDValue();
9171 SDValue ExtractedIndex = Op->getOperand(1);
9172 // Peek through extends.
9173 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9174 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9175 ExtractedIndex = ExtractedIndex.getOperand(0);
9176 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9177 return SDValue();
9178
9179 // If this is the first extract from the index vector candidate, set the
9180 // indices vector, otherwise verify the extract is from the previously
9181 // defined indices vector.
9182 if (!IndicesVec)
9183 IndicesVec = ExtractedIndex.getOperand(0);
9184 else if (IndicesVec != ExtractedIndex.getOperand(0))
9185 return SDValue();
9186
9187 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9188 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9189 return SDValue();
9190 }
9191
9192 MVT VT = V.getSimpleValueType();
9193 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9194}
9195
9196SDValue
9197X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9198 SDLoc dl(Op);
9199
9200 MVT VT = Op.getSimpleValueType();
9201 MVT EltVT = VT.getVectorElementType();
9202 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9203 unsigned NumElems = Op.getNumOperands();
9204
9205 // Generate vectors for predicate vectors.
9206 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9207 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9208
9209 if (VT.getVectorElementType() == MVT::bf16 &&
9210 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9211 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9212
9213 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9214 return VectorCst;
9215
9216 unsigned EVTBits = EltVT.getSizeInBits();
9217 APInt UndefMask = APInt::getZero(NumElems);
9218 APInt FrozenUndefMask = APInt::getZero(NumElems);
9219 APInt ZeroMask = APInt::getZero(NumElems);
9220 APInt NonZeroMask = APInt::getZero(NumElems);
9221 bool IsAllConstants = true;
9222 bool OneUseFrozenUndefs = true;
9223 SmallSet<SDValue, 8> Values;
9224 unsigned NumConstants = NumElems;
9225 for (unsigned i = 0; i < NumElems; ++i) {
9226 SDValue Elt = Op.getOperand(i);
9227 if (Elt.isUndef()) {
9228 UndefMask.setBit(i);
9229 continue;
9230 }
9231 if (ISD::isFreezeUndef(Elt.getNode())) {
9232 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9233 FrozenUndefMask.setBit(i);
9234 continue;
9235 }
9236 Values.insert(Elt);
9237 if (!isIntOrFPConstant(Elt)) {
9238 IsAllConstants = false;
9239 NumConstants--;
9240 }
9241 if (X86::isZeroNode(Elt)) {
9242 ZeroMask.setBit(i);
9243 } else {
9244 NonZeroMask.setBit(i);
9245 }
9246 }
9247
9248 // All undef vector. Return an UNDEF.
9249 if (UndefMask.isAllOnes())
9250 return DAG.getUNDEF(VT);
9251
9252 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9253 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9254 return DAG.getFreeze(DAG.getUNDEF(VT));
9255
9256 // All undef/freeze(undef)/zero vector. Return a zero vector.
9257 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9258 return getZeroVector(VT, Subtarget, DAG, dl);
9259
9260 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9261 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9262 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9263 // and blend the FREEZE-UNDEF operands back in.
9264 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9265 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9266 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9267 SmallVector<int, 16> BlendMask(NumElems, -1);
9268 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9269 for (unsigned i = 0; i < NumElems; ++i) {
9270 if (UndefMask[i]) {
9271 BlendMask[i] = -1;
9272 continue;
9273 }
9274 BlendMask[i] = i;
9275 if (!FrozenUndefMask[i])
9276 Elts[i] = Op.getOperand(i);
9277 else
9278 BlendMask[i] += NumElems;
9279 }
9280 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9281 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9282 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9283 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9284 }
9285
9286 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9287
9288 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9289 // be better off lowering to a smaller build vector and padding with
9290 // undef/zero.
9291 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9292 !isFoldableUseOfShuffle(BV)) {
9293 unsigned UpperElems = NumElems / 2;
9294 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9295 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9296 if (NumUpperUndefsOrZeros >= UpperElems) {
9297 if (VT.is512BitVector() &&
9298 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9299 UpperElems = NumElems - (NumElems / 4);
9300 // If freeze(undef) is in any upper elements, force to zero.
9301 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9302 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9303 SDValue NewBV =
9304 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9305 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9306 }
9307 }
9308
9309 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9310 return AddSub;
9311 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9312 return HorizontalOp;
9313 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9314 return Broadcast;
9315 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9316 return BitOp;
9317 if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
9318 return Blend;
9319
9320 unsigned NumZero = ZeroMask.popcount();
9321 unsigned NumNonZero = NonZeroMask.popcount();
9322
9323 // If we are inserting one variable into a vector of non-zero constants, try
9324 // to avoid loading each constant element as a scalar. Load the constants as a
9325 // vector and then insert the variable scalar element. If insertion is not
9326 // supported, fall back to a shuffle to get the scalar blended with the
9327 // constants. Insertion into a zero vector is handled as a special-case
9328 // somewhere below here.
9329 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9330 FrozenUndefMask.isZero() &&
9331 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9332 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9333 // Create an all-constant vector. The variable element in the old
9334 // build vector is replaced by undef in the constant vector. Save the
9335 // variable scalar element and its index for use in the insertelement.
9336 LLVMContext &Context = *DAG.getContext();
9337 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9338 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9339 SDValue VarElt;
9340 SDValue InsIndex;
9341 for (unsigned i = 0; i != NumElems; ++i) {
9342 SDValue Elt = Op.getOperand(i);
9343 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9344 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9345 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9346 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9347 else if (!Elt.isUndef()) {
9348 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9349 "Expected one variable element in this vector");
9350 VarElt = Elt;
9351 InsIndex = DAG.getVectorIdxConstant(i, dl);
9352 }
9353 }
9354 Constant *CV = ConstantVector::get(ConstVecOps);
9355 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9356
9357 // The constants we just created may not be legal (eg, floating point). We
9358 // must lower the vector right here because we can not guarantee that we'll
9359 // legalize it before loading it. This is also why we could not just create
9360 // a new build vector here. If the build vector contains illegal constants,
9361 // it could get split back up into a series of insert elements.
9362 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9363 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9364 MachineFunction &MF = DAG.getMachineFunction();
9365 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
9366 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9367 unsigned InsertC = InsIndex->getAsZExtVal();
9368 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9369 if (InsertC < NumEltsInLow128Bits)
9370 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9371
9372 // There's no good way to insert into the high elements of a >128-bit
9373 // vector, so use shuffles to avoid an extract/insert sequence.
9374 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9375 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9376 SmallVector<int, 8> ShuffleMask;
9377 unsigned NumElts = VT.getVectorNumElements();
9378 for (unsigned i = 0; i != NumElts; ++i)
9379 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9380 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9381 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9382 }
9383
9384 // Special case for single non-zero, non-undef, element.
9385 if (NumNonZero == 1) {
9386 unsigned Idx = NonZeroMask.countr_zero();
9387 SDValue Item = Op.getOperand(Idx);
9388
9389 // If we have a constant or non-constant insertion into the low element of
9390 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9391 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9392 // depending on what the source datatype is.
9393 if (Idx == 0) {
9394 if (NumZero == 0)
9395 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9396
9397 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9398 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9399 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9400 assert((VT.is128BitVector() || VT.is256BitVector() ||
9401 VT.is512BitVector()) &&
9402 "Expected an SSE value type!");
9403 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9404 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9405 // zero vector.
9406 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9407 }
9408
9409 // We can't directly insert an i8 or i16 into a vector, so zero extend
9410 // it to i32 first.
9411 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9412 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9413 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9414 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9415 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9416 return DAG.getBitcast(VT, Item);
9417 }
9418 }
9419
9420 // Is it a vector logical left shift?
9421 if (NumElems == 2 && Idx == 1 &&
9422 X86::isZeroNode(Op.getOperand(0)) &&
9423 !X86::isZeroNode(Op.getOperand(1))) {
9424 unsigned NumBits = VT.getSizeInBits();
9425 return getVShift(true, VT,
9426 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9427 VT, Op.getOperand(1)),
9428 NumBits/2, DAG, *this, dl);
9429 }
9430
9431 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9432 return SDValue();
9433
9434 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9435 // is a non-constant being inserted into an element other than the low one,
9436 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9437 // movd/movss) to move this into the low element, then shuffle it into
9438 // place.
9439 if (EVTBits == 32) {
9440 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9441 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9442 }
9443 }
9444
9445 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9446 if (Values.size() == 1) {
9447 if (EVTBits == 32) {
9448 // Instead of a shuffle like this:
9449 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9450 // Check if it's possible to issue this instead.
9451 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
9452 unsigned Idx = NonZeroMask.countr_zero();
9453 SDValue Item = Op.getOperand(Idx);
9454 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9455 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9456 }
9457 return SDValue();
9458 }
9459
9460 // A vector full of immediates; various special cases are already
9461 // handled, so this is best done with a single constant-pool load.
9462 if (IsAllConstants)
9463 return SDValue();
9464
9465 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9466 return V;
9467
9468 // See if we can use a vector load to get all of the elements.
9469 {
9470 SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
9471 if (SDValue LD =
9472 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9473 return LD;
9474 }
9475
9476 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9477 // build_vector and broadcast it.
9478 // TODO: We could probably generalize this more.
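// e.g. (illustrative) v8f32 <a,b,a,b,a,b,a,b> is built as the v4f32
// build_vector <a,b,undef,undef>, bitcast to v2f64, broadcast to v4f64 and
// bitcast back to v8f32.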
9479 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9480 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9481 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9482 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9483 // Make sure all the even/odd operands match.
9484 for (unsigned i = 2; i != NumElems; ++i)
9485 if (Ops[i % 2] != Op.getOperand(i))
9486 return false;
9487 return true;
9488 };
9489 if (CanSplat(Op, NumElems, Ops)) {
9490 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9491 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9492 // Create a new build vector and cast to v2i64/v2f64.
9493 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9494 DAG.getBuildVector(NarrowVT, dl, Ops));
9495 // Broadcast from v2i64/v2f64 and cast to final VT.
9496 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9497 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9498 NewBV));
9499 }
9500 }
9501
9502 // For AVX-length vectors, build the individual 128-bit pieces and use
9503 // shuffles to put them in place.
9504 if (VT.getSizeInBits() > 128) {
9505 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9506
9507 // Build both the lower and upper subvector.
9508 SDValue Lower =
9509 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9510 SDValue Upper = DAG.getBuildVector(
9511 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
9512
9513 // Recreate the wider vector with the lower and upper part.
9514 return concatSubVectors(Lower, Upper, DAG, dl);
9515 }
9516
9517 // Let legalizer expand 2-wide build_vectors.
9518 if (EVTBits == 64) {
9519 if (NumNonZero == 1) {
9520 // One half is zero or undef.
9521 unsigned Idx = NonZeroMask.countr_zero();
9522 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9523 Op.getOperand(Idx));
9524 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9525 }
9526 return SDValue();
9527 }
9528
9529 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9530 if (EVTBits == 8 && NumElems == 16)
9531 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9532 NumZero, DAG, Subtarget))
9533 return V;
9534
9535 if (EltVT == MVT::i16 && NumElems == 8)
9536 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9537 NumZero, DAG, Subtarget))
9538 return V;
9539
9540 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9541 if (EVTBits == 32 && NumElems == 4)
9542 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9543 return V;
9544
9545 // If element VT is == 32 bits, turn it into a number of shuffles.
9546 if (NumElems == 4 && NumZero > 0) {
9547 SmallVector<SDValue, 8> Ops(NumElems);
9548 for (unsigned i = 0; i < 4; ++i) {
9549 bool isZero = !NonZeroMask[i];
9550 if (isZero)
9551 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9552 else
9553 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9554 }
9555
9556 for (unsigned i = 0; i < 2; ++i) {
9557 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9558 default: llvm_unreachable("Unexpected NonZero count");
9559 case 0:
9560 Ops[i] = Ops[i*2]; // Must be a zero vector.
9561 break;
9562 case 1:
9563 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9564 break;
9565 case 2:
9566 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9567 break;
9568 case 3:
9569 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9570 break;
9571 }
9572 }
9573
9574 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9575 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9576 int MaskVec[] = {
9577 Reverse1 ? 1 : 0,
9578 Reverse1 ? 0 : 1,
9579 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9580 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9581 };
9582 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9583 }
9584
9585 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9586
9587 // Check for a build vector from mostly shuffle plus few inserting.
9588 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9589 return Sh;
9590
9591 // For SSE 4.1, use insertps to put the high elements into the low element.
9592 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9593 SDValue Result;
9594 if (!Op.getOperand(0).isUndef())
9595 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9596 else
9597 Result = DAG.getUNDEF(VT);
9598
9599 for (unsigned i = 1; i < NumElems; ++i) {
9600 if (Op.getOperand(i).isUndef()) continue;
9601 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9602 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9603 }
9604 return Result;
9605 }
9606
9607 // Otherwise, expand into a number of unpckl*, start by extending each of
9608 // our (non-undef) elements to the full vector width with the element in the
9609 // bottom slot of the vector (which generates no code for SSE).
9610 SmallVector<SDValue, 8> Ops(NumElems);
9611 for (unsigned i = 0; i < NumElems; ++i) {
9612 if (!Op.getOperand(i).isUndef())
9613 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9614 else
9615 Ops[i] = DAG.getUNDEF(VT);
9616 }
9617
9618 // Next, we iteratively mix elements, e.g. for v4f32:
9619 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9620 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9621 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9622 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9623 // Generate scaled UNPCKL shuffle mask.
9624 SmallVector<int, 16> Mask;
9625 for(unsigned i = 0; i != Scale; ++i)
9626 Mask.push_back(i);
9627 for (unsigned i = 0; i != Scale; ++i)
9628 Mask.push_back(NumElems+i);
9629 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9630
9631 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9632 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9633 }
9634 return Ops[0];
9635}
9636
9637// 256-bit AVX can use the vinsertf128 instruction
9638// to create 256-bit vectors from two other 128-bit ones.
9639// TODO: Detect subvector broadcast here instead of DAG combine?
9640 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
9641 SelectionDAG &DAG,
9642 const X86Subtarget &Subtarget) {
9643 MVT ResVT = Op.getSimpleValueType();
9644 assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
9645 "Value type must be 256-/512-bit wide");
9646
9647 unsigned NumOperands = Op.getNumOperands();
9648 unsigned NumFreezeUndef = 0;
9649 unsigned NumZero = 0;
9650 unsigned NumNonZero = 0;
9651 unsigned NonZeros = 0;
9652 SmallSet<SDValue, 4> Undefs;
9653 for (unsigned i = 0; i != NumOperands; ++i) {
9654 SDValue SubVec = Op.getOperand(i);
9655 if (SubVec.isUndef())
9656 continue;
9657 if (ISD::isFreezeUndef(SubVec.getNode())) {
9658 // If the freeze(undef) has multiple uses then we must fold to zero.
9659 if (SubVec.hasOneUse()) {
9660 ++NumFreezeUndef;
9661 } else {
9662 ++NumZero;
9663 Undefs.insert(SubVec);
9664 }
9665 }
9666 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9667 ++NumZero;
9668 else {
9669 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9670 NonZeros |= 1 << i;
9671 ++NumNonZero;
9672 }
9673 }
9674
9675 // If we have more than 2 non-zeros, build each half separately.
9676 if (NumNonZero > 2) {
9677 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9678 ArrayRef<SDUse> Ops = Op->ops();
9679 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9680 Ops.slice(0, NumOperands/2));
9681 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9682 Ops.slice(NumOperands/2));
9683 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9684 }
9685
9686 // Otherwise, build it up through insert_subvectors.
9687 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9688 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9689 : DAG.getUNDEF(ResVT));
9690
9691 // Replace Undef operands with ZeroVector.
9692 for (SDValue U : Undefs)
9693 DAG.ReplaceAllUsesWith(
9694 U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
9695
9696 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9697 unsigned NumSubElems = SubVT.getVectorNumElements();
9698 for (unsigned i = 0; i != NumOperands; ++i) {
9699 if ((NonZeros & (1 << i)) == 0)
9700 continue;
9701
9702 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9703 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9704 }
9705
9706 return Vec;
9707}
9708
9709// Returns true if the given node is a type promotion (by concatenating i1
9710// zeros) of the result of a node that already zeros all upper bits of
9711// k-register.
9712// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9713 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
9714 const X86Subtarget &Subtarget,
9715 SelectionDAG & DAG) {
9716 MVT ResVT = Op.getSimpleValueType();
9717 unsigned NumOperands = Op.getNumOperands();
9718 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9719 "Unexpected number of operands in CONCAT_VECTORS");
9720
9721 uint64_t Zeros = 0;
9722 uint64_t NonZeros = 0;
9723 for (unsigned i = 0; i != NumOperands; ++i) {
9724 SDValue SubVec = Op.getOperand(i);
9725 if (SubVec.isUndef())
9726 continue;
9727 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9728 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9729 Zeros |= (uint64_t)1 << i;
9730 else
9731 NonZeros |= (uint64_t)1 << i;
9732 }
9733
9734 unsigned NumElems = ResVT.getVectorNumElements();
9735
9736 // If we are inserting non-zero vector and there are zeros in LSBs and undef
9737 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
9738 // insert_subvector will give us two kshifts.
9739 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9740 Log2_64(NonZeros) != NumOperands - 1) {
9741 unsigned Idx = Log2_64(NonZeros);
9742 SDValue SubVec = Op.getOperand(Idx);
9743 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9744 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9745 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9746 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9747 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9748 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9749 DAG.getVectorIdxConstant(0, dl));
9750 }
9751
9752 // If there are zero or one non-zeros we can handle this very simply.
9753 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9754 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9755 if (!NonZeros)
9756 return Vec;
9757 unsigned Idx = Log2_64(NonZeros);
9758 SDValue SubVec = Op.getOperand(Idx);
9759 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9760 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9761 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
9762 }
9763
9764 if (NumOperands > 2) {
9765 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9766 ArrayRef<SDUse> Ops = Op->ops();
9767 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9768 Ops.slice(0, NumOperands / 2));
9769 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9770 Ops.slice(NumOperands / 2));
9771 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9772 }
9773
9774 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9775
9776 if (ResVT.getVectorNumElements() >= 16)
9777 return Op; // The operation is legal with KUNPCK
9778
9779 SDValue Vec =
9780 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
9781 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
9782 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9783 DAG.getVectorIdxConstant(NumElems / 2, dl));
9784}
9785
9786 static SDValue LowerCONCAT_VECTORS(SDValue Op,
9787 const X86Subtarget &Subtarget,
9788 SelectionDAG &DAG) {
9789 SDLoc DL(Op);
9790 MVT VT = Op.getSimpleValueType();
9791 if (VT.getVectorElementType() == MVT::i1)
9792 return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
9793
9794 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9795 // from two other 128-bit ones.
9796 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9797 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9798 (VT.is512BitVector() &&
9799 (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
9800 return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
9801}
9802
9803//===----------------------------------------------------------------------===//
9804// Vector shuffle lowering
9805//
9806// This is an experimental code path for lowering vector shuffles on x86. It is
9807// designed to handle arbitrary vector shuffles and blends, gracefully
9808// degrading performance as necessary. It works hard to recognize idiomatic
9809// shuffles and lower them to optimal instruction patterns without leaving
9810// a framework that allows reasonably efficient handling of all vector shuffle
9811// patterns.
9812//===----------------------------------------------------------------------===//
9813
9814/// Checks whether the vector elements referenced by two shuffle masks are
9815/// equivalent.
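/// e.g. (illustrative) for (build_vector %x, %y) and (build_vector %y, %x),
/// Idx = 0 of the first and ExpectedIdx = 1 of the second are equivalent,
/// since both refer to the same scalar %x.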
9816static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9817 int Idx, int ExpectedIdx) {
9818 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9819 ExpectedIdx < MaskSize && "Out of range element index");
9820 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9821 return false;
9822
9823 EVT VT = Op.getValueType();
9824 EVT ExpectedVT = ExpectedOp.getValueType();
9825
9826 // Sources must be vectors and match the mask's element count.
9827 if (!VT.isVector() || !ExpectedVT.isVector() ||
9828 (int)VT.getVectorNumElements() != MaskSize ||
9829 (int)ExpectedVT.getVectorNumElements() != MaskSize)
9830 return false;
9831
9832 // Exact match.
9833 if (Idx == ExpectedIdx && Op == ExpectedOp)
9834 return true;
9835
9836 switch (Op.getOpcode()) {
9837 case ISD::BUILD_VECTOR:
9838 // If the values are build vectors, we can look through them to find
9839 // equivalent inputs that make the shuffles equivalent.
9840 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9841 case ISD::BITCAST: {
9842 SDValue Src = Op.getOperand(0);
9843 EVT SrcVT = Src.getValueType();
9844 if (Op == ExpectedOp && SrcVT.isVector()) {
9845 if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
9846 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
9847 return (Idx % Scale) == (ExpectedIdx % Scale) &&
9848 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9849 Idx / Scale, ExpectedIdx / Scale);
9850 }
9851 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
9852 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
9853 for (unsigned I = 0; I != Scale; ++I)
9854 if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9855 (Idx * Scale) + I,
9856 (ExpectedIdx * Scale) + I))
9857 return false;
9858 return true;
9859 }
9860 }
9861 break;
9862 }
9863 case ISD::VECTOR_SHUFFLE: {
9864 auto *SVN = cast<ShuffleVectorSDNode>(Op);
9865 return Op == ExpectedOp &&
9866 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
9867 }
9868 case X86ISD::VBROADCAST:
9869 case X86ISD::VBROADCAST_LOAD:
9870 return Op == ExpectedOp;
9871 case X86ISD::SUBV_BROADCAST_LOAD:
9872 if (Op == ExpectedOp) {
9873 auto *MemOp = cast<MemSDNode>(Op);
9874 unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
9875 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
9876 }
9877 break;
9878 case X86ISD::VPERMI: {
9879 if (Op == ExpectedOp) {
9880 SmallVector<int, 8> Mask;
9881 DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
9882 SDValue Src = Op.getOperand(0);
9883 return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
9884 Mask[ExpectedIdx]);
9885 }
9886 break;
9887 }
9888 case X86ISD::HADD:
9889 case X86ISD::HSUB:
9890 case X86ISD::FHADD:
9891 case X86ISD::FHSUB:
9892 case X86ISD::PACKSS:
9893 case X86ISD::PACKUS:
9894 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9895 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9896 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9897 int NumElts = VT.getVectorNumElements();
9898 int NumLanes = VT.getSizeInBits() / 128;
9899 int NumEltsPerLane = NumElts / NumLanes;
9900 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9901 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9902 bool SameElt =
9903 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9904 return SameLane && SameElt;
9905 }
9906 break;
9907 }
9908
9909 return false;
9910}
9911
9912/// Tiny helper function to identify a no-op mask.
9913///
9914/// This is a somewhat boring predicate function. It checks whether the mask
9915/// array input, which is assumed to be a single-input shuffle mask of the kind
9916/// used by the X86 shuffle instructions (not a fully general
9917/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9918/// in-place shuffle are 'no-op's.
9919static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9920 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9921 assert(Mask[i] >= -1 && "Out of bound mask element!");
9922 if (Mask[i] >= 0 && Mask[i] != i)
9923 return false;
9924 }
9925 return true;
9926}
9927
9928/// Test whether there are elements crossing LaneSizeInBits lanes in this
9929/// shuffle mask.
9930///
9931/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9932/// and we routinely test for these.
9933static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9934 unsigned ScalarSizeInBits,
9935 ArrayRef<int> Mask) {
9936 assert(LaneSizeInBits && ScalarSizeInBits &&
9937 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9938 "Illegal shuffle lane size");
9939 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9940 int Size = Mask.size();
9941 for (int i = 0; i < Size; ++i)
9942 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9943 return true;
9944 return false;
9945}
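// Worked example (illustrative): with 128-bit lanes and 32-bit scalars (four
// elements per lane), a v8f32 mask such as <4,5,6,7,0,1,2,3> moves elements
// across lanes and is reported as lane-crossing, while <1,0,3,2,5,4,7,6>
// keeps every element inside its own lane and is not.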
9946
9947/// Test whether there are elements crossing 128-bit lanes in this
9948/// shuffle mask.
9949static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9950 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9951}
9952
9953/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9954/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9955/// better support 'repeated mask + lane permute' style shuffles.
9956static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9957 unsigned ScalarSizeInBits,
9958 ArrayRef<int> Mask) {
9959 assert(LaneSizeInBits && ScalarSizeInBits &&
9960 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9961 "Illegal shuffle lane size");
9962 int NumElts = Mask.size();
9963 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9964 int NumLanes = NumElts / NumEltsPerLane;
9965 if (NumLanes > 1) {
9966 for (int i = 0; i != NumLanes; ++i) {
9967 int SrcLane = -1;
9968 for (int j = 0; j != NumEltsPerLane; ++j) {
9969 int M = Mask[(i * NumEltsPerLane) + j];
9970 if (M < 0)
9971 continue;
9972 int Lane = (M % NumElts) / NumEltsPerLane;
9973 if (SrcLane >= 0 && SrcLane != Lane)
9974 return true;
9975 SrcLane = Lane;
9976 }
9977 }
9978 }
9979 return false;
9980}
9981
9982/// Test whether a shuffle mask is equivalent within each sub-lane.
9983///
9984/// This checks a shuffle mask to see if it is performing the same
9985/// lane-relative shuffle in each sub-lane. This trivially implies
9986/// that it is also not lane-crossing. It may however involve a blend from the
9987/// same lane of a second vector.
9988///
9989/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9990/// non-trivial to compute in the face of undef lanes. The representation is
9991/// suitable for use with existing 128-bit shuffles as entries from the second
9992/// vector have been remapped to [LaneSize, 2*LaneSize).
9993static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9994 ArrayRef<int> Mask,
9995 SmallVectorImpl<int> &RepeatedMask) {
9996 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9997 RepeatedMask.assign(LaneSize, -1);
9998 int Size = Mask.size();
9999 for (int i = 0; i < Size; ++i) {
10000 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10001 if (Mask[i] < 0)
10002 continue;
10003 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10004 // This entry crosses lanes, so there is no way to model this shuffle.
10005 return false;
10006
10007 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10008 // Adjust second vector indices to start at LaneSize instead of Size.
10009 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10010 : Mask[i] % LaneSize + LaneSize;
10011 if (RepeatedMask[i % LaneSize] < 0)
10012 // This is the first non-undef entry in this slot of a 128-bit lane.
10013 RepeatedMask[i % LaneSize] = LocalM;
10014 else if (RepeatedMask[i % LaneSize] != LocalM)
10015 // Found a mismatch with the repeated mask.
10016 return false;
10017 }
10018 return true;
10019}
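// Worked example (illustrative): a v8i32 mask <0,9,2,11,4,13,6,15> performs
// the same lane-relative shuffle in both 128-bit lanes, so this returns true
// with RepeatedMask = <0,5,2,7> (second-vector elements are remapped into
// [LaneSize, 2*LaneSize), i.e. [4,8) here).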
10020
10021/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10022static bool
10023 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10024 SmallVectorImpl<int> &RepeatedMask) {
10025 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10026}
10027
10028static bool
10029 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10030 SmallVector<int, 32> RepeatedMask;
10031 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10032}
10033
10034/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10035static bool
10036 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10037 SmallVectorImpl<int> &RepeatedMask) {
10038 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10039}
10040
10041/// Test whether a target shuffle mask is equivalent within each sub-lane.
10042/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10043static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10044 unsigned EltSizeInBits,
10045 ArrayRef<int> Mask,
10046 SmallVectorImpl<int> &RepeatedMask) {
10047 int LaneSize = LaneSizeInBits / EltSizeInBits;
10048 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10049 int Size = Mask.size();
10050 for (int i = 0; i < Size; ++i) {
10051 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10052 if (Mask[i] == SM_SentinelUndef)
10053 continue;
10054 if (Mask[i] == SM_SentinelZero) {
10055 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10056 return false;
10057 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10058 continue;
10059 }
10060 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10061 // This entry crosses lanes, so there is no way to model this shuffle.
10062 return false;
10063
10064 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10065 // later vector indices to start at multiples of LaneSize instead of Size.
10066 int LaneM = Mask[i] / Size;
10067 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
10068 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10069 // This is the first non-undef entry in this slot of a 128-bit lane.
10070 RepeatedMask[i % LaneSize] = LocalM;
10071 else if (RepeatedMask[i % LaneSize] != LocalM)
10072 // Found a mismatch with the repeated mask.
10073 return false;
10074 }
10075 return true;
10076}
10077
10078/// Test whether a target shuffle mask is equivalent within each sub-lane.
10079/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10080static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10081 ArrayRef<int> Mask,
10082 SmallVectorImpl<int> &RepeatedMask) {
10083 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10084 Mask, RepeatedMask);
10085}
10086
10087/// Checks whether a shuffle mask is equivalent to an explicit list of
10088/// arguments.
10089///
10090/// This is a fast way to test a shuffle mask against a fixed pattern:
10091///
10092/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
10093///
10094/// It returns true if the mask is exactly as wide as ExpectedMask, and each
10095/// element of the mask is either -1 (signifying undef) or refers to a source
10096/// element equivalent to the corresponding value in ExpectedMask.
10097static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10098 SDValue V1 = SDValue(),
10099 SDValue V2 = SDValue()) {
10100 int Size = Mask.size();
10101 if (Size != (int)ExpectedMask.size())
10102 return false;
10103
10104 for (int i = 0; i < Size; ++i) {
10105 assert(Mask[i] >= -1 && "Out of bound mask element!");
10106 int MaskIdx = Mask[i];
10107 int ExpectedIdx = ExpectedMask[i];
10108 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10109 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10110 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10111 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10112 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10113 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10114 return false;
10115 }
10116 }
10117 return true;
10118}
10119
10120/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10121///
10122/// The masks must be exactly the same width.
10123///
10124/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10125/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10126///
10127/// SM_SentinelZero is accepted as a valid negative index but must match in
10128/// both, or via a known bits test.
10129static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
10130 ArrayRef<int> ExpectedMask,
10131 const SelectionDAG &DAG,
10132 SDValue V1 = SDValue(),
10133 SDValue V2 = SDValue()) {
10134 int Size = Mask.size();
10135 if (Size != (int)ExpectedMask.size())
10136 return false;
10137 assert(llvm::all_of(ExpectedMask,
10138 [Size](int M) {
10139 return M == SM_SentinelZero ||
10140 isInRange(M, 0, 2 * Size);
10141 }) &&
10142 "Illegal target shuffle mask");
10143
10144 // Check for out-of-range target shuffle mask indices.
10145 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10146 return false;
10147
10148 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10149 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
10150 !V1.getValueType().isVector()))
10151 V1 = SDValue();
10152 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
10153 !V2.getValueType().isVector()))
10154 V2 = SDValue();
10155
10156 APInt ZeroV1 = APInt::getZero(Size);
10157 APInt ZeroV2 = APInt::getZero(Size);
10158
10159 for (int i = 0; i < Size; ++i) {
10160 int MaskIdx = Mask[i];
10161 int ExpectedIdx = ExpectedMask[i];
10162 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10163 continue;
10164 // If we failed to match an expected SM_SentinelZero then early out.
10165 if (ExpectedIdx < 0)
10166 return false;
10167 if (MaskIdx == SM_SentinelZero) {
10168 // If we need this expected index to be a zero element, then update the
10169 // relevant zero mask and perform the known bits at the end to minimize
10170 // repeated computes.
10171 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10172 if (ExpectedV &&
10173 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
10174 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10175 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
10176 ZeroMask.setBit(BitIdx);
10177 continue;
10178 }
10179 }
10180 if (MaskIdx >= 0) {
10181 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10182 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10183 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10184 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10185 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10186 continue;
10187 }
10188 return false;
10189 }
10190 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
10191 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
10192}
10193
10194// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10195// instructions.
10196static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
10197 const SelectionDAG &DAG) {
10198 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10199 return false;
10200
10201 SmallVector<int, 8> Unpcklwd;
10202 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10203 /* Unary = */ false);
10204 SmallVector<int, 8> Unpckhwd;
10205 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10206 /* Unary = */ false);
10207 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
10208 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
10209 return IsUnpackwdMask;
10210}
10211
10212static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
10213 const SelectionDAG &DAG) {
10214 // Create 128-bit vector type based on mask size.
10215 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10216 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10217
10218 // We can't assume a canonical shuffle mask, so try the commuted version too.
10219 SmallVector<int, 4> CommutedMask(Mask);
10220 ShuffleVectorSDNode::commuteMask(CommutedMask);
10221
10222 // Match any of unary/binary or low/high.
10223 for (unsigned i = 0; i != 4; ++i) {
10224 SmallVector<int, 16> UnpackMask;
10225 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10226 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
10227 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
10228 return true;
10229 }
10230 return false;
10231}
10232
10233/// Return true if a shuffle mask chooses elements identically in its top and
10234/// bottom halves. For example, any splat mask has the same top and bottom
10235/// halves. If an element is undefined in only one half of the mask, the halves
10236/// are not considered identical.
10237static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
10238 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10239 unsigned HalfSize = Mask.size() / 2;
10240 for (unsigned i = 0; i != HalfSize; ++i) {
10241 if (Mask[i] != Mask[i + HalfSize])
10242 return false;
10243 }
10244 return true;
10245}
10246
10247/// Get a 4-lane 8-bit shuffle immediate for a mask.
10248///
10249/// This helper function produces an 8-bit shuffle immediate corresponding to
10250/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10251/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10252/// example.
10253///
10254/// NB: We rely heavily on "undef" masks preserving the input lane.
10255static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10256 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10257 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10258 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10259 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10260 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10261
10262 // If the mask only uses one non-undef element, then fully 'splat' it to
10263 // improve later broadcast matching.
10264 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10265 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10266
10267 int FirstElt = Mask[FirstIndex];
10268 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10269 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10270
10271 unsigned Imm = 0;
10272 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10273 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10274 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10275 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10276 return Imm;
10277}
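// For example, getV4X86ShuffleImm({1, 0, 3, 2}) yields 0b10'11'00'01 = 0xB1,
// while a single-defined-element mask such as {-1, 2, -1, -1} is fully
// splatted to 0xAA so it is more likely to match a broadcast later on.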
10278
10279static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10280 SelectionDAG &DAG) {
10281 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10282}
10283
10284// Canonicalize SHUFPD mask to improve chances of further folding.
10285// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10286static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10287 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10288 "Unexpected SHUFPD mask size");
10289 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10290 "Unexpected SHUFPD mask elements");
10291
10292 // If the mask only uses one non-undef element, then fully 'splat' it to
10293 // improve later broadcast matching.
10294 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10295 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10296 "All undef shuffle mask");
10297
10298 int FirstElt = Mask[FirstIndex];
10299 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10300 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10301 unsigned Imm = 0;
10302 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10303 Imm |= FirstElt << I;
10304 return Imm;
10305 }
10306
10307 // Attempt to keep any undef elements in place to improve chances of the
10308 // shuffle becoming a (commutative) blend.
10309 unsigned Imm = 0;
10310 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10311 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10312
10313 return Imm;
10314}
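// For example, a v4f64 mask {-1, 1, -1, 1} hits the splat path (1 is the only
// defined element and occurs more than once) and yields 0b1111, while
// {1, 0, -1, -1} keeps the trailing undefs in place and yields 0b1001.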
10315
10316static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
10317 SelectionDAG &DAG) {
10318 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10319}
10320
10321// The shuffle result has the form:
10322// 0*, a[0], 0*, a[1], ..., 0*, a[n] (n >= 0), where the a[] elements
10323// appear in ascending order. Each element of Zeroable corresponds to a
10324// particular element of Mask, as described in the
10325// computeZeroableShuffleElements function.
10326//
10327// The function returns true if such an in-order sub-mask of nonzero elements exists.
10328static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10329 ArrayRef<int> Mask, const EVT &VectorType,
10330 bool &IsZeroSideLeft) {
10331 int NextElement = -1;
10332 // Check if the Mask's nonzero elements are in increasing order.
10333 for (int i = 0, e = Mask.size(); i < e; i++) {
10334 // Checks if the mask's zeros elements are built from only zeros.
10335 assert(Mask[i] >= -1 && "Out of bound mask element!");
10336 if (Mask[i] < 0)
10337 return false;
10338 if (Zeroable[i])
10339 continue;
10340 // Find the lowest non zero element
10341 if (NextElement < 0) {
10342 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10343 IsZeroSideLeft = NextElement != 0;
10344 }
10345 // Exit if the mask's non zero elements are not in increasing order.
10346 if (NextElement != Mask[i])
10347 return false;
10348 NextElement++;
10349 }
10350 return true;
10351}
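// Worked example (illustrative): for a v4i32 shuffle with Mask = <0,5,6,1>
// where elements 1 and 2 are zeroable, the non-zero elements pick consecutive
// V1 elements (0 then 1), so this returns true with IsZeroSideLeft == false
// and the result has the form {V1[0], 0, 0, V1[1]}.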
10352
10353static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
10354 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
10355 const X86Subtarget &Subtarget,
10356 unsigned Depth = 0);
10357
10358/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10359static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10360 ArrayRef<int> Mask, SDValue V1,
10361 SDValue V2, const APInt &Zeroable,
10362 const X86Subtarget &Subtarget,
10363 SelectionDAG &DAG) {
10364 int Size = Mask.size();
10365 int LaneSize = 128 / VT.getScalarSizeInBits();
10366 const int NumBytes = VT.getSizeInBits() / 8;
10367 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10368
10369 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10370 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10371 (Subtarget.hasBWI() && VT.is512BitVector()));
10372
10373 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10374 // Sign bit set in i8 mask means zero element.
10375 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10376
10377 SDValue V;
10378 for (int i = 0; i < NumBytes; ++i) {
10379 int M = Mask[i / NumEltBytes];
10380 if (M < 0) {
10381 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10382 continue;
10383 }
10384 if (Zeroable[i / NumEltBytes]) {
10385 PSHUFBMask[i] = ZeroMask;
10386 continue;
10387 }
10388
10389 // We can only use a single input of V1 or V2.
10390 SDValue SrcV = (M >= Size ? V2 : V1);
10391 if (V && V != SrcV)
10392 return SDValue();
10393 V = SrcV;
10394 M %= Size;
10395
10396 // PSHUFB can't cross lanes, ensure this doesn't happen.
10397 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10398 return SDValue();
10399
10400 M = M % LaneSize;
10401 M = M * NumEltBytes + (i % NumEltBytes);
10402 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10403 }
10404 assert(V && "Failed to find a source input");
10405
10406 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10407 return DAG.getBitcast(
10408 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10409 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10410}
10411
10412static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10413 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10414 const SDLoc &dl);
10415
10416// X86 has dedicated shuffle that can be lowered to VEXPAND
10417static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
10418 SDValue V2, ArrayRef<int> Mask,
10419 const APInt &Zeroable,
10420 const X86Subtarget &Subtarget,
10421 SelectionDAG &DAG) {
10422 bool IsLeftZeroSide = true;
10423 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10424 IsLeftZeroSide))
10425 return SDValue();
10426 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10427 MVT IntegerType =
10428 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10429 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10430 unsigned NumElts = VT.getVectorNumElements();
10431 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10432 "Unexpected number of vector elements");
10433 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10434 Subtarget, DAG, DL);
10435 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10436 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10437 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10438}
10439
10440static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10441 unsigned &UnpackOpcode, bool IsUnary,
10442 ArrayRef<int> TargetMask, const SDLoc &DL,
10443 SelectionDAG &DAG,
10444 const X86Subtarget &Subtarget) {
10445 int NumElts = VT.getVectorNumElements();
10446
10447 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10448 for (int i = 0; i != NumElts; i += 2) {
10449 int M1 = TargetMask[i + 0];
10450 int M2 = TargetMask[i + 1];
10451 Undef1 &= (SM_SentinelUndef == M1);
10452 Undef2 &= (SM_SentinelUndef == M2);
10453 Zero1 &= isUndefOrZero(M1);
10454 Zero2 &= isUndefOrZero(M2);
10455 }
10456 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10457 "Zeroable shuffle detected");
10458
10459 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10460 SmallVector<int, 64> Unpckl, Unpckh;
10461 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10462 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10463 (IsUnary ? V1 : V2))) {
10464 UnpackOpcode = X86ISD::UNPCKL;
10465 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10466 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10467 return true;
10468 }
10469
10470 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10471 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10472 (IsUnary ? V1 : V2))) {
10473 UnpackOpcode = X86ISD::UNPCKH;
10474 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10475 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10476 return true;
10477 }
10478
10479 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
10480 if (IsUnary && (Zero1 || Zero2)) {
10481 // Don't bother if we can blend instead.
10482 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10483 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10484 return false;
10485
10486 bool MatchLo = true, MatchHi = true;
10487 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10488 int M = TargetMask[i];
10489
10490 // Ignore if the input is known to be zero or the index is undef.
10491 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10492 (M == SM_SentinelUndef))
10493 continue;
10494
10495 MatchLo &= (M == Unpckl[i]);
10496 MatchHi &= (M == Unpckh[i]);
10497 }
10498
10499 if (MatchLo || MatchHi) {
10500 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10501 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10502 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10503 return true;
10504 }
10505 }
10506
10507 // If a binary shuffle, commute and try again.
10508 if (!IsUnary) {
10509 ShuffleVectorSDNode::commuteMask(Unpckl);
10510 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10511 UnpackOpcode = X86ISD::UNPCKL;
10512 std::swap(V1, V2);
10513 return true;
10514 }
10515
10516 ShuffleVectorSDNode::commuteMask(Unpckh);
10517 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10518 UnpackOpcode = X86ISD::UNPCKH;
10519 std::swap(V1, V2);
10520 return true;
10521 }
10522 }
10523
10524 return false;
10525}
10526
10527// X86 has dedicated unpack instructions that can handle specific blend
10528// operations: UNPCKH and UNPCKL.
10529static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
10530 SDValue V2, ArrayRef<int> Mask,
10531 SelectionDAG &DAG) {
10532 SmallVector<int, 8> Unpckl;
10533 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10534 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10535 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10536
10537 SmallVector<int, 8> Unpckh;
10538 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10539 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10540 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10541
10542 // Commute and try again.
10543 ShuffleVectorSDNode::commuteMask(Unpckl);
10544 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10545 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10546
10547 ShuffleVectorSDNode::commuteMask(Unpckh);
10548 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10549 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10550
10551 return SDValue();
10552}
10553
10554/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10555/// followed by unpack 256-bit.
10556static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1,
10557 SDValue V2, ArrayRef<int> Mask,
10558 SelectionDAG &DAG) {
10559 SmallVector<int, 32> Unpckl, Unpckh;
10560 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10561 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10562
10563 unsigned UnpackOpcode;
10564 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10565 UnpackOpcode = X86ISD::UNPCKL;
10566 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10567 UnpackOpcode = X86ISD::UNPCKH;
10568 else
10569 return SDValue();
10570
10571 // This is a "natural" unpack operation (rather than the 128-bit sectored
10572 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10573 // input in order to use the x86 instruction.
10574 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10575 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10576 V1 = DAG.getBitcast(VT, V1);
10577 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10578}
10579
10580// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10581// source into the lower elements and zeroing the upper elements.
10582static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10583 ArrayRef<int> Mask, const APInt &Zeroable,
10584 const X86Subtarget &Subtarget) {
10585 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10586 return false;
10587
10588 unsigned NumElts = Mask.size();
10589 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10590 unsigned MaxScale = 64 / EltSizeInBits;
10591
10592 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10593 unsigned SrcEltBits = EltSizeInBits * Scale;
10594 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10595 continue;
10596 unsigned NumSrcElts = NumElts / Scale;
10597 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10598 continue;
10599 unsigned UpperElts = NumElts - NumSrcElts;
10600 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10601 continue;
10602 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10603 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10604 DstVT = MVT::getIntegerVT(EltSizeInBits);
10605 if ((NumSrcElts * EltSizeInBits) >= 128) {
10606 // ISD::TRUNCATE
10607 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10608 } else {
10609 // X86ISD::VTRUNC
10610 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10611 }
10612 return true;
10613 }
10614
10615 return false;
10616}
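// Worked example (assuming the AVX512BW/VLX requirements checked above): for
// VT = v16i8 with Mask = <0,2,4,6,8,10,12,14> and the upper eight elements
// zeroable, Scale = 2 matches, giving SrcVT = v8i16 and, since 8 x 8 bits is
// less than 128, DstVT = v16i8 for an X86ISD::VTRUNC lowering.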
10617
10618// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10619// element padding to the final DstVT.
10620static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10621 const X86Subtarget &Subtarget,
10622 SelectionDAG &DAG, bool ZeroUppers) {
10623 MVT SrcVT = Src.getSimpleValueType();
10624 MVT DstSVT = DstVT.getScalarType();
10625 unsigned NumDstElts = DstVT.getVectorNumElements();
10626 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10627 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10628
10629 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10630 return SDValue();
10631
10632 // Perform a direct ISD::TRUNCATE if possible.
10633 if (NumSrcElts == NumDstElts)
10634 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10635
10636 if (NumSrcElts > NumDstElts) {
10637 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10638 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10639 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10640 }
10641
10642 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10643 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10644 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10645 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10646 DstVT.getSizeInBits());
10647 }
10648
10649 // Non-VLX targets must truncate from a 512-bit type, so we need to
10650 // widen, truncate and then possibly extract the original subvector.
10651 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10652 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10653 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10654 }
10655
10656 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10657 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10658 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10659 if (DstVT != TruncVT)
10660 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10661 DstVT.getSizeInBits());
10662 return Trunc;
10663}
10664
10665// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10666//
10667// An example is the following:
10668//
10669// t0: ch = EntryToken
10670// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10671// t25: v4i32 = truncate t2
10672// t41: v8i16 = bitcast t25
10673// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10674// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10675// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10676// t18: v2i64 = bitcast t51
10677//
10678// A single vpmovdw instruction suffices; without avx512vl we need to use the
10679// zmm variant and extract the lower subvector, padding with zeroes.
10680// TODO: Merge with lowerShuffleAsVTRUNC.
10681static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10682 SDValue V2, ArrayRef<int> Mask,
10683 const APInt &Zeroable,
10684 const X86Subtarget &Subtarget,
10685 SelectionDAG &DAG) {
10686 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10687 if (!Subtarget.hasAVX512())
10688 return SDValue();
10689
10690 unsigned NumElts = VT.getVectorNumElements();
10691 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10692 unsigned MaxScale = 64 / EltSizeInBits;
10693 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10694 unsigned SrcEltBits = EltSizeInBits * Scale;
10695 unsigned NumSrcElts = NumElts / Scale;
10696 unsigned UpperElts = NumElts - NumSrcElts;
10697 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10698 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10699 continue;
10700
10701 // Attempt to find a matching source truncation, but as a fall back VLX
10702 // cases can use the VPMOV directly.
10703 SDValue Src = peekThroughBitcasts(V1);
10704 if (Src.getOpcode() == ISD::TRUNCATE &&
10705 Src.getScalarValueSizeInBits() == SrcEltBits) {
10706 Src = Src.getOperand(0);
10707 } else if (Subtarget.hasVLX()) {
10708 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10709 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10710 Src = DAG.getBitcast(SrcVT, Src);
10711 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10712 if (Scale == 2 &&
10713 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10714 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10715 return SDValue();
10716 } else
10717 return SDValue();
10718
10719 // VPMOVWB is only available with avx512bw.
10720 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10721 return SDValue();
10722
10723 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10724 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10725 }
10726
10727 return SDValue();
10728}
10729
10730// Attempt to match binary shuffle patterns as a truncate.
10731static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10732 SDValue V2, ArrayRef<int> Mask,
10733 const APInt &Zeroable,
10734 const X86Subtarget &Subtarget,
10735 SelectionDAG &DAG) {
10736 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10737 "Unexpected VTRUNC type");
10738 if (!Subtarget.hasAVX512() ||
10739 (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
10740 return SDValue();
10741
10742 unsigned NumElts = VT.getVectorNumElements();
10743 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10744 unsigned MaxScale = 64 / EltSizeInBits;
10745 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10746 // TODO: Support non-BWI VPMOVWB truncations?
10747 unsigned SrcEltBits = EltSizeInBits * Scale;
10748 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10749 continue;
10750
10751 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10752 // Bail if the V2 elements are undef.
10753 unsigned NumHalfSrcElts = NumElts / Scale;
10754 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10755 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10756 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10757 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10758 continue;
10759
10760 // The elements beyond the truncation must be undef/zero.
10761 unsigned UpperElts = NumElts - NumSrcElts;
10762 if (UpperElts > 0 &&
10763 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10764 continue;
10765 bool UndefUppers =
10766 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10767
10768 // As we're using both sources then we need to concat them together
10769 // and truncate from the double-sized src.
10770 MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
10771
10772 // For offset truncations, ensure that the concat is cheap.
10773 SDValue Src =
10774 combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
10775 if (!Src) {
10776 if (Offset)
10777 continue;
10778 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10779 }
10780
10781 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10782 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10783 Src = DAG.getBitcast(SrcVT, Src);
10784
10785 // Shift the offset'd elements into place for the truncation.
10786 // TODO: Use getTargetVShiftByConstNode.
10787 if (Offset)
10788 Src = DAG.getNode(
10789 X86ISD::VSRLI, DL, SrcVT, Src,
10790 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10791
10792 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10793 }
10794 }
10795
10796 return SDValue();
10797}
10798
10799/// Check whether a compaction lowering can be done by dropping even/odd
10800/// elements and compute how many times even/odd elements must be dropped.
10801///
10802/// This handles shuffles which take every Nth element where N is a power of
10803/// two. Example shuffle masks:
10804///
10805/// (even)
10806/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10807/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10808/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10809/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10810/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10811/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10812///
10813/// (odd)
10814/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10815/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10816///
10817/// Any of these lanes can of course be undef.
10818///
10819/// This routine only supports N <= 3.
10820/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10821/// for larger N.
10822///
10823/// \returns N above, or the number of times even/odd elements must be dropped
10824/// if there is such a number. Otherwise returns zero.
10825static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10826 bool IsSingleInput) {
10827 // The modulus for the shuffle vector entries is based on whether this is
10828 // a single input or not.
10829 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10830 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10831 "We should only be called with masks with a power-of-2 size!");
10832
10833 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10834 int Offset = MatchEven ? 0 : 1;
10835
10836 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10837 // and 2^3 simultaneously. This is because we may have ambiguity with
10838 // partially undef inputs.
10839 bool ViableForN[3] = {true, true, true};
10840
10841 for (int i = 0, e = Mask.size(); i < e; ++i) {
10842 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10843 // want.
10844 if (Mask[i] < 0)
10845 continue;
10846
10847 bool IsAnyViable = false;
10848 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10849 if (ViableForN[j]) {
10850 uint64_t N = j + 1;
10851
10852 // The shuffle mask must be equal to (i * 2^N) % M.
10853 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10854 IsAnyViable = true;
10855 else
10856 ViableForN[j] = false;
10857 }
10858 // Early exit if we exhaust the possible powers of two.
10859 if (!IsAnyViable)
10860 break;
10861 }
10862
10863 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10864 if (ViableForN[j])
10865 return j + 1;
10866
10867 // Return 0 as there is no viable power of two.
10868 return 0;
10869}
10870
10871// X86 has dedicated pack instructions that can handle specific truncation
10872// operations: PACKSS and PACKUS.
10873// Checks for compaction shuffle masks if MaxStages > 1.
10874// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10875static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10876 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10877 const SelectionDAG &DAG,
10878 const X86Subtarget &Subtarget,
10879 unsigned MaxStages = 1) {
10880 unsigned NumElts = VT.getVectorNumElements();
10881 unsigned BitSize = VT.getScalarSizeInBits();
10882 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10883 "Illegal maximum compaction");
10884
10885 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10886 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10887 unsigned NumPackedBits = NumSrcBits - BitSize;
10888 N1 = peekThroughBitcasts(N1);
10889 N2 = peekThroughBitcasts(N2);
10890 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10891 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10892 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10893 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10894 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10895 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10896 return false;
10897 if (Subtarget.hasSSE41() || BitSize == 8) {
10898 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10899 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10900 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10901 V1 = N1;
10902 V2 = N2;
10903 SrcVT = PackVT;
10904 PackOpcode = X86ISD::PACKUS;
10905 return true;
10906 }
10907 }
10908 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10909 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10910 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10911 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10912 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10913 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10914 V1 = N1;
10915 V2 = N2;
10916 SrcVT = PackVT;
10917 PackOpcode = X86ISD::PACKSS;
10918 return true;
10919 }
10920 return false;
10921 };
10922
10923 // Attempt to match against wider and wider compaction patterns.
10924 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10925 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10926 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10927
10928 // Try binary shuffle.
10929 SmallVector<int, 32> BinaryMask;
10930 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10931 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10932 if (MatchPACK(V1, V2, PackVT))
10933 return true;
10934
10935 // Try unary shuffle.
10936 SmallVector<int, 32> UnaryMask;
10937 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10938 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10939 if (MatchPACK(V1, V1, PackVT))
10940 return true;
10941 }
10942
10943 return false;
10944}
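// For example, a v16i8 mask <0,2,4,...,30> (taking the low byte of each
// 16-bit element of both inputs) matches a single pack stage: PACKUS if the
// upper byte of every 16-bit source element is known zero, or PACKSS if each
// source element has more than 8 sign bits.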
10945
10946static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1,
10947 SDValue V2, ArrayRef<int> Mask,
10948 const X86Subtarget &Subtarget,
10949 SelectionDAG &DAG) {
10950 MVT PackVT;
10951 unsigned PackOpcode;
10952 unsigned SizeBits = VT.getSizeInBits();
10953 unsigned EltBits = VT.getScalarSizeInBits();
10954 unsigned MaxStages = Log2_32(64 / EltBits);
10955 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10956 Subtarget, MaxStages))
10957 return SDValue();
10958
10959 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10960 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10961
10962 // Don't lower multi-stage packs on AVX512, truncation is better.
10963 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10964 return SDValue();
10965
10966 // Pack to the largest type possible:
10967 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10968 unsigned MaxPackBits = 16;
10969 if (CurrentEltBits > 16 &&
10970 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10971 MaxPackBits = 32;
10972
10973 // Repeatedly pack down to the target size.
10974 SDValue Res;
10975 for (unsigned i = 0; i != NumStages; ++i) {
10976 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10977 unsigned NumSrcElts = SizeBits / SrcEltBits;
10978 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10979 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10980 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10981 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10982 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10983 DAG.getBitcast(SrcVT, V2));
10984 V1 = V2 = Res;
10985 CurrentEltBits /= 2;
10986 }
10987 assert(Res && Res.getValueType() == VT &&
10988 "Failed to lower compaction shuffle");
10989 return Res;
10990}
10991
10992/// Try to emit a bitmask instruction for a shuffle.
10993///
10994/// This handles cases where we can model a blend exactly as a bitmask due to
10995/// one of the inputs being zeroable.
10996static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10997 SDValue V2, ArrayRef<int> Mask,
10998 const APInt &Zeroable,
10999 const X86Subtarget &Subtarget,
11000 SelectionDAG &DAG) {
11001 MVT MaskVT = VT;
11002 MVT EltVT = VT.getVectorElementType();
11003 SDValue Zero, AllOnes;
11004 // Use f64 if i64 isn't legal.
11005 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11006 EltVT = MVT::f64;
11007 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11008 }
11009
11010 MVT LogicVT = VT;
11011 if (EltVT.isFloatingPoint()) {
11012 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11013 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
11014 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11015 LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size());
11016 } else {
11017 Zero = DAG.getConstant(0, DL, EltVT);
11018 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11019 }
11020
11021 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11022 SDValue V;
11023 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11024 if (Zeroable[i])
11025 continue;
11026 if (Mask[i] % Size != i)
11027 return SDValue(); // Not a blend.
11028 if (!V)
11029 V = Mask[i] < Size ? V1 : V2;
11030 else if (V != (Mask[i] < Size ? V1 : V2))
11031 return SDValue(); // Can only let one input through the mask.
11032
11033 VMaskOps[i] = AllOnes;
11034 }
11035 if (!V)
11036 return SDValue(); // No non-zeroable elements!
11037
11038 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11039 VMask = DAG.getBitcast(LogicVT, VMask);
11040 V = DAG.getBitcast(LogicVT, V);
11041 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11042 return DAG.getBitcast(VT, And);
11043}
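// Worked example (illustrative): for a v4i32 shuffle with Mask = <0,5,2,7>
// where elements 1 and 3 are zeroable, every non-zeroable element is an
// in-place pick from V1, so the shuffle becomes V1 & <-1, 0, -1, 0>.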
11044
11045/// Try to emit a blend instruction for a shuffle using bit math.
11046///
11047/// This is used as a fallback approach when first class blend instructions are
11048/// unavailable. Currently it is only suitable for integer vectors, but could
11049/// be generalized for floating point vectors if desirable.
11050static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
11051 SDValue V2, ArrayRef<int> Mask,
11052 SelectionDAG &DAG) {
11053 assert(VT.isInteger() && "Only supports integer vector types!");
11054 MVT EltVT = VT.getVectorElementType();
11055 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11056 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11057 SmallVector<SDValue, 16> MaskOps;
11058 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11059 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11060 return SDValue(); // Shuffled input!
11061 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11062 }
11063
11064 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11065 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
11066}
11067
11068static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
11069 SDValue PreservedSrc,
11070 const X86Subtarget &Subtarget,
11071 SelectionDAG &DAG);
11072
11073static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
11074 MutableArrayRef<int> Mask,
11075 const APInt &Zeroable, bool &ForceV1Zero,
11076 bool &ForceV2Zero, uint64_t &BlendMask) {
11077 bool V1IsZeroOrUndef =
11078 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
11079 bool V2IsZeroOrUndef =
11080 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
11081
11082 BlendMask = 0;
11083 ForceV1Zero = false, ForceV2Zero = false;
11084 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11085
11086 int NumElts = Mask.size();
11087 int NumLanes = VT.getSizeInBits() / 128;
11088 int NumEltsPerLane = NumElts / NumLanes;
11089 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
11090
11091 // For 32/64-bit elements, if we only reference one input (plus any undefs),
11092 // then ensure the blend mask part for that lane just references that input.
11093 bool ForceWholeLaneMasks =
11094 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
11095
11096 // Attempt to generate the binary blend mask. If an input is zero then
11097 // we can use any lane.
11098 for (int Lane = 0; Lane != NumLanes; ++Lane) {
11099 // Keep track of the inputs used per lane.
11100 bool LaneV1InUse = false;
11101 bool LaneV2InUse = false;
11102 uint64_t LaneBlendMask = 0;
11103 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
11104 int Elt = (Lane * NumEltsPerLane) + LaneElt;
11105 int M = Mask[Elt];
11106 if (M == SM_SentinelUndef)
11107 continue;
11108 if (M == Elt || (0 <= M && M < NumElts &&
11109 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
11110 Mask[Elt] = Elt;
11111 LaneV1InUse = true;
11112 continue;
11113 }
11114 if (M == (Elt + NumElts) ||
11115 (NumElts <= M &&
11116 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
11117 LaneBlendMask |= 1ull << LaneElt;
11118 Mask[Elt] = Elt + NumElts;
11119 LaneV2InUse = true;
11120 continue;
11121 }
11122 if (Zeroable[Elt]) {
11123 if (V1IsZeroOrUndef) {
11124 ForceV1Zero = true;
11125 Mask[Elt] = Elt;
11126 LaneV1InUse = true;
11127 continue;
11128 }
11129 if (V2IsZeroOrUndef) {
11130 ForceV2Zero = true;
11131 LaneBlendMask |= 1ull << LaneElt;
11132 Mask[Elt] = Elt + NumElts;
11133 LaneV2InUse = true;
11134 continue;
11135 }
11136 }
11137 return false;
11138 }
11139
11140 // If we only used V2 then splat the lane blend mask to avoid any demanded
11141 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
11142 // blend mask bit).
11143 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
11144 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
11145
11146 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
11147 }
11148 return true;
11149}
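// For example, a v4i32 mask <0,5,2,7> takes elements 1 and 3 from V2, so the
// match succeeds with BlendMask = 0b1010 (bit i set means element i is taken
// from V2).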
11150
11151/// Try to emit a blend instruction for a shuffle.
11152///
11153/// This doesn't do any checks for the availability of instructions for blending
11154/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11155/// be matched in the backend with the type given. What it does check for is
11156/// that the shuffle mask is a blend, or convertible into a blend with zero.
11157static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
11158 SDValue V2, ArrayRef<int> Original,
11159 const APInt &Zeroable,
11160 const X86Subtarget &Subtarget,
11161 SelectionDAG &DAG) {
11162 uint64_t BlendMask = 0;
11163 bool ForceV1Zero = false, ForceV2Zero = false;
11164 SmallVector<int, 64> Mask(Original);
11165 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11166 BlendMask))
11167 return SDValue();
11168
11169 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11170 if (ForceV1Zero)
11171 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11172 if (ForceV2Zero)
11173 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11174
11175 unsigned NumElts = VT.getVectorNumElements();
11176
11177 switch (VT.SimpleTy) {
11178 case MVT::v4i64:
11179 case MVT::v8i32:
11180 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11181 [[fallthrough]];
11182 case MVT::v4f64:
11183 case MVT::v8f32:
11184 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11185 [[fallthrough]];
11186 case MVT::v2f64:
11187 case MVT::v2i64:
11188 case MVT::v4f32:
11189 case MVT::v4i32:
11190 case MVT::v8i16:
11191 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11192 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11193 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11194 case MVT::v16i16: {
11195 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11196 SmallVector<int, 8> RepeatedMask;
11197 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11198 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11199 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11200 BlendMask = 0;
11201 for (int i = 0; i < 8; ++i)
11202 if (RepeatedMask[i] >= 8)
11203 BlendMask |= 1ull << i;
11204 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11205 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11206 }
11207 // Use PBLENDW for lower/upper lanes and then blend lanes.
11208 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11209 // merge to VSELECT where useful.
11210 uint64_t LoMask = BlendMask & 0xFF;
11211 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11212 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11213 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11214 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11215 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11216 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11217 return DAG.getVectorShuffle(
11218 MVT::v16i16, DL, Lo, Hi,
11219 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11220 }
11221 [[fallthrough]];
11222 }
11223 case MVT::v32i8:
11224 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11225 [[fallthrough]];
11226 case MVT::v16i8: {
11227 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11228
11229 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11230 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11231 Subtarget, DAG))
11232 return Masked;
11233
11234 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11235 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11236 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11237 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11238 }
11239
11240 // If we have VPTERNLOG, we can use that as a bit blend.
11241 if (Subtarget.hasVLX())
11242 if (SDValue BitBlend =
11243 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11244 return BitBlend;
11245
11246 // Scale the blend by the number of bytes per element.
11247 int Scale = VT.getScalarSizeInBits() / 8;
11248
11249 // This form of blend is always done on bytes. Compute the byte vector
11250 // type.
11251 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11252
11253 // x86 allows load folding with blendvb from the 2nd source operand. But
11254 // we are still using LLVM select here (see comment below), so that's V1.
11255 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11256 // allow that load-folding possibility.
11257 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11258 ShuffleVectorSDNode::commuteMask(Mask);
11259 std::swap(V1, V2);
11260 }
11261
11262 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11263 // mix of LLVM's code generator and the x86 backend. We tell the code
11264 // generator that boolean values in the elements of an x86 vector register
11265 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11266 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11267 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11268 // of the element (the remaining are ignored) and 0 in that high bit would
11269 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11270 // the LLVM model for boolean values in vector elements gets the relevant
11271 // bit set, it is set backwards and over constrained relative to x86's
11272 // actual model.
11273 SmallVector<SDValue, 32> VSELECTMask;
11274 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11275 for (int j = 0; j < Scale; ++j)
11276 VSELECTMask.push_back(
11277 Mask[i] < 0
11278 ? DAG.getUNDEF(MVT::i8)
11279 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11280
11281 V1 = DAG.getBitcast(BlendVT, V1);
11282 V2 = DAG.getBitcast(BlendVT, V2);
11283 return DAG.getBitcast(
11284 VT,
11285 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11286 V1, V2));
11287 }
11288 case MVT::v16f32:
11289 case MVT::v8f64:
11290 case MVT::v8i64:
11291 case MVT::v16i32:
11292 case MVT::v32i16:
11293 case MVT::v64i8: {
11294 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11295 bool OptForSize = DAG.shouldOptForSize();
11296 if (!OptForSize) {
11297 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11298 Subtarget, DAG))
11299 return Masked;
11300 }
11301
11302 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11303 // masked move.
11304 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11305 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11306 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11307 }
11308 default:
11309 llvm_unreachable("Not a supported integer vector type!");
11310 }
11311}
11312
11313/// Try to lower as a blend of elements from two inputs followed by
11314/// a single-input permutation.
11315///
11316/// This matches the pattern where we can blend elements from two inputs and
11317/// then reduce the shuffle to a single-input permutation.
11318static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11319 SDValue V1, SDValue V2,
11320 ArrayRef<int> Mask,
11321 SelectionDAG &DAG,
11322 bool ImmBlends = false) {
11323 // We build up the blend mask while checking whether a blend is a viable way
11324 // to reduce the shuffle.
11325 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11326 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11327
11328 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11329 if (Mask[i] < 0)
11330 continue;
11331
11332 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11333
11334 if (BlendMask[Mask[i] % Size] < 0)
11335 BlendMask[Mask[i] % Size] = Mask[i];
11336 else if (BlendMask[Mask[i] % Size] != Mask[i])
11337 return SDValue(); // Can't blend in the needed input!
11338
11339 PermuteMask[i] = Mask[i] % Size;
11340 }
11341
11342 // If only immediate blends, then bail if the blend mask can't be widened to
11343 // i16.
11344 unsigned EltSize = VT.getScalarSizeInBits();
11345 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11346 return SDValue();
11347
11348 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11349 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11350}
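// Worked example (illustrative): the v4i32 mask <0,7,2,5> is not a blend on
// its own, but it can be built as the blend <0,5,2,7> followed by the
// single-input permute <0,3,2,1>, which is what this routine produces.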
11351
11352/// Try to lower as an unpack of elements from two inputs followed by
11353/// a single-input permutation.
11354///
11355/// This matches the pattern where we can unpack elements from two inputs and
11356/// then reduce the shuffle to a single-input (wider) permutation.
11357 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11358 SDValue V1, SDValue V2,
11359 ArrayRef<int> Mask,
11360 SelectionDAG &DAG) {
11361 int NumElts = Mask.size();
11362 int NumLanes = VT.getSizeInBits() / 128;
11363 int NumLaneElts = NumElts / NumLanes;
11364 int NumHalfLaneElts = NumLaneElts / 2;
11365
11366 bool MatchLo = true, MatchHi = true;
11367 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11368
11369 // Determine UNPCKL/UNPCKH type and operand order.
11370 for (int Elt = 0; Elt != NumElts; ++Elt) {
11371 int M = Mask[Elt];
11372 if (M < 0)
11373 continue;
11374
11375 // Normalize the mask value depending on whether it's V1 or V2.
11376 int NormM = M;
11377 SDValue &Op = Ops[Elt & 1];
11378 if (M < NumElts && (Op.isUndef() || Op == V1))
11379 Op = V1;
11380 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11381 Op = V2;
11382 NormM -= NumElts;
11383 } else
11384 return SDValue();
11385
11386 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11387 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11388 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11389 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11390 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11391 if (MatchLoAnyLane || MatchHiAnyLane) {
11392 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11393 "Failed to match UNPCKLO/UNPCKHI");
11394 break;
11395 }
11396 }
11397 MatchLo &= MatchLoAnyLane;
11398 MatchHi &= MatchHiAnyLane;
11399 if (!MatchLo && !MatchHi)
11400 return SDValue();
11401 }
11402 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11403
11404 // Element indices have changed after unpacking. Calculate permute mask
11405 // so that they will be put back to the position as dictated by the
11406 // original shuffle mask indices.
11407 SmallVector<int, 32> PermuteMask(NumElts, -1);
11408 for (int Elt = 0; Elt != NumElts; ++Elt) {
11409 int M = Mask[Elt];
11410 if (M < 0)
11411 continue;
11412 int NormM = M;
11413 if (NumElts <= M)
11414 NormM -= NumElts;
11415 bool IsFirstOp = M < NumElts;
11416 int BaseMaskElt =
11417 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11418 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11419 PermuteMask[Elt] = BaseMaskElt;
11420 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11421 PermuteMask[Elt] = BaseMaskElt + 1;
11422 assert(PermuteMask[Elt] != -1 &&
11423 "Input mask element is defined but failed to assign permute mask");
11424 }
11425
11426 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11427 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11428 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11429}
11430
11431/// Try to lower a shuffle as a permute of the inputs followed by an
11432/// UNPCK instruction.
11433///
11434 /// This specifically targets cases where we end up alternating between
11435/// the two inputs, and so can permute them into something that feeds a single
11436/// UNPCK instruction. Note that this routine only targets integer vectors
11437/// because for floating point vectors we have a generalized SHUFPS lowering
11438/// strategy that handles everything that doesn't *exactly* match an unpack,
11439/// making this clever lowering unnecessary.
11440 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11441 SDValue V1, SDValue V2,
11442 ArrayRef<int> Mask,
11443 const X86Subtarget &Subtarget,
11444 SelectionDAG &DAG) {
11445 int Size = Mask.size();
11446 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11447
11448 // This routine only supports 128-bit integer dual input vectors.
11449 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11450 return SDValue();
11451
11452 int NumLoInputs =
11453 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11454 int NumHiInputs =
11455 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11456
11457 bool UnpackLo = NumLoInputs >= NumHiInputs;
11458
11459 auto TryUnpack = [&](int ScalarSize, int Scale) {
11460 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11461 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11462
11463 for (int i = 0; i < Size; ++i) {
11464 if (Mask[i] < 0)
11465 continue;
11466
11467 // Each element of the unpack contains Scale elements from this mask.
11468 int UnpackIdx = i / Scale;
11469
11470 // We only handle the case where V1 feeds the first slots of the unpack.
11471 // We rely on canonicalization to ensure this is the case.
11472 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11473 return SDValue();
11474
11475 // Setup the mask for this input. The indexing is tricky as we have to
11476 // handle the unpack stride.
11477 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11478 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11479 Mask[i] % Size;
11480 }
11481
11482 // If we will have to shuffle both inputs to use the unpack, check whether
11483 // we can just unpack first and shuffle the result. If so, skip this unpack.
11484 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11485 !isNoopShuffleMask(V2Mask))
11486 return SDValue();
11487
11488 // Shuffle the inputs into place.
11489 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11490 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11491
11492 // Cast the inputs to the type we will use to unpack them.
11493 MVT UnpackVT =
11494 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11495 V1 = DAG.getBitcast(UnpackVT, V1);
11496 V2 = DAG.getBitcast(UnpackVT, V2);
11497
11498 // Unpack the inputs and cast the result back to the desired type.
11499 return DAG.getBitcast(
11500 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11501 UnpackVT, V1, V2));
11502 };
11503
11504 // We try each unpack from the largest to the smallest to try and find one
11505 // that fits this mask.
11506 int OrigScalarSize = VT.getScalarSizeInBits();
11507 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11508 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11509 return Unpack;
11510
11511 // If we're shuffling with a zero vector then we're better off not doing
11512 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
11513 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
11514 ISD::isBuildVectorAllZeros(V2.getNode()))
11515 return SDValue();
11516
11517 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11518 // initial unpack.
11519 if (NumLoInputs == 0 || NumHiInputs == 0) {
11520 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11521 "We have to have *some* inputs!");
11522 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11523
11524 // FIXME: We could consider the total complexity of the permute of each
11525 // possible unpacking. Or at the least we should consider how many
11526 // half-crossings are created.
11527 // FIXME: We could consider commuting the unpacks.
11528
11529 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11530 for (int i = 0; i < Size; ++i) {
11531 if (Mask[i] < 0)
11532 continue;
11533
11534 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11535
11536 PermMask[i] =
11537 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11538 }
11539 return DAG.getVectorShuffle(
11540 VT, DL,
11541 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11542 V1, V2),
11543 DAG.getUNDEF(VT), PermMask);
11544 }
11545
11546 return SDValue();
11547}
11548
11549/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11550/// permuting the elements of the result in place.
11551 static SDValue lowerShuffleAsByteRotateAndPermute(
11552 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11553 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11554 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11555 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11556 (VT.is512BitVector() && !Subtarget.hasBWI()))
11557 return SDValue();
11558
11559 // We don't currently support lane crossing permutes.
11560 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11561 return SDValue();
11562
11563 int Scale = VT.getScalarSizeInBits() / 8;
11564 int NumLanes = VT.getSizeInBits() / 128;
11565 int NumElts = VT.getVectorNumElements();
11566 int NumEltsPerLane = NumElts / NumLanes;
11567
11568 // Determine range of mask elts.
11569 bool Blend1 = true;
11570 bool Blend2 = true;
11571 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11572 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11573 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11574 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11575 int M = Mask[Lane + Elt];
11576 if (M < 0)
11577 continue;
11578 if (M < NumElts) {
11579 Blend1 &= (M == (Lane + Elt));
11580 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11581 M = M % NumEltsPerLane;
11582 Range1.first = std::min(Range1.first, M);
11583 Range1.second = std::max(Range1.second, M);
11584 } else {
11585 M -= NumElts;
11586 Blend2 &= (M == (Lane + Elt));
11587 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11588 M = M % NumEltsPerLane;
11589 Range2.first = std::min(Range2.first, M);
11590 Range2.second = std::max(Range2.second, M);
11591 }
11592 }
11593 }
11594
11595 // Bail if we don't need both elements.
11596 // TODO - it might be worth doing this for unary shuffles if the permute
11597 // can be widened.
11598 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11599 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11600 return SDValue();
11601
11602 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11603 return SDValue();
11604
11605 // Rotate the 2 ops so we can access both ranges, then permute the result.
11606 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11607 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11608 SDValue Rotate = DAG.getBitcast(
11609 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11610 DAG.getBitcast(ByteVT, Lo),
11611 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11612 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11613 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11614 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11615 int M = Mask[Lane + Elt];
11616 if (M < 0)
11617 continue;
11618 if (M < NumElts)
11619 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11620 else
11621 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11622 }
11623 }
11624 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11625 };
11626
11627 // Check if the ranges are small enough to rotate from either direction.
11628 if (Range2.second < Range1.first)
11629 return RotateAndPermute(V1, V2, Range1.first, 0);
11630 if (Range1.second < Range2.first)
11631 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11632 return SDValue();
11633}
11634
11635 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11636 return isUndefOrEqual(Mask, 0);
11637}
11638
11639 static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11640 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11641}
11642
11643/// Check if the Mask consists of the same element repeated multiple times.
11644 static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11645 size_t NumUndefs = 0;
11646 std::optional<int> UniqueElt;
11647 for (int Elt : Mask) {
11648 if (Elt == SM_SentinelUndef) {
11649 NumUndefs++;
11650 continue;
11651 }
11652 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11653 return false;
11654 UniqueElt = Elt;
11655 }
11656 // Make sure the element is repeated enough times by checking the number of
11657 // undefs is small.
11658 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11659}
11660
11661/// Generic routine to decompose a shuffle and blend into independent
11662/// blends and permutes.
11663///
11664/// This matches the extremely common pattern for handling combined
11665/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11666/// operations. It will try to pick the best arrangement of shuffles and
11667/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
11668 static SDValue lowerShuffleAsDecomposedShuffleMerge(
11669 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11670 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11671 int NumElts = Mask.size();
11672 int NumLanes = VT.getSizeInBits() / 128;
11673 int NumEltsPerLane = NumElts / NumLanes;
11674
11675 // Shuffle the input elements into the desired positions in V1 and V2 and
11676 // unpack/blend them together.
11677 bool IsAlternating = true;
11678 bool V1Zero = true, V2Zero = true;
11679 SmallVector<int, 32> V1Mask(NumElts, -1);
11680 SmallVector<int, 32> V2Mask(NumElts, -1);
11681 SmallVector<int, 32> FinalMask(NumElts, -1);
11682 for (int i = 0; i < NumElts; ++i) {
11683 int M = Mask[i];
11684 if (M >= 0 && M < NumElts) {
11685 V1Mask[i] = M;
11686 FinalMask[i] = i;
11687 V1Zero &= Zeroable[i];
11688 IsAlternating &= (i & 1) == 0;
11689 } else if (M >= NumElts) {
11690 V2Mask[i] = M - NumElts;
11691 FinalMask[i] = i + NumElts;
11692 V2Zero &= Zeroable[i];
11693 IsAlternating &= (i & 1) == 1;
11694 }
11695 }
11696
11697 // If we effectively demand only the 0'th element of \p Input, though not
11698 // necessarily just in the 0'th position, then broadcast said input
11699 // and change \p InputMask to be a no-op (identity) mask.
11700 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11701 &DAG](SDValue &Input,
11702 MutableArrayRef<int> InputMask) {
11703 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11704 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11705 !X86::mayFoldLoad(Input, Subtarget)))
11706 return;
11707 if (isNoopShuffleMask(InputMask))
11708 return;
11709 assert(isBroadcastShuffleMask(InputMask) &&
11710 "Expected to demand only the 0'th element.");
11711 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11712 for (auto I : enumerate(InputMask)) {
11713 int &InputMaskElt = I.value();
11714 if (InputMaskElt >= 0)
11715 InputMaskElt = I.index();
11716 }
11717 };
11718
11719 // Currently, we may need to produce one shuffle per input, and blend results.
11720 // It is possible that the shuffle for one of the inputs is already a no-op.
11721 // See if we can simplify non-no-op shuffles into broadcasts,
11722 // which we consider to be strictly better than an arbitrary shuffle.
11723 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11724 isNoopOrBroadcastShuffleMask(V2Mask)) {
11725 canonicalizeBroadcastableInput(V1, V1Mask);
11726 canonicalizeBroadcastableInput(V2, V2Mask);
11727 }
11728
11729 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11730 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11731 // the shuffle may be able to fold with a load or other benefit. However, when
11732 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11733 // pre-shuffle first is a better strategy.
11734 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11735 // If we don't have blends, see if we can create a cheap unpack.
11736 if (!Subtarget.hasSSE41() && VT.is128BitVector() &&
11737 (is128BitUnpackShuffleMask(V1Mask, DAG) ||
11738 is128BitUnpackShuffleMask(V2Mask, DAG)))
11739 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11740 DL, VT, V1, V2, Mask, Subtarget, DAG))
11741 return PermUnpack;
11742
11743 // Only prefer immediate blends to unpack/rotate.
11744 if (SDValue BlendPerm =
11745 lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
11746 return BlendPerm;
11747
11748 // If either input vector provides only a single element which is repeated
11749 // multiple times, unpacking from both input vectors would generate worse
11750 // code. e.g. for
11751 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11752 // it is better to process t4 first to create a vector of t4[0], then unpack
11753 // that vector with t2.
11754 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
11755 !isSingleElementRepeatedMask(V2Mask))
11756 if (SDValue UnpackPerm =
11757 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11758 return UnpackPerm;
11759
11760 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11761 DL, VT, V1, V2, Mask, Subtarget, DAG))
11762 return RotatePerm;
11763
11764 // Unpack/rotate failed - try again with variable blends.
11765 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11766 DAG))
11767 return BlendPerm;
11768
11769 if (VT.getScalarSizeInBits() >= 32)
11770 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11771 DL, VT, V1, V2, Mask, Subtarget, DAG))
11772 return PermUnpack;
11773 }
11774
11775 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11776 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11777 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11778 // than half the elements coming from each source.
11779 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11780 V1Mask.assign(NumElts, -1);
11781 V2Mask.assign(NumElts, -1);
11782 FinalMask.assign(NumElts, -1);
11783 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11784 for (int j = 0; j != NumEltsPerLane; ++j) {
11785 int M = Mask[i + j];
11786 if (M >= 0 && M < NumElts) {
11787 V1Mask[i + (j / 2)] = M;
11788 FinalMask[i + j] = i + (j / 2);
11789 } else if (M >= NumElts) {
11790 V2Mask[i + (j / 2)] = M - NumElts;
11791 FinalMask[i + j] = i + (j / 2) + NumElts;
11792 }
11793 }
11794 }
11795
11796 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11797 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11798 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11799}
11800
11801static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11802 const X86Subtarget &Subtarget,
11803 ArrayRef<int> Mask) {
11804 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11805 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11806
11807 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11808 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11809 int MaxSubElts = 64 / EltSizeInBits;
11810 unsigned RotateAmt, NumSubElts;
11811 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11812 MaxSubElts, NumSubElts, RotateAmt))
11813 return -1;
11814 unsigned NumElts = Mask.size();
11815 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11816 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11817 return RotateAmt;
11818}
11819
11820/// Lower shuffle using X86ISD::VROTLI rotations.
11821 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11822 ArrayRef<int> Mask,
11823 const X86Subtarget &Subtarget,
11824 SelectionDAG &DAG) {
11825 // Only XOP + AVX512 targets have bit rotation instructions.
11826 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11827 bool IsLegal =
11828 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11829 if (!IsLegal && Subtarget.hasSSE3())
11830 return SDValue();
11831
11832 MVT RotateVT;
11833 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11834 Subtarget, Mask);
11835 if (RotateAmt < 0)
11836 return SDValue();
11837
11838 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11839 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11840 // widen to vXi16 or more then the existing lowering will be better.
11841 if (!IsLegal) {
11842 if ((RotateAmt % 16) == 0)
11843 return SDValue();
11844 // TODO: Use getTargetVShiftByConstNode.
11845 unsigned ShlAmt = RotateAmt;
11846 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11847 V1 = DAG.getBitcast(RotateVT, V1);
11848 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11849 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11850 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11851 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11852 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11853 return DAG.getBitcast(VT, Rot);
11854 }
11855
11856 SDValue Rot =
11857 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11858 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11859 return DAG.getBitcast(VT, Rot);
11860}
11861
11862/// Try to match a vector shuffle as an element rotation.
11863///
11864 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11865 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11866 ArrayRef<int> Mask) {
11867 int NumElts = Mask.size();
11868
11869 // We need to detect various ways of spelling a rotation:
11870 // [11, 12, 13, 14, 15, 0, 1, 2]
11871 // [-1, 12, 13, 14, -1, -1, 1, -1]
11872 // [-1, -1, -1, -1, -1, -1, 1, 2]
11873 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11874 // [-1, 4, 5, 6, -1, -1, 9, -1]
11875 // [-1, 4, 5, 6, -1, -1, -1, -1]
11876 int Rotation = 0;
11877 SDValue Lo, Hi;
11878 for (int i = 0; i < NumElts; ++i) {
11879 int M = Mask[i];
11880 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11881 "Unexpected mask index.");
11882 if (M < 0)
11883 continue;
11884
11885 // Determine where a rotated vector would have started.
11886 int StartIdx = i - (M % NumElts);
11887 if (StartIdx == 0)
11888 // The identity rotation isn't interesting, stop.
11889 return -1;
11890
11891 // If we found the tail of a vector the rotation must be the missing
11892 // front. If we found the head of a vector, it must be how much of the
11893 // head.
11894 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11895
11896 if (Rotation == 0)
11897 Rotation = CandidateRotation;
11898 else if (Rotation != CandidateRotation)
11899 // The rotations don't match, so we can't match this mask.
11900 return -1;
11901
11902 // Compute which value this mask is pointing at.
11903 SDValue MaskV = M < NumElts ? V1 : V2;
11904
11905 // Compute which of the two target values this index should be assigned
11906 // to. This reflects whether the high elements are remaining or the low
11907 // elements are remaining.
11908 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11909
11910 // Either set up this value if we've not encountered it before, or check
11911 // that it remains consistent.
11912 if (!TargetV)
11913 TargetV = MaskV;
11914 else if (TargetV != MaskV)
11915 // This may be a rotation, but it pulls from the inputs in some
11916 // unsupported interleaving.
11917 return -1;
11918 }
11919
11920 // Check that we successfully analyzed the mask, and normalize the results.
11921 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11922 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11923 if (!Lo)
11924 Lo = Hi;
11925 else if (!Hi)
11926 Hi = Lo;
11927
11928 V1 = Lo;
11929 V2 = Hi;
11930
11931 return Rotation;
11932}
11933
11934/// Try to lower a vector shuffle as a byte rotation.
11935///
11936/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11937/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11938/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11939 /// try to generically lower a vector shuffle through such a pattern. It
11940/// does not check for the profitability of lowering either as PALIGNR or
11941/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11942/// This matches shuffle vectors that look like:
11943///
11944/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11945///
11946/// Essentially it concatenates V1 and V2, shifts right by some number of
11947/// elements, and takes the low elements as the result. Note that while this is
11948/// specified as a *right shift* because x86 is little-endian, it is a *left
11949/// rotate* of the vector lanes.
11950 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11951 ArrayRef<int> Mask) {
11952 // Don't accept any shuffles with zero elements.
11953 if (isAnyZero(Mask))
11954 return -1;
11955
11956 // PALIGNR works on 128-bit lanes.
11957 SmallVector<int, 16> RepeatedMask;
11958 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11959 return -1;
11960
11961 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11962 if (Rotation <= 0)
11963 return -1;
11964
11965 // PALIGNR rotates bytes, so we need to scale the
11966 // rotation based on how many bytes are in the vector lane.
11967 int NumElts = RepeatedMask.size();
11968 int Scale = 16 / NumElts;
11969 return Rotation * Scale;
11970}
11971
11972 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11973 SDValue V2, ArrayRef<int> Mask,
11974 const X86Subtarget &Subtarget,
11975 SelectionDAG &DAG) {
11976 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11977
11978 SDValue Lo = V1, Hi = V2;
11979 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11980 if (ByteRotation <= 0)
11981 return SDValue();
11982
11983 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11984 // PSLLDQ/PSRLDQ.
11985 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11986 Lo = DAG.getBitcast(ByteVT, Lo);
11987 Hi = DAG.getBitcast(ByteVT, Hi);
11988
11989 // SSSE3 targets can use the palignr instruction.
11990 if (Subtarget.hasSSSE3()) {
11991 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11992 "512-bit PALIGNR requires BWI instructions");
11993 return DAG.getBitcast(
11994 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11995 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11996 }
11997
11998 assert(VT.is128BitVector() &&
11999 "Rotate-based lowering only supports 128-bit lowering!");
12000 assert(Mask.size() <= 16 &&
12001 "Can shuffle at most 16 bytes in a 128-bit vector!");
12002 assert(ByteVT == MVT::v16i8 &&
12003 "SSE2 rotate lowering only needed for v16i8!");
12004
12005 // Default SSE2 implementation
12006 int LoByteShift = 16 - ByteRotation;
12007 int HiByteShift = ByteRotation;
12008
12009 SDValue LoShift =
12010 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12011 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12012 SDValue HiShift =
12013 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12014 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12015 return DAG.getBitcast(VT,
12016 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12017}
12018
12019/// Try to lower a vector shuffle as a dword/qword rotation.
12020///
12021 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
12022 /// rotation of the concatenation of two vectors; this routine will
12023 /// try to generically lower a vector shuffle through such a pattern.
12024///
12025/// Essentially it concatenates V1 and V2, shifts right by some number of
12026/// elements, and takes the low elements as the result. Note that while this is
12027/// specified as a *right shift* because x86 is little-endian, it is a *left
12028/// rotate* of the vector lanes.
12029 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12030 SDValue V2, ArrayRef<int> Mask,
12031 const APInt &Zeroable,
12032 const X86Subtarget &Subtarget,
12033 SelectionDAG &DAG) {
12034 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12035 "Only 32-bit and 64-bit elements are supported!");
12036
12037 // 128/256-bit vectors are only supported with VLX.
12038 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12039 && "VLX required for 128/256-bit vectors");
12040
12041 SDValue Lo = V1, Hi = V2;
12042 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12043 if (0 < Rotation)
12044 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12045 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12046
12047 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
12048 // TODO: Pull this out as a matchShuffleAsElementShift helper?
12049 // TODO: We can probably make this more aggressive and use shift-pairs like
12050 // lowerShuffleAsByteShiftMask.
12051 unsigned NumElts = Mask.size();
12052 unsigned ZeroLo = Zeroable.countr_one();
12053 unsigned ZeroHi = Zeroable.countl_one();
12054 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
12055 if (!ZeroLo && !ZeroHi)
12056 return SDValue();
12057
12058 if (ZeroLo) {
12059 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12060 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
12061 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
12062 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
12063 getZeroVector(VT, Subtarget, DAG, DL),
12064 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
12065 }
12066
12067 if (ZeroHi) {
12068 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
12069 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
12070 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
12071 return DAG.getNode(X86ISD::VALIGN, DL, VT,
12072 getZeroVector(VT, Subtarget, DAG, DL), Src,
12073 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
12074 }
12075
12076 return SDValue();
12077}
12078
12079/// Try to lower a vector shuffle as a byte shift sequence.
12080 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12081 SDValue V2, ArrayRef<int> Mask,
12082 const APInt &Zeroable,
12083 const X86Subtarget &Subtarget,
12084 SelectionDAG &DAG) {
12085 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12086 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12087
12088 // We need a shuffle that has zeros at one/both ends and a sequential
12089 // shuffle from one source within.
12090 unsigned ZeroLo = Zeroable.countr_one();
12091 unsigned ZeroHi = Zeroable.countl_one();
12092 if (!ZeroLo && !ZeroHi)
12093 return SDValue();
12094
12095 unsigned NumElts = Mask.size();
12096 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12097 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12098 return SDValue();
12099
12100 unsigned Scale = VT.getScalarSizeInBits() / 8;
12101 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12102 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12103 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12104 return SDValue();
12105
12106 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12107 Res = DAG.getBitcast(MVT::v16i8, Res);
12108
12109 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12110 // inner sequential set of elements, possibly offset:
12111 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12112 // 01234567 --> 4567zzzz --> zzzzz456
12113 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12114 if (ZeroLo == 0) {
12115 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12116 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12117 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12118 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12119 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12120 } else if (ZeroHi == 0) {
12121 unsigned Shift = Mask[ZeroLo] % NumElts;
12122 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12123 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12124 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12125 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12126 } else if (!Subtarget.hasSSSE3()) {
12127 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12128 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12129 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12130 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12131 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12132 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12133 Shift += Mask[ZeroLo] % NumElts;
12134 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12135 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12136 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12137 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12138 } else
12139 return SDValue();
12140
12141 return DAG.getBitcast(VT, Res);
12142}
12143
12144/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12145///
12146/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12147/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12148/// matches elements from one of the input vectors shuffled to the left or
12149/// right with zeroable elements 'shifted in'. It handles both the strictly
12150/// bit-wise element shifts and the byte shift across an entire 128-bit double
12151/// quad word lane.
12152///
12153/// PSHL : (little-endian) left bit shift.
12154/// [ zz, 0, zz, 2 ]
12155/// [ -1, 4, zz, -1 ]
12156/// PSRL : (little-endian) right bit shift.
12157/// [ 1, zz, 3, zz]
12158/// [ -1, -1, 7, zz]
12159/// PSLLDQ : (little-endian) left byte shift
12160/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12161/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12162/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12163/// PSRLDQ : (little-endian) right byte shift
12164/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12165/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12166/// [ 1, 2, -1, -1, -1, -1, zz, zz]
12167static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12168 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12169 int MaskOffset, const APInt &Zeroable,
12170 const X86Subtarget &Subtarget) {
12171 int Size = Mask.size();
12172 unsigned SizeInBits = Size * ScalarSizeInBits;
12173
12174 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12175 for (int i = 0; i < Size; i += Scale)
12176 for (int j = 0; j < Shift; ++j)
12177 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12178 return false;
12179
12180 return true;
12181 };
12182
12183 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12184 for (int i = 0; i != Size; i += Scale) {
12185 unsigned Pos = Left ? i + Shift : i;
12186 unsigned Low = Left ? i : i + Shift;
12187 unsigned Len = Scale - Shift;
12188 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12189 return -1;
12190 }
12191
12192 int ShiftEltBits = ScalarSizeInBits * Scale;
12193 bool ByteShift = ShiftEltBits > 64;
12194 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12195 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12196 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12197
12198 // Normalize the scale for byte shifts to still produce an i64 element
12199 // type.
12200 Scale = ByteShift ? Scale / 2 : Scale;
12201
12202 // We need to round trip through the appropriate type for the shift.
12203 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12204 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12205 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12206 return (int)ShiftAmt;
12207 };
12208
12209 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12210 // keep doubling the size of the integer elements up to that. We can
12211 // then shift the elements of the integer vector by whole multiples of
12212 // their width within the elements of the larger integer vector. Test each
12213 // multiple to see if we can find a match with the moved element indices
12214 // and that the shifted in elements are all zeroable.
12215 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12216 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12217 for (int Shift = 1; Shift != Scale; ++Shift)
12218 for (bool Left : {true, false})
12219 if (CheckZeros(Shift, Scale, Left)) {
12220 int ShiftAmt = MatchShift(Shift, Scale, Left);
12221 if (0 < ShiftAmt)
12222 return ShiftAmt;
12223 }
12224
12225 // no match
12226 return -1;
12227}
12228
12229 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12230 SDValue V2, ArrayRef<int> Mask,
12231 const APInt &Zeroable,
12232 const X86Subtarget &Subtarget,
12233 SelectionDAG &DAG, bool BitwiseOnly) {
12234 int Size = Mask.size();
12235 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12236
12237 MVT ShiftVT;
12238 SDValue V = V1;
12239 unsigned Opcode;
12240
12241 // Try to match shuffle against V1 shift.
12242 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12243 Mask, 0, Zeroable, Subtarget);
12244
12245 // If V1 failed, try to match shuffle against V2 shift.
12246 if (ShiftAmt < 0) {
12247 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12248 Mask, Size, Zeroable, Subtarget);
12249 V = V2;
12250 }
12251
12252 if (ShiftAmt < 0)
12253 return SDValue();
12254
12255 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
12256 return SDValue();
12257
12258 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12259 "Illegal integer vector type");
12260 V = DAG.getBitcast(ShiftVT, V);
12261 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12262 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12263 return DAG.getBitcast(VT, V);
12264}
12265
12266// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12267// Remainder of lower half result is zero and upper half is all undef.
12268static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12269 ArrayRef<int> Mask, uint64_t &BitLen,
12270 uint64_t &BitIdx, const APInt &Zeroable) {
12271 int Size = Mask.size();
12272 int HalfSize = Size / 2;
12273 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12274 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12275
12276 // Upper half must be undefined.
12277 if (!isUndefUpperHalf(Mask))
12278 return false;
12279
12280 // Determine the extraction length from the part of the
12281 // lower half that isn't zeroable.
12282 int Len = HalfSize;
12283 for (; Len > 0; --Len)
12284 if (!Zeroable[Len - 1])
12285 break;
12286 assert(Len > 0 && "Zeroable shuffle mask");
12287
12288 // Attempt to match first Len sequential elements from the lower half.
12289 SDValue Src;
12290 int Idx = -1;
12291 for (int i = 0; i != Len; ++i) {
12292 int M = Mask[i];
12293 if (M == SM_SentinelUndef)
12294 continue;
12295 SDValue &V = (M < Size ? V1 : V2);
12296 M = M % Size;
12297
12298 // The extracted elements must start at a valid index and all mask
12299 // elements must be in the lower half.
12300 if (i > M || M >= HalfSize)
12301 return false;
12302
12303 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12304 Src = V;
12305 Idx = M - i;
12306 continue;
12307 }
12308 return false;
12309 }
12310
12311 if (!Src || Idx < 0)
12312 return false;
12313
12314 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12315 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12316 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12317 V1 = Src;
12318 return true;
12319}
12320
12321// INSERTQ: Extract lowest Len elements from lower half of second source and
12322// insert over first source, starting at Idx.
12323// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12324static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12325 ArrayRef<int> Mask, uint64_t &BitLen,
12326 uint64_t &BitIdx) {
12327 int Size = Mask.size();
12328 int HalfSize = Size / 2;
12329 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12330
12331 // Upper half must be undefined.
12332 if (!isUndefUpperHalf(Mask))
12333 return false;
12334
12335 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12336 SDValue Base;
12337
12338 // Attempt to match first source from mask before insertion point.
12339 if (isUndefInRange(Mask, 0, Idx)) {
12340 /* EMPTY */
12341 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12342 Base = V1;
12343 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12344 Base = V2;
12345 } else {
12346 continue;
12347 }
12348
12349 // Extend the extraction length looking to match both the insertion of
12350 // the second source and the remaining elements of the first.
12351 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12352 SDValue Insert;
12353 int Len = Hi - Idx;
12354
12355 // Match insertion.
12356 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12357 Insert = V1;
12358 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12359 Insert = V2;
12360 } else {
12361 continue;
12362 }
12363
12364 // Match the remaining elements of the lower half.
12365 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12366 /* EMPTY */
12367 } else if ((!Base || (Base == V1)) &&
12368 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12369 Base = V1;
12370 } else if ((!Base || (Base == V2)) &&
12371 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12372 Size + Hi)) {
12373 Base = V2;
12374 } else {
12375 continue;
12376 }
12377
12378 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12379 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12380 V1 = Base;
12381 V2 = Insert;
12382 return true;
12383 }
12384 }
12385
12386 return false;
12387}
12388
12389/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12390 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12391 SDValue V2, ArrayRef<int> Mask,
12392 const APInt &Zeroable, SelectionDAG &DAG) {
12393 uint64_t BitLen, BitIdx;
12394 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12395 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12396 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12397 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12398
12399 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12400 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12401 V2 ? V2 : DAG.getUNDEF(VT),
12402 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12403 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12404
12405 return SDValue();
12406}
12407
12408/// Lower a vector shuffle as an any/signed/zero extension.
12409///
12410/// Given a specific number of elements, element bit width, and extension
12411 /// stride, produce an extension of the requested kind based on the available
12412 /// features of the subtarget. The extended elements are consecutive and
12413 /// can begin at an offset element index in the input; to
12414 /// avoid excess shuffling the offset must either be in the bottom lane
12415/// or at the start of a higher lane. All extended elements must be from
12416/// the same lane.
12417 static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT,
12418 int Scale, int Offset,
12419 unsigned ExtOpc, SDValue InputV,
12420 ArrayRef<int> Mask,
12421 const X86Subtarget &Subtarget,
12422 SelectionDAG &DAG) {
12423 assert(Scale > 1 && "Need a scale to extend.");
12424 assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
12425 int EltBits = VT.getScalarSizeInBits();
12426 int NumElements = VT.getVectorNumElements();
12427 int NumEltsPerLane = 128 / EltBits;
12428 int OffsetLane = Offset / NumEltsPerLane;
12429 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12430 "Only 8, 16, and 32 bit elements can be extended.");
12431 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12432 assert(0 <= Offset && "Extension offset must be positive.");
12433 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12434 "Extension offset must be in the first lane or start an upper lane.");
12435
12436 // Check that an index is in same lane as the base offset.
12437 auto SafeOffset = [&](int Idx) {
12438 return OffsetLane == (Idx / NumEltsPerLane);
12439 };
12440
12441 // Shift along an input so that the offset base moves to the first element.
12442 auto ShuffleOffset = [&](SDValue V) {
12443 if (!Offset)
12444 return V;
12445
12446 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12447 for (int i = 0; i * Scale < NumElements; ++i) {
12448 int SrcIdx = i + Offset;
12449 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12450 }
12451 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12452 };
12453
12454 // Found a valid a/zext mask! Try various lowering strategies based on the
12455 // input type and available ISA extensions.
12456 if (Subtarget.hasSSE41()) {
12457 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12458 // PUNPCK will catch this in a later shuffle match.
12459 if (Offset && Scale == 2 && VT.is128BitVector())
12460 return SDValue();
12461 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12462 NumElements / Scale);
12463 InputV = DAG.getBitcast(VT, InputV);
12464 InputV = ShuffleOffset(InputV);
12465 InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
12466 return DAG.getBitcast(VT, InputV);
12467 }
12468
12469 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12470 InputV = DAG.getBitcast(VT, InputV);
12471 bool AnyExt = ExtOpc == ISD::ANY_EXTEND;
12472
12473 // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
12474 if (ExtOpc == ISD::SIGN_EXTEND)
12475 return SDValue();
12476
12477 // For any extends we can cheat for larger element sizes and use shuffle
12478 // instructions that can fold with a load and/or copy.
12479 if (AnyExt && EltBits == 32) {
12480 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12481 -1};
12482 return DAG.getBitcast(
12483 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12484 DAG.getBitcast(MVT::v4i32, InputV),
12485 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12486 }
12487 if (AnyExt && EltBits == 16 && Scale > 2) {
12488 int PSHUFDMask[4] = {Offset / 2, -1,
12489 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12490 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12491 DAG.getBitcast(MVT::v4i32, InputV),
12492 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12493 int PSHUFWMask[4] = {1, -1, -1, -1};
12494 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12495 return DAG.getBitcast(
12496 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12497 DAG.getBitcast(MVT::v8i16, InputV),
12498 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12499 }
12500
12501 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12502 // to 64-bits.
12503 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12504 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12505 assert(VT.is128BitVector() && "Unexpected vector width!");
12506
12507 int LoIdx = Offset * EltBits;
12508 SDValue Lo = DAG.getBitcast(
12509 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12510 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12511 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12512
12513 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12514 return DAG.getBitcast(VT, Lo);
12515
12516 int HiIdx = (Offset + 1) * EltBits;
12517 SDValue Hi = DAG.getBitcast(
12518 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12519 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12520 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12521 return DAG.getBitcast(VT,
12522 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12523 }
12524
12525 // If this would require more than 2 unpack instructions to expand, use
12526 // pshufb when available. We can only use more than 2 unpack instructions
12527 // when zero extending i8 elements which also makes it easier to use pshufb.
12528 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12529 assert(NumElements == 16 && "Unexpected byte vector width!");
12530 SDValue PSHUFBMask[16];
12531 for (int i = 0; i < 16; ++i) {
12532 int Idx = Offset + (i / Scale);
12533 if ((i % Scale == 0 && SafeOffset(Idx))) {
12534 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12535 continue;
12536 }
12537 PSHUFBMask[i] =
12538 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12539 }
12540 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12541 return DAG.getBitcast(
12542 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12543 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12544 }
12545
12546 // If we are extending from an offset, ensure we start on a boundary that
12547 // we can unpack from.
12548 int AlignToUnpack = Offset % (NumElements / Scale);
12549 if (AlignToUnpack) {
12550 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12551 for (int i = AlignToUnpack; i < NumElements; ++i)
12552 ShMask[i - AlignToUnpack] = i;
12553 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12554 Offset -= AlignToUnpack;
12555 }
12556
12557 // Otherwise emit a sequence of unpacks.
12558 do {
12559 unsigned UnpackLoHi = X86ISD::UNPCKL;
12560 if (Offset >= (NumElements / 2)) {
12561 UnpackLoHi = X86ISD::UNPCKH;
12562 Offset -= (NumElements / 2);
12563 }
12564
12565 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12566 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12567 : getZeroVector(InputVT, Subtarget, DAG, DL);
12568 InputV = DAG.getBitcast(InputVT, InputV);
12569 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12570 Scale /= 2;
12571 EltBits *= 2;
12572 NumElements /= 2;
12573 } while (Scale > 1);
12574 return DAG.getBitcast(VT, InputV);
12575}
12576
12577/// Try to lower a vector shuffle as a zero extension on any microarch.
12578///
12579/// This routine will try to do everything in its power to cleverly lower
12580/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12581/// check for the profitability of this lowering, it tries to aggressively
12582/// match this pattern. It will use all of the micro-architectural details it
12583/// can to emit an efficient lowering. It handles both blends with all-zero
12584/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12585/// masking out later).
12586///
12587/// The reason we have dedicated lowering for zext-style shuffles is that they
12588/// are both incredibly common and often quite performance sensitive.
12589 static SDValue lowerShuffleAsZeroOrAnyExtend(
12590 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12591 const APInt &Zeroable, const X86Subtarget &Subtarget,
12592 SelectionDAG &DAG) {
12593 int Bits = VT.getSizeInBits();
12594 int NumLanes = Bits / 128;
12595 int NumElements = VT.getVectorNumElements();
12596 int NumEltsPerLane = NumElements / NumLanes;
12597 assert(VT.getScalarSizeInBits() <= 32 &&
12598 "Exceeds 32-bit integer zero extension limit");
12599 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12600
12601 // Define a helper function to check a particular ext-scale and lower to it if
12602 // valid.
12603 auto Lower = [&](int Scale) -> SDValue {
12604 SDValue InputV;
12605 bool AnyExt = true;
12606 int Offset = 0;
12607 int Matches = 0;
12608 for (int i = 0; i < NumElements; ++i) {
12609 int M = Mask[i];
12610 if (M < 0)
12611 continue; // Valid anywhere but doesn't tell us anything.
12612 if (i % Scale != 0) {
12613 // Each of the extended elements need to be zeroable.
12614 if (!Zeroable[i])
12615 return SDValue();
12616
12617 // We no longer are in the anyext case.
12618 AnyExt = false;
12619 continue;
12620 }
12621
12622 // Each of the base elements needs to be consecutive indices into the
12623 // same input vector.
12624 SDValue V = M < NumElements ? V1 : V2;
12625 M = M % NumElements;
12626 if (!InputV) {
12627 InputV = V;
12628 Offset = M - (i / Scale);
12629 } else if (InputV != V)
12630 return SDValue(); // Flip-flopping inputs.
12631
12632 // Offset must start in the lowest 128-bit lane or at the start of an
12633 // upper lane.
12634 // FIXME: Is it ever worth allowing a negative base offset?
12635 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12636 (Offset % NumEltsPerLane) == 0))
12637 return SDValue();
12638
12639 // If we are offsetting, all referenced entries must come from the same
12640 // lane.
12641 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12642 return SDValue();
12643
12644 if ((M % NumElements) != (Offset + (i / Scale)))
12645 return SDValue(); // Non-consecutive strided elements.
12646 Matches++;
12647 }
12648
12649 // If we fail to find an input, we have a zero-shuffle which should always
12650 // have already been handled.
12651 // FIXME: Maybe handle this here in case during blending we end up with one?
12652 if (!InputV)
12653 return SDValue();
12654
12655 // If we are offsetting, don't extend if we only match a single input, we
12656 // can always do better by using a basic PSHUF or PUNPCK.
12657 if (Offset != 0 && Matches < 2)
12658 return SDValue();
12659
12660 unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
12661 return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
12662 InputV, Mask, Subtarget, DAG);
12663 };
12664
12665 // The widest scale possible for extending is to a 64-bit integer.
12666 assert(Bits % 64 == 0 &&
12667 "The number of bits in a vector must be divisible by 64 on x86!");
12668 int NumExtElements = Bits / 64;
12669
12670 // Each iteration, try extending the elements half as much, but into twice as
12671 // many elements.
12672 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12673 assert(NumElements % NumExtElements == 0 &&
12674 "The input vector size must be divisible by the extended size.");
12675 if (SDValue V = Lower(NumElements / NumExtElements))
12676 return V;
12677 }
12678
12679 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12680 if (Bits != 128)
12681 return SDValue();
12682
12683 // Returns one of the source operands if the shuffle can be reduced to a
12684 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12685 auto CanZExtLowHalf = [&]() {
12686 for (int i = NumElements / 2; i != NumElements; ++i)
12687 if (!Zeroable[i])
12688 return SDValue();
12689 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12690 return V1;
12691 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12692 return V2;
12693 return SDValue();
12694 };
12695
12696 if (SDValue V = CanZExtLowHalf()) {
12697 V = DAG.getBitcast(MVT::v2i64, V);
12698 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12699 return DAG.getBitcast(VT, V);
12700 }
12701
12702 // No viable ext lowering found.
12703 return SDValue();
12704}
12705
12706/// Try to get a scalar value for a specific element of a vector.
12707///
12708/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12709 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12710 SelectionDAG &DAG) {
12711 MVT VT = V.getSimpleValueType();
12712 MVT EltVT = VT.getVectorElementType();
12713 V = peekThroughBitcasts(V);
12714
12715 // If the bitcasts shift the element size, we can't extract an equivalent
12716 // element from it.
12717 MVT NewVT = V.getSimpleValueType();
12718 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12719 return SDValue();
12720
12721 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12722 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12723 // Ensure the scalar operand is the same size as the destination.
12724 // FIXME: Add support for scalar truncation where possible.
12725 SDValue S = V.getOperand(Idx);
12726 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12727 return DAG.getBitcast(EltVT, S);
12728 }
12729
12730 return SDValue();
12731}
12732
12733/// Helper to test for a load that can be folded with x86 shuffles.
12734///
12735/// This is particularly important because the set of instructions varies
12736/// significantly based on whether the operand is a load or not.
12737 static bool isShuffleFoldableLoad(SDValue V) {
12738 return V.hasOneUse() &&
12739 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
12740 }
12741
12742template<typename T>
12743static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12744 T EltVT = VT.getScalarType();
12745 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12746 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12747}
12748
12749/// Try to lower insertion of a single element into a zero vector.
12750///
12751 /// This is a common pattern for which we have especially efficient lowering
12752 /// patterns across all subtarget feature sets.
12753 static SDValue lowerShuffleAsElementInsertion(
12754 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12755 const APInt &Zeroable, const X86Subtarget &Subtarget,
12756 SelectionDAG &DAG) {
12757 MVT ExtVT = VT;
12758 MVT EltVT = VT.getVectorElementType();
12759 unsigned NumElts = VT.getVectorNumElements();
12760 unsigned EltBits = VT.getScalarSizeInBits();
12761
12762 if (isSoftF16(EltVT, Subtarget))
12763 return SDValue();
12764
12765 int V2Index =
12766 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12767 Mask.begin();
12768 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12769 bool IsV1Zeroable = true;
12770 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12771 if (i != V2Index && !Zeroable[i]) {
12772 IsV1Zeroable = false;
12773 break;
12774 }
12775
12776 // Bail if a non-zero V1 isn't used in place.
12777 if (!IsV1Zeroable) {
12778 SmallVector<int, 8> V1Mask(Mask);
12779 V1Mask[V2Index] = -1;
12780 if (!isNoopShuffleMask(V1Mask))
12781 return SDValue();
12782 }
12783
12784 // Check for a single input from a SCALAR_TO_VECTOR node.
12785 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12786 // all the smarts here sunk into that routine. However, the current
12787 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12788 // vector shuffle lowering is dead.
12789 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12790 DAG);
12791 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12792 // We need to zext the scalar if it is smaller than an i32.
12793 V2S = DAG.getBitcast(EltVT, V2S);
12794 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12795 // Using zext to expand a narrow element won't work for non-zero
12796 // insertions. But we can use a masked constant vector if we're
12797 // inserting V2 into the bottom of V1.
12798 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12799 return SDValue();
12800
12801 // Zero-extend directly to i32.
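// For example, a 128-bit VT becomes v4i32 and a 256-bit VT becomes v8i32
// here; the scalar is widened to i32 so VZEXT_MOVL can clear everything
// above the inserted element.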
12802 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12803 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12804
12805 // If we're inserting into a constant, mask off the inserted index
12806 // and OR with the zero-extended scalar.
12807 if (!IsV1Zeroable) {
12808 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12809 Bits[V2Index] = APInt::getZero(EltBits);
12810 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12811 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12812 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12813 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12814 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12815 }
12816 }
12817 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12818 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12819 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12820 // Either not inserting from the low element of the input or the input
12821 // element size is too small to use VZEXT_MOVL to clear the high bits.
12822 return SDValue();
12823 }
12824
12825 if (!IsV1Zeroable) {
12826 // If V1 can't be treated as a zero vector we have fewer options to lower
12827 // this. We can't support integer vectors or non-zero targets cheaply.
12828 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12829 if (!VT.isFloatingPoint() || V2Index != 0)
12830 return SDValue();
12831 if (!VT.is128BitVector())
12832 return SDValue();
12833
12834 // Otherwise, use MOVSD, MOVSS or MOVSH.
12835 unsigned MovOpc = 0;
12836 if (EltVT == MVT::f16)
12837 MovOpc = X86ISD::MOVSH;
12838 else if (EltVT == MVT::f32)
12839 MovOpc = X86ISD::MOVSS;
12840 else if (EltVT == MVT::f64)
12841 MovOpc = X86ISD::MOVSD;
12842 else
12843 llvm_unreachable("Unsupported floating point element type to handle!");
12844 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12845 }
12846
12847 // This lowering only works for the low element with floating point vectors.
12848 if (VT.isFloatingPoint() && V2Index != 0)
12849 return SDValue();
12850
12851 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12852 if (ExtVT != VT)
12853 V2 = DAG.getBitcast(VT, V2);
12854
12855 if (V2Index != 0) {
12856 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12857 // the desired position. Otherwise it is more efficient to do a vector
12858 // shift left. We know that we can do a vector shift left because all
12859 // the inputs are zero.
12860 if (VT.isFloatingPoint() || NumElts <= 4) {
12861 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12862 V2Shuffle[V2Index] = 0;
12863 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12864 } else {
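// The shift count below is the element's byte offset, e.g. inserting into
// lane 5 of a v8i16 shifts the zeroed vector left by 5 * 16 / 8 = 10 bytes.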
12865 V2 = DAG.getBitcast(MVT::v16i8, V2);
12866 V2 = DAG.getNode(
12867 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12868 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12869 V2 = DAG.getBitcast(VT, V2);
12870 }
12871 }
12872 return V2;
12873}
12874
12875/// Try to lower broadcast of a single - truncated - integer element,
12876/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12877///
12878/// This assumes we have AVX2.
12879 static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12880 int BroadcastIdx,
12881 const X86Subtarget &Subtarget,
12882 SelectionDAG &DAG) {
12883 assert(Subtarget.hasAVX2() &&
12884 "We can only lower integer broadcasts with AVX2!");
12885
12886 MVT EltVT = VT.getVectorElementType();
12887 MVT V0VT = V0.getSimpleValueType();
12888
12889 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12890 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12891
12892 MVT V0EltVT = V0VT.getVectorElementType();
12893 if (!V0EltVT.isInteger())
12894 return SDValue();
12895
12896 const unsigned EltSize = EltVT.getSizeInBits();
12897 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12898
12899 // This is only a truncation if the original element type is larger.
12900 if (V0EltSize <= EltSize)
12901 return SDValue();
12902
12903 assert(((V0EltSize % EltSize) == 0) &&
12904 "Scalar type sizes must all be powers of 2 on x86!");
12905
12906 const unsigned V0Opc = V0.getOpcode();
12907 const unsigned Scale = V0EltSize / EltSize;
12908 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12909
12910 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12911 V0Opc != ISD::BUILD_VECTOR)
12912 return SDValue();
12913
12914 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12915
12916 // If we're extracting non-least-significant bits, shift so we can truncate.
12917 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12918 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12919 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
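// For example, broadcasting i8 element 5 of a v4i32 build_vector: Scale is 4,
// so the scalar comes from V0 element 1 and OffsetIdx == 1 shifts it right by
// 8 bits before the truncate.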
12920 if (const int OffsetIdx = BroadcastIdx % Scale)
12921 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12922 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12923
12924 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12925 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12926}
12927
12928/// Test whether this can be lowered with a single SHUFPS instruction.
12929///
12930/// This is used to disable more specialized lowerings when the shufps lowering
12931/// will happen to be efficient.
12932 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12933 // This routine only handles 128-bit shufps.
12934 assert(Mask.size() == 4 && "Unsupported mask size!");
12935 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12936 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12937 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12938 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12939
12940 // To lower with a single SHUFPS we need to have the low half and high half
12941 // each requiring a single input.
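// For example, <0,1,6,7> qualifies (low half only from V1, high half only
// from V2), but <0,4,1,5> does not because each half mixes both inputs.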
12942 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12943 return false;
12944 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12945 return false;
12946
12947 return true;
12948}
12949
12950/// Test whether the specified input (0 or 1) is in-place blended by the
12951/// given mask.
12952///
12953/// This returns true if the elements from a particular input are already in the
12954/// slot required by the given mask and require no permutation.
12955 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12956 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12957 int Size = Mask.size();
12958 for (int i = 0; i < Size; ++i)
12959 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12960 return false;
12961
12962 return true;
12963}
12964
12965/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
12966/// the given mask.
12967///
12968 static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef<int> Mask,
12969 int BroadcastableElement = 0) {
12970 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12971 int Size = Mask.size();
12972 for (int i = 0; i < Size; ++i)
12973 if (Mask[i] >= 0 && Mask[i] / Size == Input &&
12974 Mask[i] % Size != BroadcastableElement)
12975 return false;
12976 return true;
12977}
12978
12979/// If we are extracting two 128-bit halves of a vector and shuffling the
12980/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12981/// multi-shuffle lowering.
12982 static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12983 SDValue N1, ArrayRef<int> Mask,
12984 SelectionDAG &DAG) {
12985 MVT VT = N0.getSimpleValueType();
12986 assert((VT.is128BitVector() &&
12987 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12988 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12989
12990 // Check that both sources are extracts of the same source vector.
12991 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12992 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12993 N0.getOperand(0) != N1.getOperand(0) ||
12994 !N0.hasOneUse() || !N1.hasOneUse())
12995 return SDValue();
12996
12997 SDValue WideVec = N0.getOperand(0);
12998 MVT WideVT = WideVec.getSimpleValueType();
12999 if (!WideVT.is256BitVector())
13000 return SDValue();
13001
13002 // Match extracts of each half of the wide source vector. Commute the shuffle
13003 // if the extract of the low half is N1.
13004 unsigned NumElts = VT.getVectorNumElements();
13005 SmallVector<int, 4> NewMask(Mask);
13006 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13007 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13008 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13009 ShuffleVectorSDNode::commuteMask(NewMask);
13010 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13011 return SDValue();
13012
13013 // Final bailout: if the mask is simple, we are better off using an extract
13014 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13015 // because that avoids a constant load from memory.
13016 if (NumElts == 4 &&
13017 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
13018 return SDValue();
13019
13020 // Extend the shuffle mask with undef elements.
13021 NewMask.append(NumElts, -1);
13022
13023 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13024 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13025 NewMask);
13026 // This is free: ymm -> xmm.
13027 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13028 DAG.getVectorIdxConstant(0, DL));
13029}
13030
13031/// Try to lower broadcast of a single element.
13032///
13033/// For convenience, this code also bundles all of the subtarget feature set
13034/// filtering. While a little annoying to re-dispatch on type here, there isn't
13035/// a convenient way to factor it out.
13036 static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13037 SDValue V2, ArrayRef<int> Mask,
13038 const X86Subtarget &Subtarget,
13039 SelectionDAG &DAG) {
13040 MVT EltVT = VT.getVectorElementType();
13041 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13042 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13043 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
13044 return SDValue();
13045
13046 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13047 // we can only broadcast from a register with AVX2.
13048 unsigned NumEltBits = VT.getScalarSizeInBits();
13049 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13050 ? X86ISD::MOVDDUP
13051 : X86ISD::VBROADCAST;
13052 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13053
13054 // Check that the mask is a broadcast.
13055 int BroadcastIdx = getSplatIndex(Mask);
13056 if (BroadcastIdx < 0) {
13057 // Check for hidden broadcast.
13058 SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
13059 if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
13060 return SDValue();
13061 BroadcastIdx = 0;
13062 }
13063 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13064 "a sorted mask where the broadcast "
13065 "comes from V1.");
13066 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
13067
13068 // Go up the chain of (vector) values to find a scalar load that we can
13069 // combine with the broadcast.
13070 // TODO: Combine this logic with findEltLoadSrc() used by
13071 // EltsFromConsecutiveLoads().
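// The walk below tracks the broadcast element as a bit offset, e.g. element 5
// of a v8i32 formed by concatenating two v4i32 values starts at BitOffset 160,
// steps into operand 1 of the CONCAT_VECTORS and continues with BitOffset 32.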
13072 int BitOffset = BroadcastIdx * NumEltBits;
13073 SDValue V = V1;
13074 for (;;) {
13075 switch (V.getOpcode()) {
13076 case ISD::BITCAST: {
13077 V = V.getOperand(0);
13078 continue;
13079 }
13080 case ISD::CONCAT_VECTORS: {
13081 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13082 int OpIdx = BitOffset / OpBitWidth;
13083 V = V.getOperand(OpIdx);
13084 BitOffset %= OpBitWidth;
13085 continue;
13086 }
13087 case ISD::EXTRACT_SUBVECTOR: {
13088 // The extraction index adds to the existing offset.
13089 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13090 unsigned Idx = V.getConstantOperandVal(1);
13091 unsigned BeginOffset = Idx * EltBitWidth;
13092 BitOffset += BeginOffset;
13093 V = V.getOperand(0);
13094 continue;
13095 }
13096 case ISD::INSERT_SUBVECTOR: {
13097 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13098 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13099 int Idx = (int)V.getConstantOperandVal(2);
13100 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13101 int BeginOffset = Idx * EltBitWidth;
13102 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13103 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13104 BitOffset -= BeginOffset;
13105 V = VInner;
13106 } else {
13107 V = VOuter;
13108 }
13109 continue;
13110 }
13111 }
13112 break;
13113 }
13114 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13115 BroadcastIdx = BitOffset / NumEltBits;
13116
13117 // Do we need to bitcast the source to retrieve the original broadcast index?
13118 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13119
13120 // Check if this is a broadcast of a scalar. We special case lowering
13121 // for scalars so that we can more effectively fold with loads.
13122 // If the original value has a larger element type than the shuffle, the
13123 // broadcast element is in essence truncated. Make that explicit to ease
13124 // folding.
13125 if (BitCastSrc && VT.isInteger())
13126 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13127 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13128 return TruncBroadcast;
13129
13130 // Also check the simpler case, where we can directly reuse the scalar.
13131 if (!BitCastSrc &&
13132 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13133 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13134 V = V.getOperand(BroadcastIdx);
13135
13136 // If we can't broadcast from a register, check that the input is a load.
13137 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13138 return SDValue();
13139 } else if (ISD::isNormalLoad(V.getNode()) &&
13140 cast<LoadSDNode>(V)->isSimple()) {
13141 // We do not check for one-use of the vector load because a broadcast load
13142 // is expected to be a win for code size, register pressure, and possibly
13143 // uops even if the original vector load is not eliminated.
13144
13145 // Reduce the vector load and shuffle to a broadcasted scalar load.
13146 auto *Ld = cast<LoadSDNode>(V);
13147 SDValue BaseAddr = Ld->getBasePtr();
13148 MVT SVT = VT.getScalarType();
13149 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13150 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13151 SDValue NewAddr =
13152 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
13153
13154 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13155 // than MOVDDUP.
13156 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13157 if (Opcode == X86ISD::VBROADCAST) {
13158 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13159 SDValue Ops[] = {Ld->getChain(), NewAddr};
13160 V = DAG.getMemIntrinsicNode(
13161 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13162 DAG.getMachineFunction().getMachineMemOperand(
13163 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13164 DAG.makeEquivalentMemoryOrdering(Ld, V);
13165 return DAG.getBitcast(VT, V);
13166 }
13167 assert(SVT == MVT::f64 && "Unexpected VT!");
13168 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13169 DAG.getMachineFunction().getMachineMemOperand(
13170 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13171 DAG.makeEquivalentMemoryOrdering(Ld, V);
13172 } else if (!BroadcastFromReg) {
13173 // We can't broadcast from a vector register.
13174 return SDValue();
13175 } else if (BitOffset != 0) {
13176 // We can only broadcast from the zero-element of a vector register,
13177 // but it can be advantageous to broadcast from the zero-element of a
13178 // subvector.
13179 if (!VT.is256BitVector() && !VT.is512BitVector())
13180 return SDValue();
13181
13182 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13183 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13184 return SDValue();
13185
13186 // If we are broadcasting an element from the lowest 128-bit subvector, try
13187 // to move the element in position.
13188 if (BitOffset < 128 && NumActiveElts > 1 &&
13189 V.getScalarValueSizeInBits() == NumEltBits) {
13190 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13191 "Unexpected bit-offset");
13192 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
13193 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13194 V = extractSubVector(V, 0, DAG, DL, 128);
13195 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
13196 } else {
13197 // Only broadcast the zero-element of a 128-bit subvector.
13198 if ((BitOffset % 128) != 0)
13199 return SDValue();
13200
13201 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13202 "Unexpected bit-offset");
13203 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13204 "Unexpected vector size");
13205 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13206 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13207 }
13208 }
13209
13210 // On AVX we can use VBROADCAST directly for scalar sources.
13211 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13212 V = DAG.getBitcast(MVT::f64, V);
13213 if (Subtarget.hasAVX()) {
13214 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13215 return DAG.getBitcast(VT, V);
13216 }
13217 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13218 }
13219
13220 // If this is a scalar, do the broadcast on this type and bitcast.
13221 if (!V.getValueType().isVector()) {
13222 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13223 "Unexpected scalar size");
13224 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13225 VT.getSizeInBits() / NumEltBits);
13226 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13227 }
13228
13229 // We only support broadcasting from 128-bit vectors to minimize the
13230 // number of patterns we need to deal with in isel. So extract down to
13231 // 128-bits, removing as many bitcasts as possible.
13232 if (V.getValueSizeInBits() > 128)
13233 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
13234
13235 // Otherwise cast V to a vector with the same element type as VT, but
13236 // possibly narrower than VT. Then perform the broadcast.
13237 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13238 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13239 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13240}
13241
13242// Check for whether we can use INSERTPS to perform the shuffle. We only use
13243// INSERTPS when the V1 elements are already in the correct locations
13244// because otherwise we can just always use two SHUFPS instructions which
13245// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13246// perform INSERTPS if a single V1 element is out of place and all V2
13247// elements are zeroable.
13248 static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13249 unsigned &InsertPSMask,
13250 const APInt &Zeroable,
13251 ArrayRef<int> Mask, SelectionDAG &DAG) {
13252 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13253 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13254 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13255
13256 // Attempt to match INSERTPS with one element from VA or VB being
13257 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13258 // are updated.
13259 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13260 ArrayRef<int> CandidateMask) {
13261 unsigned ZMask = 0;
13262 int VADstIndex = -1;
13263 int VBDstIndex = -1;
13264 bool VAUsedInPlace = false;
13265
13266 for (int i = 0; i < 4; ++i) {
13267 // Synthesize a zero mask from the zeroable elements (includes undefs).
13268 if (Zeroable[i]) {
13269 ZMask |= 1 << i;
13270 continue;
13271 }
13272
13273 // Flag if we use any VA inputs in place.
13274 if (i == CandidateMask[i]) {
13275 VAUsedInPlace = true;
13276 continue;
13277 }
13278
13279 // We can only insert a single non-zeroable element.
13280 if (VADstIndex >= 0 || VBDstIndex >= 0)
13281 return false;
13282
13283 if (CandidateMask[i] < 4) {
13284 // VA input out of place for insertion.
13285 VADstIndex = i;
13286 } else {
13287 // VB input for insertion.
13288 VBDstIndex = i;
13289 }
13290 }
13291
13292 // Don't bother if we have no (non-zeroable) element for insertion.
13293 if (VADstIndex < 0 && VBDstIndex < 0)
13294 return false;
13295
13296 // Determine element insertion src/dst indices. The src index is from the
13297 // start of the inserted vector, not the start of the concatenated vector.
13298 unsigned VBSrcIndex = 0;
13299 if (VADstIndex >= 0) {
13300 // If we have a VA input out of place, we use VA as the V2 element
13301 // insertion and don't use the original V2 at all.
13302 VBSrcIndex = CandidateMask[VADstIndex];
13303 VBDstIndex = VADstIndex;
13304 VB = VA;
13305 } else {
13306 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13307 }
13308
13309 // If no V1 inputs are used in place, then the result is created only from
13310 // the zero mask and the V2 insertion - so remove V1 dependency.
13311 if (!VAUsedInPlace)
13312 VA = DAG.getUNDEF(MVT::v4f32);
13313
13314 // Update V1, V2 and InsertPSMask accordingly.
13315 V1 = VA;
13316 V2 = VB;
13317
13318 // Insert the V2 element into the desired position.
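// INSERTPS imm8 layout: bits [7:6] select the source element of V2, bits
// [5:4] select the destination lane, and bits [3:0] zero result lanes.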
13319 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13320 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13321 return true;
13322 };
13323
13324 if (matchAsInsertPS(V1, V2, Mask))
13325 return true;
13326
13327 // Commute and try again.
13328 SmallVector<int, 4> CommutedMask(Mask);
13329 ShuffleVectorSDNode::commuteMask(CommutedMask);
13330 if (matchAsInsertPS(V2, V1, CommutedMask))
13331 return true;
13332
13333 return false;
13334}
13335
13336 static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13337 ArrayRef<int> Mask, const APInt &Zeroable,
13338 SelectionDAG &DAG) {
13339 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13340 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13341
13342 // Attempt to match the insertps pattern.
13343 unsigned InsertPSMask = 0;
13344 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13345 return SDValue();
13346
13347 // Insert the V2 element into the desired position.
13348 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13349 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13350}
13351
13352/// Handle lowering of 2-lane 64-bit floating point shuffles.
13353///
13354/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13355/// support for floating point shuffles but not integer shuffles. These
13356/// instructions will incur a domain crossing penalty on some chips though so
13357/// it is better to avoid lowering through this for integer vectors where
13358/// possible.
13359 static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13360 const APInt &Zeroable, SDValue V1, SDValue V2,
13361 const X86Subtarget &Subtarget,
13362 SelectionDAG &DAG) {
13363 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13364 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13365 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13366
13367 if (V2.isUndef()) {
13368 // Check for being able to broadcast a single element.
13369 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13370 Mask, Subtarget, DAG))
13371 return Broadcast;
13372
13373 // Straight shuffle of a single input vector. Simulate this by using the
13374 // single input as both of the "inputs" to this instruction.
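// For example, Mask = <1, 1> encodes as immediate 0b11 (take the high
// element for both result lanes).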
13375 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13376
13377 if (Subtarget.hasAVX()) {
13378 // If we have AVX, we can use VPERMILPS which will allow folding a load
13379 // into the shuffle.
13380 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13381 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13382 }
13383
13384 return DAG.getNode(
13385 X86ISD::SHUFP, DL, MVT::v2f64,
13386 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13387 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13388 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13389 }
13390 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13391 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13392 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13393 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13394
13395 if (Subtarget.hasAVX2())
13396 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13397 return Extract;
13398
13399 // When loading a scalar and then shuffling it into a vector we can often do
13400 // the insertion cheaply.
13401 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13402 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13403 return Insertion;
13404 // Try inverting the insertion since for v2 masks it is easy to do and we
13405 // can't reliably sort the mask one way or the other.
13406 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13407 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13408 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13409 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13410 return Insertion;
13411
13412 // Try to use one of the special instruction patterns to handle two common
13413 // blend patterns if a zero-blend above didn't work.
13414 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13415 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13416 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13417 // We can either use a special instruction to load over the low double or
13418 // to move just the low double.
13419 return DAG.getNode(
13420 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13421 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13422
13423 if (Subtarget.hasSSE41())
13424 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13425 Zeroable, Subtarget, DAG))
13426 return Blend;
13427
13428 // Use dedicated unpack instructions for masks that match their pattern.
13429 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13430 return V;
13431
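// The low immediate bit selects the element of V1 (result lane 0), the high
// bit the element of V2 (result lane 1), e.g. Mask = <1, 3> gives 0b11.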
13432 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13433 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13434 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13435}
13436
13437/// Handle lowering of 2-lane 64-bit integer shuffles.
13438///
13439/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13440/// the integer unit to minimize domain crossing penalties. However, for blends
13441/// it falls back to the floating point shuffle operation with appropriate bit
13442/// casting.
13443 static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13444 const APInt &Zeroable, SDValue V1, SDValue V2,
13445 const X86Subtarget &Subtarget,
13446 SelectionDAG &DAG) {
13447 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13448 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13449 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13450
13451 if (V2.isUndef()) {
13452 // Check for being able to broadcast a single element.
13453 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13454 Mask, Subtarget, DAG))
13455 return Broadcast;
13456
13457 // Straight shuffle of a single input vector. For everything from SSE2
13458 // onward this has a single fast instruction with no scary immediates.
13459 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13460 V1 = DAG.getBitcast(MVT::v4i32, V1);
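// Each v2i64 lane expands to two v4i32 lanes, e.g. Mask = <1, 0> becomes the
// v4i32 mask <2, 3, 0, 1>.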
13461 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13462 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13463 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13464 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13465 return DAG.getBitcast(
13466 MVT::v2i64,
13467 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13468 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13469 }
13470 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13471 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13472 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13473 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13474
13475 if (Subtarget.hasAVX2())
13476 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13477 return Extract;
13478
13479 // Try to use shift instructions.
13480 if (SDValue Shift =
13481 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13482 DAG, /*BitwiseOnly*/ false))
13483 return Shift;
13484
13485 // When loading a scalar and then shuffling it into a vector we can often do
13486 // the insertion cheaply.
13487 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13488 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13489 return Insertion;
13490 // Try inverting the insertion since for v2 masks it is easy to do and we
13491 // can't reliably sort the mask one way or the other.
13492 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13493 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13494 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13495 return Insertion;
13496
13497 // We have different paths for blend lowering, but they all must use the
13498 // *exact* same predicate.
13499 bool IsBlendSupported = Subtarget.hasSSE41();
13500 if (IsBlendSupported)
13501 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13502 Zeroable, Subtarget, DAG))
13503 return Blend;
13504
13505 // Use dedicated unpack instructions for masks that match their pattern.
13506 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13507 return V;
13508
13509 // Try to use byte rotation instructions.
13510 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13511 if (Subtarget.hasSSSE3()) {
13512 if (Subtarget.hasVLX())
13513 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13514 Zeroable, Subtarget, DAG))
13515 return Rotate;
13516
13517 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13518 Subtarget, DAG))
13519 return Rotate;
13520 }
13521
13522 // If we have direct support for blends, we should lower by decomposing into
13523 // a permute. That will be faster than the domain cross.
13524 if (IsBlendSupported)
13525 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13526 Zeroable, Subtarget, DAG);
13527
13528 // We implement this with SHUFPD which is pretty lame because it will likely
13529 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13530 // However, all the alternatives are still more cycles and newer chips don't
13531 // have this problem. It would be really nice if x86 had better shuffles here.
13532 V1 = DAG.getBitcast(MVT::v2f64, V1);
13533 V2 = DAG.getBitcast(MVT::v2f64, V2);
13534 return DAG.getBitcast(MVT::v2i64,
13535 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13536}
13537
13538/// Lower a vector shuffle using the SHUFPS instruction.
13539///
13540/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13541 /// It makes no assumptions about whether this is the *best* lowering; it simply
13542 /// uses it.
13543 static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13544 ArrayRef<int> Mask, SDValue V1,
13545 SDValue V2, SelectionDAG &DAG) {
13546 SDValue LowV = V1, HighV = V2;
13547 SmallVector<int, 4> NewMask(Mask);
13548 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13549
13550 if (NumV2Elements == 1) {
13551 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13552
13553 // Compute the index adjacent to V2Index and in the same half by toggling
13554 // the low bit.
13555 int V2AdjIndex = V2Index ^ 1;
13556
13557 if (Mask[V2AdjIndex] < 0) {
13558 // Handles all the cases where we have a single V2 element and an undef.
13559 // This will only ever happen in the high lanes because we commute the
13560 // vector otherwise.
13561 if (V2Index < 2)
13562 std::swap(LowV, HighV);
13563 NewMask[V2Index] -= 4;
13564 } else {
13565 // Handle the case where the V2 element ends up adjacent to a V1 element.
13566 // To make this work, blend them together as the first step.
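// For example, with Mask = <0, 1, 6, 3>: V2Index == 2 and V1Index == 3, so
// BlendMask is <2, 0, 3, 0> and the blend puts V2[2] in lane 0 and V1[3] in
// lane 2 of the temporary.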
13567 int V1Index = V2AdjIndex;
13568 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13569 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13570 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13571
13572 // Now proceed to reconstruct the final blend as we have the necessary
13573 // high or low half formed.
13574 if (V2Index < 2) {
13575 LowV = V2;
13576 HighV = V1;
13577 } else {
13578 HighV = V2;
13579 }
13580 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13581 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13582 }
13583 } else if (NumV2Elements == 2) {
13584 if (Mask[0] < 4 && Mask[1] < 4) {
13585 // Handle the easy case where we have V1 in the low lanes and V2 in the
13586 // high lanes.
13587 NewMask[2] -= 4;
13588 NewMask[3] -= 4;
13589 } else if (Mask[2] < 4 && Mask[3] < 4) {
13590 // We also handle the reversed case because this utility may get called
13591 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13592 // arrange things in the right direction.
13593 NewMask[0] -= 4;
13594 NewMask[1] -= 4;
13595 HighV = V1;
13596 LowV = V2;
13597 } else {
13598 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13599 // trying to place elements directly, just blend them and set up the final
13600 // shuffle to place them.
13601
13602 // The first two blend mask elements are for V1, the second two are for
13603 // V2.
13604 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13605 Mask[2] < 4 ? Mask[2] : Mask[3],
13606 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13607 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13608 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13609 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13610
13611 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13612 // a blend.
13613 LowV = HighV = V1;
13614 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13615 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13616 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13617 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13618 }
13619 } else if (NumV2Elements == 3) {
13620 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13621 // we can get here due to other paths (e.g. repeated mask matching) that we
13622 // don't want to do another round of lowerVECTOR_SHUFFLE.
13623 ShuffleVectorSDNode::commuteMask(NewMask);
13624 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13625 }
13626 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13627 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13628}
13629
13630/// Lower 4-lane 32-bit floating point shuffles.
13631///
13632/// Uses instructions exclusively from the floating point unit to minimize
13633/// domain crossing penalties, as these are sufficient to implement all v4f32
13634/// shuffles.
13635 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13636 const APInt &Zeroable, SDValue V1, SDValue V2,
13637 const X86Subtarget &Subtarget,
13638 SelectionDAG &DAG) {
13639 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13640 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13641 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13642
13643 if (Subtarget.hasSSE41())
13644 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13645 Zeroable, Subtarget, DAG))
13646 return Blend;
13647
13648 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13649
13650 if (NumV2Elements == 0) {
13651 // Check for being able to broadcast a single element.
13652 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13653 Mask, Subtarget, DAG))
13654 return Broadcast;
13655
13656 // Use even/odd duplicate instructions for masks that match their pattern.
13657 if (Subtarget.hasSSE3()) {
13658 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13659 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13660 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13661 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13662 }
13663
13664 if (Subtarget.hasAVX()) {
13665 // If we have AVX, we can use VPERMILPS which will allow folding a load
13666 // into the shuffle.
13667 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13668 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13669 }
13670
13671 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13672 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13673 if (!Subtarget.hasSSE2()) {
13674 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13675 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13676 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13677 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13678 }
13679
13680 // Otherwise, use a straight shuffle of a single input vector. We pass the
13681 // input vector to both operands to simulate this with a SHUFPS.
13682 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13683 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13684 }
13685
13686 if (Subtarget.hasSSE2())
13687 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13688 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13689 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13690 return ZExt;
13691 }
13692
13693 if (Subtarget.hasAVX2())
13694 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13695 return Extract;
13696
13697 // There are special ways we can lower some single-element blends. However, we
13698 // have custom ways we can lower more complex single-element blends below that
13699 // we defer to if both this and BLENDPS fail to match, so restrict this to
13700 // when the V2 input is targeting element 0 of the mask -- that is the fast
13701 // case here.
13702 if (NumV2Elements == 1 && Mask[0] >= 4)
13703 if (SDValue V = lowerShuffleAsElementInsertion(
13704 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13705 return V;
13706
13707 if (Subtarget.hasSSE41()) {
13708 // Use INSERTPS if we can complete the shuffle efficiently.
13709 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13710 return V;
13711
13712 if (!isSingleSHUFPSMask(Mask))
13713 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13714 V2, Mask, DAG))
13715 return BlendPerm;
13716 }
13717
13718 // Use low/high mov instructions. These are only valid in SSE1 because
13719 // otherwise they are widened to v2f64 and never get here.
13720 if (!Subtarget.hasSSE2()) {
13721 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13722 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13723 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13724 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13725 }
13726
13727 // Use dedicated unpack instructions for masks that match their pattern.
13728 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13729 return V;
13730
13731 // Otherwise fall back to a SHUFPS lowering strategy.
13732 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13733}
13734
13735/// Lower 4-lane i32 vector shuffles.
13736///
13737/// We try to handle these with integer-domain shuffles where we can, but for
13738/// blends we use the floating point domain blend instructions.
13739 static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13740 const APInt &Zeroable, SDValue V1, SDValue V2,
13741 const X86Subtarget &Subtarget,
13742 SelectionDAG &DAG) {
13743 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13744 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13745 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13746
13747 // Whenever we can lower this as a zext, that instruction is strictly faster
13748 // than any alternative. It also allows us to fold memory operands into the
13749 // shuffle in many cases.
13750 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13751 Zeroable, Subtarget, DAG))
13752 return ZExt;
13753
13754 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13755
13756 // Try to use shift instructions if fast.
13757 if (Subtarget.preferLowerShuffleAsShift()) {
13758 if (SDValue Shift =
13759 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13760 Subtarget, DAG, /*BitwiseOnly*/ true))
13761 return Shift;
13762 if (NumV2Elements == 0)
13763 if (SDValue Rotate =
13764 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13765 return Rotate;
13766 }
13767
13768 if (NumV2Elements == 0) {
13769 // Try to use broadcast unless the mask only has one non-undef element.
13770 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13771 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13772 Mask, Subtarget, DAG))
13773 return Broadcast;
13774 }
13775
13776 // Straight shuffle of a single input vector. For everything from SSE2
13777 // onward this has a single fast instruction with no scary immediates.
13778 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13779 // but we aren't actually going to use the UNPCK instruction because doing
13780 // so prevents folding a load into this instruction or making a copy.
13781 const int UnpackLoMask[] = {0, 0, 1, 1};
13782 const int UnpackHiMask[] = {2, 2, 3, 3};
13783 if (!isSingleElementRepeatedMask(Mask)) {
13784 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13785 Mask = UnpackLoMask;
13786 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13787 Mask = UnpackHiMask;
13788 }
13789
13790 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13791 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13792 }
13793
13794 if (Subtarget.hasAVX2())
13795 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13796 return Extract;
13797
13798 // Try to use shift instructions.
13799 if (SDValue Shift =
13800 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13801 DAG, /*BitwiseOnly*/ false))
13802 return Shift;
13803
13804 // There are special ways we can lower some single-element blends.
13805 if (NumV2Elements == 1)
13806 if (SDValue V = lowerShuffleAsElementInsertion(
13807 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13808 return V;
13809
13810 // We have different paths for blend lowering, but they all must use the
13811 // *exact* same predicate.
13812 bool IsBlendSupported = Subtarget.hasSSE41();
13813 if (IsBlendSupported)
13814 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13815 Zeroable, Subtarget, DAG))
13816 return Blend;
13817
13818 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13819 Zeroable, Subtarget, DAG))
13820 return Masked;
13821
13822 // Use dedicated unpack instructions for masks that match their pattern.
13823 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
13824 return V;
13825
13826 // Try to use byte rotation instructions.
13827 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13828 if (Subtarget.hasSSSE3()) {
13829 if (Subtarget.hasVLX())
13830 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13831 Zeroable, Subtarget, DAG))
13832 return Rotate;
13833
13834 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13835 Subtarget, DAG))
13836 return Rotate;
13837 }
13838
13839 // Assume that a single SHUFPS is faster than an alternative sequence of
13840 // multiple instructions (even if the CPU has a domain penalty).
13841 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13842 if (!isSingleSHUFPSMask(Mask)) {
13843 // If we have direct support for blends, we should lower by decomposing into
13844 // a permute. That will be faster than the domain cross.
13845 if (IsBlendSupported)
13846 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13847 Zeroable, Subtarget, DAG);
13848
13849 // Try to lower by permuting the inputs into an unpack instruction.
13850 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13851 Mask, Subtarget, DAG))
13852 return Unpack;
13853 }
13854
13855 // We implement this with SHUFPS because it can blend from two vectors.
13856 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13857 // up the inputs, bypassing domain shift penalties that we would incur if we
13858 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13859 // relevant.
13860 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13861 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13862 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13863 return DAG.getBitcast(MVT::v4i32, ShufPS);
13864}
13865
13866/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13867/// shuffle lowering, and the most complex part.
13868///
13869/// The lowering strategy is to try to form pairs of input lanes which are
13870/// targeted at the same half of the final vector, and then use a dword shuffle
13871/// to place them onto the right half, and finally unpack the paired lanes into
13872/// their final position.
13873///
13874/// The exact breakdown of how to form these dword pairs and align them on the
13875/// correct sides is really tricky. See the comments within the function for
13876/// more of the details.
13877///
13878/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13879/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13880/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13881/// vector, form the analogous 128-bit 8-element Mask.
13882 static SDValue lowerV8I16GeneralSingleInputShuffle(
13883 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13884 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13885 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13886 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13887
13888 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13889 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13890 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13891
13892 // Attempt to directly match PSHUFLW or PSHUFHW.
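// For example, <2,0,3,1,4,5,6,7> is a pure PSHUFLW and <0,1,2,3,7,6,5,4> is a
// pure PSHUFHW.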
13893 if (isUndefOrInRange(LoMask, 0, 4) &&
13894 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13895 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13896 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13897 }
13898 if (isUndefOrInRange(HiMask, 4, 8) &&
13899 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13900 for (int i = 0; i != 4; ++i)
13901 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13902 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13903 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13904 }
13905
13906 SmallVector<int, 4> LoInputs;
13907 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13908 array_pod_sort(LoInputs.begin(), LoInputs.end());
13909 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13910 SmallVector<int, 4> HiInputs;
13911 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13912 array_pod_sort(HiInputs.begin(), HiInputs.end());
13913 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13914 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13915 int NumHToL = LoInputs.size() - NumLToL;
13916 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13917 int NumHToH = HiInputs.size() - NumLToH;
13918 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13919 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13920 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13921 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13922
13923 // If we are shuffling values from one half - check how many different DWORD
13924 // pairs we need to create. If only 1 or 2 then we can perform this as a
13925 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
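// For example, Mask = <0,1,0,1,2,3,2,3> only needs the dword pairs (0,1) and
// (2,3): a PSHUFLW with an identity mask followed by PSHUFD <0,0,1,1>.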
13926 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13927 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13928 V = DAG.getNode(ShufWOp, DL, VT, V,
13929 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13930 V = DAG.getBitcast(PSHUFDVT, V);
13931 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13932 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13933 return DAG.getBitcast(VT, V);
13934 };
13935
13936 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13937 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13938 SmallVector<std::pair<int, int>, 4> DWordPairs;
13939 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13940
13941 // Collect the different DWORD pairs.
13942 for (int DWord = 0; DWord != 4; ++DWord) {
13943 int M0 = Mask[2 * DWord + 0];
13944 int M1 = Mask[2 * DWord + 1];
13945 M0 = (M0 >= 0 ? M0 % 4 : M0);
13946 M1 = (M1 >= 0 ? M1 % 4 : M1);
13947 if (M0 < 0 && M1 < 0)
13948 continue;
13949
13950 bool Match = false;
13951 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13952 auto &DWordPair = DWordPairs[j];
13953 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13954 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13955 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13956 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13957 PSHUFDMask[DWord] = DOffset + j;
13958 Match = true;
13959 break;
13960 }
13961 }
13962 if (!Match) {
13963 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13964 DWordPairs.push_back(std::make_pair(M0, M1));
13965 }
13966 }
13967
13968 if (DWordPairs.size() <= 2) {
13969 DWordPairs.resize(2, std::make_pair(-1, -1));
13970 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13971 DWordPairs[1].first, DWordPairs[1].second};
13972 // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
13973 if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
13974 ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
13975 int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
13976 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
13977 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
13978 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
13979 }
13980 if ((NumHToL + NumHToH) == 0)
13981 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13982 if ((NumLToL + NumLToH) == 0)
13983 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13984 }
13985 }
13986
13987 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13988 // such inputs we can swap two of the dwords across the half mark and end up
13989 // with <=2 inputs to each half in each half. Once there, we can fall through
13990 // to the generic code below. For example:
13991 //
13992 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13993 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13994 //
13995 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13996 // and an existing 2-into-2 on the other half. In this case we may have to
13997 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13998 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13999 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14000 // because any other situation (including a 3-into-1 or 1-into-3 in the other
14001 // half than the one we target for fixing) will be fixed when we re-enter this
14002 // path. We will also combine away any sequence of PSHUFD instructions that
14003 // result into a single instruction. Here is an example of the tricky case:
14004 //
14005 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14006 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14007 //
14008 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14009 //
14010 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14011 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14012 //
14013 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14014 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14015 //
14016 // The result is fine to be handled by the generic logic.
14017 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14018 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14019 int AOffset, int BOffset) {
14020 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14021 "Must call this with A having 3 or 1 inputs from the A half.");
14022 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14023 "Must call this with B having 1 or 3 inputs from the B half.");
14024 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14025 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14026
14027 bool ThreeAInputs = AToAInputs.size() == 3;
14028
14029 // Compute the index of dword with only one word among the three inputs in
14030 // a half by taking the sum of the half with three inputs and subtracting
14031 // the sum of the actual three inputs. The difference is the remaining
14032 // slot.
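// For example, if the three A inputs are {0, 1, 3} with AOffset == 0, then
// TripleInputSum = 6 and TripleNonInputIdx = 6 - 4 = 2, so TripleDWord = 1.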
14033 int ADWord = 0, BDWord = 0;
14034 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14035 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14036 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14037 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14038 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14039 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14040 int TripleNonInputIdx =
14041 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14042 TripleDWord = TripleNonInputIdx / 2;
14043
14044 // We use xor with one to compute the adjacent DWord to whichever one the
14045 // OneInput is in.
14046 OneInputDWord = (OneInput / 2) ^ 1;
14047
14048 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14049 // and BToA inputs. If there is also such a problem with the BToB and AToB
14050 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14051 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14052 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14053 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14054 // Compute how many inputs will be flipped by swapping these DWords. We need
14055 // to balance this to ensure we don't form a 3-1 shuffle in the other
14056 // half.
14058 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14059 llvm::count(AToBInputs, 2 * ADWord + 1);
14060 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14061 llvm::count(BToBInputs, 2 * BDWord + 1);
14062 if ((NumFlippedAToBInputs == 1 &&
14063 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14064 (NumFlippedBToBInputs == 1 &&
14065 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14066 // We choose whether to fix the A half or B half based on whether that
14067 // half has zero flipped inputs. At zero, we may not be able to fix it
14068 // with that half. We also bias towards fixing the B half because that
14069 // will more commonly be the high half, and we have to bias one way.
14070 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14071 ArrayRef<int> Inputs) {
14072 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14073 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14074 // Determine whether the free index is in the flipped dword or the
14075 // unflipped dword based on where the pinned index is. We use this bit
14076 // in an xor to conditionally select the adjacent dword.
14077 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14078 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14079 if (IsFixIdxInput == IsFixFreeIdxInput)
14080 FixFreeIdx += 1;
14081 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14082 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14083 "We need to be changing the number of flipped inputs!");
14084 int PSHUFHalfMask[] = {0, 1, 2, 3};
14085 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14086 V = DAG.getNode(
14087 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14088 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14089 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14090
14091 for (int &M : Mask)
14092 if (M >= 0 && M == FixIdx)
14093 M = FixFreeIdx;
14094 else if (M >= 0 && M == FixFreeIdx)
14095 M = FixIdx;
14096 };
14097 if (NumFlippedBToBInputs != 0) {
14098 int BPinnedIdx =
14099 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14100 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14101 } else {
14102 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14103 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14104 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14105 }
14106 }
14107 }
14108
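// Swap the chosen A and B dwords across the half boundary with a single
// PSHUFD, then remap the mask below to the elements' new locations.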
14109 int PSHUFDMask[] = {0, 1, 2, 3};
14110 PSHUFDMask[ADWord] = BDWord;
14111 PSHUFDMask[BDWord] = ADWord;
14112 V = DAG.getBitcast(
14113 VT,
14114 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14115 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14116
14117 // Adjust the mask to match the new locations of A and B.
14118 for (int &M : Mask)
14119 if (M >= 0 && M/2 == ADWord)
14120 M = 2 * BDWord + M % 2;
14121 else if (M >= 0 && M/2 == BDWord)
14122 M = 2 * ADWord + M % 2;
14123
14124 // Recurse back into this routine to re-compute state now that this isn't
14125 // a 3 and 1 problem.
14126 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14127 };
14128 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14129 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14130 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14131 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14132
14133 // At this point there are at most two inputs to the low and high halves from
14134 // each half. That means the inputs can always be grouped into dwords and
14135 // those dwords can then be moved to the correct half with a dword shuffle.
14136 // We use at most one low and one high word shuffle to collect these paired
14137 // inputs into dwords, and finally a dword shuffle to place them.
14138 int PSHUFLMask[4] = {-1, -1, -1, -1};
14139 int PSHUFHMask[4] = {-1, -1, -1, -1};
14140 int PSHUFDMask[4] = {-1, -1, -1, -1};
14141
14142 // First fix the masks for all the inputs that are staying in their
14143 // original halves. This will then dictate the targets of the cross-half
14144 // shuffles.
14145 auto fixInPlaceInputs =
14146 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14147 MutableArrayRef<int> SourceHalfMask,
14148 MutableArrayRef<int> HalfMask, int HalfOffset) {
14149 if (InPlaceInputs.empty())
14150 return;
14151 if (InPlaceInputs.size() == 1) {
14152 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14153 InPlaceInputs[0] - HalfOffset;
14154 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14155 return;
14156 }
14157 if (IncomingInputs.empty()) {
14158 // Just fix all of the in place inputs.
14159 for (int Input : InPlaceInputs) {
14160 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14161 PSHUFDMask[Input / 2] = Input / 2;
14162 }
14163 return;
14164 }
14165
14166 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14167 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14168 InPlaceInputs[0] - HalfOffset;
14169 // Put the second input next to the first so that they are packed into
14170 // a dword. We find the adjacent index by toggling the low bit.
14171 int AdjIndex = InPlaceInputs[0] ^ 1;
14172 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14173 llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
14174 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14175 };
14176 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14177 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14178
14179 // Now gather the cross-half inputs and place them into a free dword of
14180 // their target half.
14181 // FIXME: This operation could almost certainly be simplified dramatically to
14182 // look more like the 3-1 fixing operation.
14183 auto moveInputsToRightHalf = [&PSHUFDMask](
14184 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14185 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14186 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14187 int DestOffset) {
14188 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14189 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14190 };
14191 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14192 int Word) {
14193 int LowWord = Word & ~1;
14194 int HighWord = Word | 1;
14195 return isWordClobbered(SourceHalfMask, LowWord) ||
14196 isWordClobbered(SourceHalfMask, HighWord);
14197 };
14198
14199 if (IncomingInputs.empty())
14200 return;
14201
14202 if (ExistingInputs.empty()) {
14203 // Map any dwords with inputs from them into the right half.
14204 for (int Input : IncomingInputs) {
14205 // If the source half mask maps over the inputs, turn those into
14206 // swaps and use the swapped lane.
14207 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14208 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14209 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14210 Input - SourceOffset;
14211 // We have to swap the uses in our half mask in one sweep.
14212 for (int &M : HalfMask)
14213 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14214 M = Input;
14215 else if (M == Input)
14216 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14217 } else {
14218 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14219 Input - SourceOffset &&
14220 "Previous placement doesn't match!");
14221 }
14222 // Note that this correctly re-maps both when we do a swap and when
14223 // we observe the other side of the swap above. We rely on that to
14224 // avoid swapping the members of the input list directly.
14225 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14226 }
14227
14228 // Map the input's dword into the correct half.
14229 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14230 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14231 else
14232 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14233 Input / 2 &&
14234 "Previous placement doesn't match!");
14235 }
14236
14237 // And just directly shift any other-half mask elements to be same-half
14238 // as we will have mirrored the dword containing the element into the
14239 // same position within that half.
14240 for (int &M : HalfMask)
14241 if (M >= SourceOffset && M < SourceOffset + 4) {
14242 M = M - SourceOffset + DestOffset;
14243 assert(M >= 0 && "This should never wrap below zero!");
14244 }
14245 return;
14246 }
14247
14248 // Ensure we have the input in a viable dword of its current half. This
14249 // is particularly tricky because the original position may be clobbered
14250 // by inputs being moved and *staying* in that half.
14251 if (IncomingInputs.size() == 1) {
14252 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14253 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14254 SourceOffset;
14255 SourceHalfMask[InputFixed - SourceOffset] =
14256 IncomingInputs[0] - SourceOffset;
14257 llvm::replace(HalfMask, IncomingInputs[0], InputFixed);
14258 IncomingInputs[0] = InputFixed;
14259 }
14260 } else if (IncomingInputs.size() == 2) {
14261 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14262 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14263 // We have two non-adjacent or clobbered inputs we need to extract from
14264 // the source half. To do this, we need to map them into some adjacent
14265 // dword slot in the source mask.
14266 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14267 IncomingInputs[1] - SourceOffset};
14268
14269 // If there is a free slot in the source half mask adjacent to one of
14270 // the inputs, place the other input in it. We use (Index XOR 1) to
14271 // compute an adjacent index.
14272 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14273 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14274 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14275 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14276 InputsFixed[1] = InputsFixed[0] ^ 1;
14277 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14278 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14279 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14280 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14281 InputsFixed[0] = InputsFixed[1] ^ 1;
14282 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14283 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14284 // The two inputs are in the same DWord but it is clobbered and the
14285 // adjacent DWord isn't used at all. Move both inputs to the free
14286 // slot.
14287 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14288 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14289 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14290 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14291 } else {
14292 // The only way we hit this point is if there is no clobbering
14293 // (because there are no off-half inputs to this half) and there is no
14294 // free slot adjacent to one of the inputs. In this case, we have to
14295 // swap an input with a non-input.
14296 for (int i = 0; i < 4; ++i)
14297 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14298 "We can't handle any clobbers here!");
14299 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14300 "Cannot have adjacent inputs here!");
14301
14302 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14303 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14304
14305 // We also have to update the final source mask in this case because
14306 // it may need to undo the above swap.
14307 for (int &M : FinalSourceHalfMask)
14308 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14309 M = InputsFixed[1] + SourceOffset;
14310 else if (M == InputsFixed[1] + SourceOffset)
14311 M = (InputsFixed[0] ^ 1) + SourceOffset;
14312
14313 InputsFixed[1] = InputsFixed[0] ^ 1;
14314 }
14315
14316 // Point everything at the fixed inputs.
14317 for (int &M : HalfMask)
14318 if (M == IncomingInputs[0])
14319 M = InputsFixed[0] + SourceOffset;
14320 else if (M == IncomingInputs[1])
14321 M = InputsFixed[1] + SourceOffset;
14322
14323 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14324 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14325 }
14326 } else {
14327 llvm_unreachable("Unhandled input size!");
14328 }
14329
14330 // Now hoist the DWord down to the right half.
14331 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14332 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14333 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14334 for (int &M : HalfMask)
14335 for (int Input : IncomingInputs)
14336 if (M == Input)
14337 M = FreeDWord * 2 + Input % 2;
14338 };
14339 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14340 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14341 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14342 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14343
14344 // Now enact all the shuffles we've computed to move the inputs into their
14345 // target half.
14346 if (!isNoopShuffleMask(PSHUFLMask))
14347 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14348 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14349 if (!isNoopShuffleMask(PSHUFHMask))
14350 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14351 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14352 if (!isNoopShuffleMask(PSHUFDMask))
14353 V = DAG.getBitcast(
14354 VT,
14355 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14356 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14357
14358 // At this point, each half should contain all its inputs, and we can then
14359 // just shuffle them into their final position.
14360 assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
14361 "Failed to lift all the high half inputs to the low mask!");
14362 assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14363 "Failed to lift all the low half inputs to the high mask!");
14364
14365 // Do a half shuffle for the low mask.
14366 if (!isNoopShuffleMask(LoMask))
14367 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14368 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14369
14370 // Do a half shuffle with the high mask after shifting its values down.
14371 for (int &M : HiMask)
14372 if (M >= 0)
14373 M -= 4;
14374 if (!isNoopShuffleMask(HiMask))
14375 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14376 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14377
14378 return V;
14379}
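
// Illustrative sketch, added for exposition and not part of the original
// file: the 8-bit immediate encoding behind the getV4X86ShuffleImm8ForMask
// calls above. Each destination slot gets a 2-bit field selecting one of the
// four source elements; treating undef slots as identity is an assumption of
// this sketch, and the helper name is hypothetical.
static int sketchV4ShuffleImm8(const int Mask[4]) {
  int Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? i : (Mask[i] & 0x3); // Undef slot -> keep in place.
    Imm |= M << (i * 2);                       // 2 bits per destination slot.
  }
  return Imm;
}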
14380
14381/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14382/// blend if only one input is used.
14383static SDValue lowerShuffleAsBlendOfPSHUFBs(
14384 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14385 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14387 "Lane crossing shuffle masks not supported");
14388
14389 int NumBytes = VT.getSizeInBits() / 8;
14390 int Size = Mask.size();
14391 int Scale = NumBytes / Size;
14392
14393 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14394 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14395 V1InUse = false;
14396 V2InUse = false;
14397
14398 for (int i = 0; i < NumBytes; ++i) {
14399 int M = Mask[i / Scale];
14400 if (M < 0)
14401 continue;
14402
14403 const int ZeroMask = 0x80;
14404 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14405 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14406 if (Zeroable[i / Scale])
14407 V1Idx = V2Idx = ZeroMask;
14408
14409 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14410 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14411 V1InUse |= (ZeroMask != V1Idx);
14412 V2InUse |= (ZeroMask != V2Idx);
14413 }
14414
14415 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14416 if (V1InUse)
14417 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14418 DAG.getBuildVector(ShufVT, DL, V1Mask));
14419 if (V2InUse)
14420 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14421 DAG.getBuildVector(ShufVT, DL, V2Mask));
14422
14423 // If we need shuffled inputs from both, blend the two.
14424 SDValue V;
14425 if (V1InUse && V2InUse)
14426 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14427 else
14428 V = V1InUse ? V1 : V2;
14429
14430 // Cast the result back to the correct type.
14431 return DAG.getBitcast(VT, V);
14432}
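
// Illustrative sketch, added for exposition and not part of the original
// file: how the per-byte PSHUFB control values above are derived from a
// wider-element mask, here fixed to a v8i16 shape. A control byte of 0x80
// zeroes the destination byte, values 0..15 select that byte of the source,
// and -1 marks a byte left undefined. The helper name is hypothetical.
static void sketchPshufbControls(const int Mask[8], int V1Ctl[16],
                                 int V2Ctl[16]) {
  const int ZeroMask = 0x80;     // PSHUFB "write zero" selector.
  const int Size = 8, Scale = 2; // 8 mask elements, 2 bytes per element.
  for (int i = 0; i < 16; ++i) {
    int M = Mask[i / Scale];
    if (M < 0) {
      V1Ctl[i] = V2Ctl[i] = -1; // Undefined element stays undefined.
      continue;
    }
    // A byte sourced from V1 gets a real index in V1's control and a zeroing
    // selector in V2's control, and vice versa; OR-ing the two PSHUFB results
    // then produces the blended vector.
    V1Ctl[i] = M < Size ? M * Scale + i % Scale : ZeroMask;
    V2Ctl[i] = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
  }
}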
14433
14434/// Generic lowering of 8-lane i16 shuffles.
14435///
14436/// This handles both single-input shuffles and combined shuffle/blends with
14437/// two inputs. The single input shuffles are immediately delegated to
14438/// a dedicated lowering routine.
14439///
14440/// The blends are lowered in one of three fundamental ways. If there are few
14441/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14442/// of the input is significantly cheaper when lowered as an interleaving of
14443/// the two inputs, try to interleave them. Otherwise, blend the low and high
14444/// halves of the inputs separately (making them have relatively few inputs)
14445/// and then concatenate them.
14446static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14447 const APInt &Zeroable, SDValue V1, SDValue V2,
14448 const X86Subtarget &Subtarget,
14449 SelectionDAG &DAG) {
14450 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14451 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14452 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14453
14454 // Whenever we can lower this as a zext, that instruction is strictly faster
14455 // than any alternative.
14456 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14457 Zeroable, Subtarget, DAG))
14458 return ZExt;
14459
14460 // Try to lower using a truncation.
14461 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14462 Subtarget, DAG))
14463 return V;
14464
14465 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14466
14467 if (NumV2Inputs == 0) {
14468 // Try to use shift instructions.
14469 if (SDValue Shift =
14470 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14471 Subtarget, DAG, /*BitwiseOnly*/ false))
14472 return Shift;
14473
14474 // Check for being able to broadcast a single element.
14475 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14476 Mask, Subtarget, DAG))
14477 return Broadcast;
14478
14479 // Try to use bit rotation instructions.
14480 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14481 Subtarget, DAG))
14482 return Rotate;
14483
14484 // Use dedicated unpack instructions for masks that match their pattern.
14485 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14486 return V;
14487
14488 // Use dedicated pack instructions for masks that match their pattern.
14489 if (SDValue V =
14490 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14491 return V;
14492
14493 // Try to use byte rotation instructions.
14494 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14495 Subtarget, DAG))
14496 return Rotate;
14497
14498 // Make a copy of the mask so it can be modified.
14499 SmallVector<int, 8> MutableMask(Mask);
14500 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14501 Subtarget, DAG);
14502 }
14503
14504 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14505 "All single-input shuffles should be canonicalized to be V1-input "
14506 "shuffles.");
14507
14508 // Try to use shift instructions.
14509 if (SDValue Shift =
14510 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14511 DAG, /*BitwiseOnly*/ false))
14512 return Shift;
14513
14514 // See if we can use SSE4A Extraction / Insertion.
14515 if (Subtarget.hasSSE4A())
14516 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14517 Zeroable, DAG))
14518 return V;
14519
14520 // There are special ways we can lower some single-element blends.
14521 if (NumV2Inputs == 1)
14522 if (SDValue V = lowerShuffleAsElementInsertion(
14523 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14524 return V;
14525
14526 // We have different paths for blend lowering, but they all must use the
14527 // *exact* same predicate.
14528 bool IsBlendSupported = Subtarget.hasSSE41();
14529 if (IsBlendSupported)
14530 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14531 Zeroable, Subtarget, DAG))
14532 return Blend;
14533
14534 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14535 Zeroable, Subtarget, DAG))
14536 return Masked;
14537
14538 // Use dedicated unpack instructions for masks that match their pattern.
14539 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14540 return V;
14541
14542 // Use dedicated pack instructions for masks that match their pattern.
14543 if (SDValue V =
14544 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14545 return V;
14546
14547 // Try to lower using a truncation.
14548 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14549 Subtarget, DAG))
14550 return V;
14551
14552 // Try to use byte rotation instructions.
14553 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14554 Subtarget, DAG))
14555 return Rotate;
14556
14557 if (SDValue BitBlend =
14558 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14559 return BitBlend;
14560
14561 // Try to use byte shift instructions to mask.
14562 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14563 Zeroable, Subtarget, DAG))
14564 return V;
14565
14566 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
14567 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
14568 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14569 !Subtarget.hasVLX()) {
14570 // Check if this is part of a 256-bit vector truncation.
14571 unsigned PackOpc = 0;
14572 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14573 V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14574 V2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
14575 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14576 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14577 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14578 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14579 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14580 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14581 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14582 PackOpc = X86ISD::PACKUS;
14583 } else if (Subtarget.hasSSE41()) {
14584 SmallVector<SDValue, 4> DWordClearOps(4,
14585 DAG.getConstant(0, DL, MVT::i32));
14586 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14587 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14588 SDValue DWordClearMask =
14589 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14590 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14591 DWordClearMask);
14592 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14593 DWordClearMask);
14594 PackOpc = X86ISD::PACKUS;
14595 } else if (!Subtarget.hasSSSE3()) {
14596 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14597 V1 = DAG.getBitcast(MVT::v4i32, V1);
14598 V2 = DAG.getBitcast(MVT::v4i32, V2);
14599 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14600 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14601 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14602 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14603 PackOpc = X86ISD::PACKSS;
14604 }
14605 if (PackOpc) {
14606 // Now pack things back together.
14607 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14608 if (NumEvenDrops == 2) {
14609 Result = DAG.getBitcast(MVT::v4i32, Result);
14610 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14611 }
14612 return Result;
14613 }
14614 }
14615
14616 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14617 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14618 if (NumOddDrops == 1) {
14619 bool HasSSE41 = Subtarget.hasSSE41();
14620 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14621 DAG.getBitcast(MVT::v4i32, V1),
14622 DAG.getTargetConstant(16, DL, MVT::i8));
14623 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14624 DAG.getBitcast(MVT::v4i32, V2),
14625 DAG.getTargetConstant(16, DL, MVT::i8));
14626 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14627 MVT::v8i16, V1, V2);
14628 }
14629
14630 // Try to lower by permuting the inputs into an unpack instruction.
14631 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14632 Mask, Subtarget, DAG))
14633 return Unpack;
14634
14635 // If we can't directly blend but can use PSHUFB, that will be better as it
14636 // can both shuffle and set up the inefficient blend.
14637 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14638 bool V1InUse, V2InUse;
14639 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14640 Zeroable, DAG, V1InUse, V2InUse);
14641 }
14642
14643 // We can always bit-blend if we have to, so the fallback strategy is to
14644 // decompose into single-input permutes and blends/unpacks.
14645 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14646 Zeroable, Subtarget, DAG);
14647}
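
// Illustrative sketch, added for exposition and not part of the original
// file: a scalar model of the AND + PACKUS compaction used above for a
// single even-element drop. Clearing the high 16 bits of every dword makes
// unsigned-saturating packing an exact truncation, so the result is the even
// i16 elements of V1 followed by those of V2. Names are hypothetical.
static void sketchPackusCompaction(const unsigned V1[4], const unsigned V2[4],
                                   unsigned short Out[8]) {
  for (int i = 0; i < 4; ++i) {
    Out[i] = (unsigned short)(V1[i] & 0xFFFFu);     // Even i16 elements of V1.
    Out[i + 4] = (unsigned short)(V2[i] & 0xFFFFu); // Even i16 elements of V2.
  }
}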
14648
14649/// Lower 8-lane 16-bit floating point shuffles.
14650static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14651 const APInt &Zeroable, SDValue V1, SDValue V2,
14652 const X86Subtarget &Subtarget,
14653 SelectionDAG &DAG) {
14654 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14655 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14656 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14657 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14658
14659 if (Subtarget.hasFP16()) {
14660 if (NumV2Elements == 0) {
14661 // Check for being able to broadcast a single element.
14662 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14663 Mask, Subtarget, DAG))
14664 return Broadcast;
14665 }
14666 if (NumV2Elements == 1 && Mask[0] >= 8)
14667 if (SDValue V = lowerShuffleAsElementInsertion(
14668 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14669 return V;
14670 }
14671
14672 V1 = DAG.getBitcast(MVT::v8i16, V1);
14673 V2 = DAG.getBitcast(MVT::v8i16, V2);
14674 return DAG.getBitcast(MVT::v8f16,
14675 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14676}
14677
14678// Lowers unary/binary shuffle as VPERMV/VPERMV3; for non-VLX targets,
14679// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
14680// the active subvector is extracted.
14681static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14682 ArrayRef<int> OriginalMask, SDValue V1,
14683 SDValue V2, const X86Subtarget &Subtarget,
14684 SelectionDAG &DAG) {
14685 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14686 SmallVector<int, 32> Mask(OriginalMask);
14687 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14688 !isShuffleFoldableLoad(V2)) {
14689 ShuffleVectorSDNode::commuteMask(Mask);
14690 std::swap(V1, V2);
14691 }
14692
14693 MVT MaskVT = VT.changeTypeToInteger();
14694 SDValue MaskNode;
14695 MVT ShuffleVT = VT;
14696 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14697 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14698 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14699 ShuffleVT = V1.getSimpleValueType();
14700
14701 // Adjust mask to correct indices for the second input.
14702 int NumElts = VT.getVectorNumElements();
14703 unsigned Scale = 512 / VT.getSizeInBits();
14704 SmallVector<int, 32> AdjustedMask(Mask);
14705 for (int &M : AdjustedMask)
14706 if (NumElts <= M)
14707 M += (Scale - 1) * NumElts;
14708 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14709 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14710 } else {
14711 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14712 }
14713
14714 SDValue Result;
14715 if (V2.isUndef())
14716 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14717 else
14718 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14719
14720 if (VT != ShuffleVT)
14721 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14722
14723 return Result;
14724}
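
// Illustrative sketch, added for exposition and not part of the original
// file: the mask adjustment above when a sub-512-bit VPERMV3 is padded to
// 512 bits. Indices below NumElts select from V1 and keep their positions;
// indices that selected from V2 must be rebased because V2 now starts at
// Scale * NumElts rather than NumElts. The helper name is hypothetical.
static void sketchRebasePermv3Mask(int Mask[], int NumMaskElts, int NumElts,
                                   int Scale) {
  for (int i = 0; i < NumMaskElts; ++i)
    if (Mask[i] >= NumElts)
      Mask[i] += (Scale - 1) * NumElts; // Shift V2 indices past the padding.
}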
14725
14726/// Generic lowering of v16i8 shuffles.
14727///
14728/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14729/// detect any complexity reducing interleaving. If that doesn't help, it uses
14730/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14731/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14732/// back together.
14733static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14734 const APInt &Zeroable, SDValue V1, SDValue V2,
14735 const X86Subtarget &Subtarget,
14736 SelectionDAG &DAG) {
14737 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14738 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14739 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14740
14741 // Try to use shift instructions.
14742 if (SDValue Shift =
14743 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14744 DAG, /*BitwiseOnly*/ false))
14745 return Shift;
14746
14747 // Try to use byte rotation instructions.
14748 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14749 Subtarget, DAG))
14750 return Rotate;
14751
14752 // Use dedicated pack instructions for masks that match their pattern.
14753 if (SDValue V =
14754 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14755 return V;
14756
14757 // Try to use a zext lowering.
14758 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14759 Zeroable, Subtarget, DAG))
14760 return ZExt;
14761
14762 // Try to lower using a truncation.
14763 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14764 Subtarget, DAG))
14765 return V;
14766
14767 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14768 Subtarget, DAG))
14769 return V;
14770
14771 // See if we can use SSE4A Extraction / Insertion.
14772 if (Subtarget.hasSSE4A())
14773 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14774 Zeroable, DAG))
14775 return V;
14776
14777 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14778
14779 // For single-input shuffles, there are some nicer lowering tricks we can use.
14780 if (NumV2Elements == 0) {
14781 // Check for being able to broadcast a single element.
14782 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14783 Mask, Subtarget, DAG))
14784 return Broadcast;
14785
14786 // Try to use bit rotation instructions.
14787 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14788 Subtarget, DAG))
14789 return Rotate;
14790
14791 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14792 return V;
14793
14794 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14795 // Notably, this handles splat and partial-splat shuffles more efficiently.
14796 // However, it only makes sense if the pre-duplication shuffle simplifies
14797 // things significantly. Currently, this means we need to be able to
14798 // express the pre-duplication shuffle as an i16 shuffle.
14799 //
14800 // FIXME: We should check for other patterns which can be widened into an
14801 // i16 shuffle as well.
14802 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14803 for (int i = 0; i < 16; i += 2)
14804 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14805 return false;
14806
14807 return true;
14808 };
14809 auto tryToWidenViaDuplication = [&]() -> SDValue {
14810 if (!canWidenViaDuplication(Mask))
14811 return SDValue();
14812 SmallVector<int, 4> LoInputs;
14813 copy_if(Mask, std::back_inserter(LoInputs),
14814 [](int M) { return M >= 0 && M < 8; });
14815 array_pod_sort(LoInputs.begin(), LoInputs.end());
14816 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14817 SmallVector<int, 4> HiInputs;
14818 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14819 array_pod_sort(HiInputs.begin(), HiInputs.end());
14820 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14821
14822 bool TargetLo = LoInputs.size() >= HiInputs.size();
14823 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14824 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14825
14826 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14827 SmallDenseMap<int, int, 8> LaneMap;
14828 for (int I : InPlaceInputs) {
14829 PreDupI16Shuffle[I/2] = I/2;
14830 LaneMap[I] = I;
14831 }
14832 int j = TargetLo ? 0 : 4, je = j + 4;
14833 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14834 // Check if j is already a shuffle of this input. This happens when
14835 // there are two adjacent bytes after we move the low one.
14836 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14837 // If we haven't yet mapped the input, search for a slot into which
14838 // we can map it.
14839 while (j < je && PreDupI16Shuffle[j] >= 0)
14840 ++j;
14841
14842 if (j == je)
14843 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14844 return SDValue();
14845
14846 // Map this input with the i16 shuffle.
14847 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14848 }
14849
14850 // Update the lane map based on the mapping we ended up with.
14851 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14852 }
14853 V1 = DAG.getBitcast(
14854 MVT::v16i8,
14855 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14856 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14857
14858 // Unpack the bytes to form the i16s that will be shuffled into place.
14859 bool EvenInUse = false, OddInUse = false;
14860 for (int i = 0; i < 16; i += 2) {
14861 EvenInUse |= (Mask[i + 0] >= 0);
14862 OddInUse |= (Mask[i + 1] >= 0);
14863 if (EvenInUse && OddInUse)
14864 break;
14865 }
14866 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14867 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14868 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14869
14870 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14871 for (int i = 0; i < 16; ++i)
14872 if (Mask[i] >= 0) {
14873 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14874 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14875 if (PostDupI16Shuffle[i / 2] < 0)
14876 PostDupI16Shuffle[i / 2] = MappedMask;
14877 else
14878 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14879 "Conflicting entries in the original shuffle!");
14880 }
14881 return DAG.getBitcast(
14882 MVT::v16i8,
14883 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14884 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14885 };
14886 if (SDValue V = tryToWidenViaDuplication())
14887 return V;
14888 }
14889
14890 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14891 Zeroable, Subtarget, DAG))
14892 return Masked;
14893
14894 // Use dedicated unpack instructions for masks that match their pattern.
14895 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14896 return V;
14897
14898 // Try to use byte shift instructions to mask.
14899 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14900 Zeroable, Subtarget, DAG))
14901 return V;
14902
14903 // Check for compaction patterns.
14904 bool IsSingleInput = V2.isUndef();
14905 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14906
14907 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14908 // with PSHUFB. It is important to do this before we attempt to generate any
14909 // blends but after all of the single-input lowerings. If the single input
14910 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14911 // want to preserve that and we can DAG combine any longer sequences into
14912 // a PSHUFB in the end. But once we start blending from multiple inputs,
14913 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14914 // and there are *very* few patterns that would actually be faster than the
14915 // PSHUFB approach because of its ability to zero lanes.
14916 //
14917 // If the mask is a binary compaction, we can more efficiently perform this
14918 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14919 //
14920 // FIXME: The only exceptions to the above are blends which are exact
14921 // interleavings with direct instructions supporting them. We currently don't
14922 // handle those well here.
14923 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14924 bool V1InUse = false;
14925 bool V2InUse = false;
14926
14927 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14928 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14929
14930 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14931 // do so. This avoids using them to handle blends-with-zero which is
14932 // important as a single pshufb is significantly faster for that.
14933 if (V1InUse && V2InUse) {
14934 if (Subtarget.hasSSE41())
14935 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14936 Zeroable, Subtarget, DAG))
14937 return Blend;
14938
14939 // We can use an unpack to do the blending rather than an or in some
14940 // cases. Even though the OR may be (very slightly) more efficient, we
14941 // prefer this lowering because there are common cases where part of
14942 // the complexity of the shuffles goes away when we do the final blend as
14943 // an unpack.
14944 // FIXME: It might be worth trying to detect if the unpack-feeding
14945 // shuffles will both be pshufb, in which case we shouldn't bother with
14946 // this.
14947 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14948 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14949 return Unpack;
14950
14951 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14952 if (Subtarget.hasVBMI())
14953 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14954 DAG);
14955
14956 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14957 if (Subtarget.hasXOP()) {
14958 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14959 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14960 }
14961
14962 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14963 // PALIGNR will be cheaper than the second PSHUFB+OR.
14964 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14965 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14966 return V;
14967 }
14968
14969 return PSHUFB;
14970 }
14971
14972 // There are special ways we can lower some single-element blends.
14973 if (NumV2Elements == 1)
14974 if (SDValue V = lowerShuffleAsElementInsertion(
14975 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14976 return V;
14977
14978 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14979 return Blend;
14980
14981 // Check whether a compaction lowering can be done. This handles shuffles
14982 // which take every Nth element for some even N. See the helper function for
14983 // details.
14984 //
14985 // We special case these as they can be particularly efficiently handled with
14986 // the PACKUSWB instruction on x86 and they show up in common patterns of
14987 // rearranging bytes to truncate wide elements.
14988 if (NumEvenDrops) {
14989 // NumEvenDrops is the power of two stride of the elements. Another way of
14990 // thinking about it is that we need to drop the even elements this many
14991 // times to get the original input.
14992
14993 // First we need to zero all the dropped bytes.
14994 assert(NumEvenDrops <= 3 &&
14995 "No support for dropping even elements more than 3 times.");
14996 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14997 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14998 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14999 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15000 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15001 WordClearMask);
15002 if (!IsSingleInput)
15003 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15004 WordClearMask);
15005
15006 // Now pack things back together.
15007 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15008 IsSingleInput ? V1 : V2);
15009 for (int i = 1; i < NumEvenDrops; ++i) {
15010 Result = DAG.getBitcast(MVT::v8i16, Result);
15011 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15012 }
15013 return Result;
15014 }
15015
15016 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
15017 if (NumOddDrops == 1) {
15018 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15019 DAG.getBitcast(MVT::v8i16, V1),
15020 DAG.getTargetConstant(8, DL, MVT::i8));
15021 if (!IsSingleInput)
15022 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15023 DAG.getBitcast(MVT::v8i16, V2),
15024 DAG.getTargetConstant(8, DL, MVT::i8));
15025 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15026 IsSingleInput ? V1 : V2);
15027 }
15028
15029 // Handle multi-input cases by blending/unpacking single-input shuffles.
15030 if (NumV2Elements > 0)
15031 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15032 Zeroable, Subtarget, DAG);
15033
15034 // The fallback path for single-input shuffles widens this into two v8i16
15035 // vectors with unpacks, shuffles those, and then pulls them back together
15036 // with a pack.
15037 SDValue V = V1;
15038
15039 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15040 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15041 for (int i = 0; i < 16; ++i)
15042 if (Mask[i] >= 0)
15043 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15044
15045 SDValue VLoHalf, VHiHalf;
15046 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15047 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15048 // i16s.
15049 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15050 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15051 // Use a mask to drop the high bytes.
15052 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15053 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15054 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15055
15056 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15057 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15058
15059 // Squash the masks to point directly into VLoHalf.
15060 for (int &M : LoBlendMask)
15061 if (M >= 0)
15062 M /= 2;
15063 for (int &M : HiBlendMask)
15064 if (M >= 0)
15065 M /= 2;
15066 } else {
15067 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15068 // VHiHalf so that we can blend them as i16s.
15069 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15070
15071 VLoHalf = DAG.getBitcast(
15072 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15073 VHiHalf = DAG.getBitcast(
15074 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15075 }
15076
15077 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15078 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15079
15080 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15081}
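
// Illustrative sketch, added for exposition and not part of the original
// file: the mask split performed by the fallback path above, which unpacks
// the bytes into two v8i16 halves, shuffles each half, and PACKUSes the
// results back to v16i8. Undefined entries stay -1. The helper name is
// hypothetical.
static void sketchSplitByteMask(const int Mask[16], int LoBlend[8],
                                int HiBlend[8]) {
  for (int i = 0; i < 8; ++i)
    LoBlend[i] = HiBlend[i] = -1;
  for (int i = 0; i < 16; ++i)
    if (Mask[i] >= 0)
      (i < 8 ? LoBlend[i] : HiBlend[i % 8]) = Mask[i]; // Route to lo/hi half.
}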
15082
15083/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15084///
15085/// This routine breaks down the specific type of 128-bit shuffle and
15086/// dispatches to the lowering routines accordingly.
15087static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15088 MVT VT, SDValue V1, SDValue V2,
15089 const APInt &Zeroable,
15090 const X86Subtarget &Subtarget,
15091 SelectionDAG &DAG) {
15092 if (VT == MVT::v8bf16) {
15093 V1 = DAG.getBitcast(MVT::v8i16, V1);
15094 V2 = DAG.getBitcast(MVT::v8i16, V2);
15095 return DAG.getBitcast(VT,
15096 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15097 }
15098
15099 switch (VT.SimpleTy) {
15100 case MVT::v2i64:
15101 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15102 case MVT::v2f64:
15103 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15104 case MVT::v4i32:
15105 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15106 case MVT::v4f32:
15107 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15108 case MVT::v8i16:
15109 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15110 case MVT::v8f16:
15111 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15112 case MVT::v16i8:
15113 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15114
15115 default:
15116 llvm_unreachable("Unimplemented!");
15117 }
15118}
15119
15120/// Generic routine to split vector shuffle into half-sized shuffles.
15121///
15122/// This routine just extracts two subvectors, shuffles them independently, and
15123/// then concatenates them back together. This should work effectively with all
15124/// AVX vector shuffle types.
15125static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15126 SDValue V2, ArrayRef<int> Mask,
15127 SelectionDAG &DAG, bool SimpleOnly) {
15128 assert(VT.getSizeInBits() >= 256 &&
15129 "Only for 256-bit or wider vector shuffles!");
15130 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15131 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15132
15133 // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
15134 if (VT == MVT::v8f32) {
15135 SDValue BC1 = peekThroughBitcasts(V1);
15136 SDValue BC2 = peekThroughBitcasts(V2);
15137 if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
15138 if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
15139 DAG, SimpleOnly))
15140 return DAG.getBitcast(VT, Split);
15141 }
15142 }
15143
15144 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15145 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15146
15147 int NumElements = VT.getVectorNumElements();
15148 int SplitNumElements = NumElements / 2;
15149 MVT ScalarVT = VT.getVectorElementType();
15150 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15151
15152 // Use splitVector/extractSubVector so that split build-vectors just build two
15153 // narrower build vectors. This helps shuffling with splats and zeros.
15154 auto SplitVector = [&](SDValue V) {
15155 SDValue LoV, HiV;
15156 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15157 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15158 DAG.getBitcast(SplitVT, HiV));
15159 };
15160
15161 SDValue LoV1, HiV1, LoV2, HiV2;
15162 std::tie(LoV1, HiV1) = SplitVector(V1);
15163 std::tie(LoV2, HiV2) = SplitVector(V2);
15164
15165 // Now create two 4-way blends of these half-width vectors.
15166 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
15167 bool &UseHiV1, bool &UseLoV2,
15168 bool &UseHiV2) {
15169 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
15170 for (int i = 0; i < SplitNumElements; ++i) {
15171 int M = HalfMask[i];
15172 if (M >= NumElements) {
15173 if (M >= NumElements + SplitNumElements)
15174 UseHiV2 = true;
15175 else
15176 UseLoV2 = true;
15177 } else if (M >= 0) {
15178 if (M >= SplitNumElements)
15179 UseHiV1 = true;
15180 else
15181 UseLoV1 = true;
15182 }
15183 }
15184 };
15185
15186 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
15187 if (!SimpleOnly)
15188 return true;
15189
15190 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15191 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15192
15193 return !(UseHiV1 || UseHiV2);
15194 };
15195
15196 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15197 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15198 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15199 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15200 for (int i = 0; i < SplitNumElements; ++i) {
15201 int M = HalfMask[i];
15202 if (M >= NumElements) {
15203 V2BlendMask[i] = M - NumElements;
15204 BlendMask[i] = SplitNumElements + i;
15205 } else if (M >= 0) {
15206 V1BlendMask[i] = M;
15207 BlendMask[i] = i;
15208 }
15209 }
15210
15211 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15212 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15213
15214 // Because the lowering happens after all combining takes place, we need to
15215 // manually combine these blend masks as much as possible so that we create
15216 // a minimal number of high-level vector shuffle nodes.
15217 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
15218
15219 // First try just blending the halves of V1 or V2.
15220 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15221 return DAG.getUNDEF(SplitVT);
15222 if (!UseLoV2 && !UseHiV2)
15223 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15224 if (!UseLoV1 && !UseHiV1)
15225 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15226
15227 SDValue V1Blend, V2Blend;
15228 if (UseLoV1 && UseHiV1) {
15229 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15230 } else {
15231 // We only use half of V1 so map the usage down into the final blend mask.
15232 V1Blend = UseLoV1 ? LoV1 : HiV1;
15233 for (int i = 0; i < SplitNumElements; ++i)
15234 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15235 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15236 }
15237 if (UseLoV2 && UseHiV2) {
15238 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15239 } else {
15240 // We only use half of V2 so map the usage down into the final blend mask.
15241 V2Blend = UseLoV2 ? LoV2 : HiV2;
15242 for (int i = 0; i < SplitNumElements; ++i)
15243 if (BlendMask[i] >= SplitNumElements)
15244 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15245 }
15246 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15247 };
15248
15249 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
15250 return SDValue();
15251
15252 SDValue Lo = HalfBlend(LoMask);
15253 SDValue Hi = HalfBlend(HiMask);
15254 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15255}
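
// Illustrative sketch, added for exposition and not part of the original
// file: the per-half index remapping performed by HalfBlend above. Each half
// mask is rewritten against the four half-width pieces: V2 elements are
// funnelled through the upper half of the final blend, V1 elements through
// the lower half. The helper name is hypothetical.
static void sketchHalfBlendMasks(const int HalfMask[], int SplitNumElements,
                                 int NumElements, int V1Blend[], int V2Blend[],
                                 int Blend[]) {
  for (int i = 0; i < SplitNumElements; ++i) {
    V1Blend[i] = V2Blend[i] = Blend[i] = -1;
    int M = HalfMask[i];
    if (M >= NumElements) {
      V2Blend[i] = M - NumElements;    // Index into the shuffled V2 halves.
      Blend[i] = SplitNumElements + i; // Take this slot from the V2 side.
    } else if (M >= 0) {
      V1Blend[i] = M;                  // Index into the shuffled V1 halves.
      Blend[i] = i;                    // Take this slot from the V1 side.
    }
  }
}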
15256
15257/// Either split a vector in halves or decompose the shuffles and the
15258/// blend/unpack.
15259///
15260/// This is provided as a good fallback for many lowerings of non-single-input
15261/// shuffles with more than one 128-bit lane. In those cases, we want to select
15262/// between splitting the shuffle into 128-bit components and stitching those
15263/// back together vs. extracting the single-input shuffles and blending those
15264/// results.
15265static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15266 SDValue V2, ArrayRef<int> Mask,
15267 const APInt &Zeroable,
15268 const X86Subtarget &Subtarget,
15269 SelectionDAG &DAG) {
15270 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15271 "shuffles as it could then recurse on itself.");
15272 int Size = Mask.size();
15273
15274 // If this can be modeled as a broadcast of two elements followed by a blend,
15275 // prefer that lowering. This is especially important because broadcasts can
15276 // often fold with memory operands.
15277 auto DoBothBroadcast = [&] {
15278 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15279 for (int M : Mask)
15280 if (M >= Size) {
15281 if (V2BroadcastIdx < 0)
15282 V2BroadcastIdx = M - Size;
15283 else if ((M - Size) != V2BroadcastIdx &&
15284 !IsElementEquivalent(Size, V2, V2, M - Size, V2BroadcastIdx))
15285 return false;
15286 } else if (M >= 0) {
15287 if (V1BroadcastIdx < 0)
15288 V1BroadcastIdx = M;
15289 else if (M != V1BroadcastIdx &&
15290 !IsElementEquivalent(Size, V1, V1, M, V1BroadcastIdx))
15291 return false;
15292 }
15293 return true;
15294 };
15295 if (DoBothBroadcast())
15296 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15297 Subtarget, DAG);
15298
15299 // If the inputs all stem from a single 128-bit lane of each input, then we
15300 // split them rather than blending because the split will decompose to
15301 // unusually few instructions.
15302 int LaneCount = VT.getSizeInBits() / 128;
15303 int LaneSize = Size / LaneCount;
15304 SmallBitVector LaneInputs[2];
15305 LaneInputs[0].resize(LaneCount, false);
15306 LaneInputs[1].resize(LaneCount, false);
15307 for (int i = 0; i < Size; ++i)
15308 if (Mask[i] >= 0)
15309 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15310 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15311 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15312 /*SimpleOnly*/ false);
15313
15314 // Without AVX2, if we can freely split the subvectors then we're better off
15315 // performing half width shuffles.
15316 if (!Subtarget.hasAVX2()) {
15317 SDValue BC1 = peekThroughBitcasts(V1);
15318 SDValue BC2 = peekThroughBitcasts(V2);
15319 bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
15320 DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
15321 bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
15322 DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
15323 if (SplatOrSplitV1 && SplatOrSplitV2)
15324 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15325 /*SimpleOnly*/ false);
15326 }
15327
15328 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15329 // requires that the decomposed single-input shuffles don't end up here.
15330 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15331 Subtarget, DAG);
15332}
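
// Illustrative sketch, added for exposition and not part of the original
// file: the "split is cheap" test above. If every referenced element of each
// input comes from a single 128-bit lane of that input, splitting decomposes
// into unusually few instructions. The helper name is hypothetical.
static bool sketchEachInputUsesOneLane(const int Mask[], int Size,
                                       int LaneSize) {
  int UsedLane[2] = {-1, -1}; // Lane seen so far for V1 (0) and V2 (1).
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int Input = Mask[i] / Size;             // 0 selects V1, 1 selects V2.
    int Lane = (Mask[i] % Size) / LaneSize; // 128-bit lane within that input.
    if (UsedLane[Input] < 0)
      UsedLane[Input] = Lane;
    else if (UsedLane[Input] != Lane)
      return false; // This input is referenced from two different lanes.
  }
  return true;
}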
15333
15334// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15335// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15336static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15337 SDValue V1, SDValue V2,
15338 ArrayRef<int> Mask,
15339 SelectionDAG &DAG) {
15340 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15341
15342 int LHSMask[4] = {-1, -1, -1, -1};
15343 int RHSMask[4] = {-1, -1, -1, -1};
15344 int SHUFPDMask[4] = {-1, -1, -1, -1};
15345
15346 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15347 // perform the shuffle once the lanes have been shuffled in place.
15348 for (int i = 0; i != 4; ++i) {
15349 int M = Mask[i];
15350 if (M < 0)
15351 continue;
15352 int LaneBase = i & ~1;
15353 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15354 LaneMask[LaneBase + (M & 1)] = M;
15355 SHUFPDMask[i] = M & 1;
15356 }
15357
15358 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15359 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15360 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15361 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15362}
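
// Illustrative sketch, added for exposition and not part of the original
// file: how the SHUFPD selector above is derived. Even result positions read
// from the LHS shuffle and odd positions from the RHS shuffle, and the
// immediate bit for result position i is just the low bit of its source
// index. The helper name is hypothetical.
static int sketchShufpdImm(const int Mask[4]) {
  int Imm = 0;
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0)
      Imm |= (Mask[i] & 1) << i; // Bit i: take the high (1) or low (0) double.
  return Imm;
}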
15363
15364/// Lower a vector shuffle crossing multiple 128-bit lanes as
15365/// a lane permutation followed by a per-lane permutation.
15366///
15367/// This is mainly for cases where we can have non-repeating permutes
15368/// in each lane.
15369///
15370/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15371/// we should investigate merging them.
15372static SDValue lowerShuffleAsLanePermuteAndPermute(
15373 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15374 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15375 int NumElts = VT.getVectorNumElements();
15376 int NumLanes = VT.getSizeInBits() / 128;
15377 int NumEltsPerLane = NumElts / NumLanes;
15378 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15379
15380 /// Attempts to find a sublane permute with the given size
15381 /// that gets all elements into their target lanes.
15382 ///
15383 /// If successful, fills CrossLaneMask and InLaneMask and returns the lowered
15384 /// shuffle; if unsuccessful, returns SDValue() and may overwrite InLaneMask.
15385 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15386 int NumSublanesPerLane = NumSublanes / NumLanes;
15387 int NumEltsPerSublane = NumElts / NumSublanes;
15388
15389 SmallVector<int, 16> CrossLaneMask;
15390 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15391 // CrossLaneMask but one entry == one sublane.
15392 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15393 APInt DemandedCrossLane = APInt::getZero(NumElts);
15394
15395 for (int i = 0; i != NumElts; ++i) {
15396 int M = Mask[i];
15397 if (M < 0)
15398 continue;
15399
15400 int SrcSublane = M / NumEltsPerSublane;
15401 int DstLane = i / NumEltsPerLane;
15402
15403 // We only need to get the elements into the right lane, not sublane.
15404 // So search all sublanes that make up the destination lane.
15405 bool Found = false;
15406 int DstSubStart = DstLane * NumSublanesPerLane;
15407 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15408 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15409 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15410 continue;
15411
15412 Found = true;
15413 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15414 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15415 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15416 DemandedCrossLane.setBit(InLaneMask[i]);
15417 break;
15418 }
15419 if (!Found)
15420 return SDValue();
15421 }
15422
15423 // Fill CrossLaneMask using CrossLaneMaskLarge.
15424 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15425
15426 if (!CanUseSublanes) {
15427 // If we're only shuffling a single lowest lane and the rest are identity
15428 // then don't bother.
15429 // TODO - isShuffleMaskInputInPlace could be extended to something like
15430 // this.
15431 int NumIdentityLanes = 0;
15432 bool OnlyShuffleLowestLane = true;
15433 for (int i = 0; i != NumLanes; ++i) {
15434 int LaneOffset = i * NumEltsPerLane;
15435 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15436 i * NumEltsPerLane))
15437 NumIdentityLanes++;
15438 else if (CrossLaneMask[LaneOffset] != 0)
15439 OnlyShuffleLowestLane = false;
15440 }
15441 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15442 return SDValue();
15443 }
15444
15445 // Simplify CrossLaneMask based on the actual demanded elements.
15446 if (V1.hasOneUse())
15447 for (int i = 0; i != NumElts; ++i)
15448 if (!DemandedCrossLane[i])
15449 CrossLaneMask[i] = SM_SentinelUndef;
15450
15451 // Avoid returning the same shuffle operation. For example,
15452 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15453 // undef:v16i16
15454 if (CrossLaneMask == Mask || InLaneMask == Mask)
15455 return SDValue();
15456
15457 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15458 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15459 InLaneMask);
15460 };
15461
15462 // First attempt a solution with full lanes.
15463 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15464 return V;
15465
15466 // The rest of the solutions use sublanes.
15467 if (!CanUseSublanes)
15468 return SDValue();
15469
15470 // Then attempt a solution with 64-bit sublanes (vpermq).
15471 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15472 return V;
15473
15474 // If that doesn't work and we have fast variable cross-lane shuffle,
15475 // attempt 32-bit sublanes (vpermd).
15476 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15477 return SDValue();
15478
15479 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15480}
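
// Illustrative sketch, added for exposition and not part of the original
// file: the per-element bookkeeping of getSublanePermute above. Each element
// only needs its source sublane moved into the destination lane; the in-lane
// permute then places it within that lane. Returns false when no sublane slot
// of the destination lane can host the required source sublane. Names are
// hypothetical.
static bool sketchSublanePermute(const int Mask[], int NumElts, int NumLanes,
                                 int NumSublanes, int CrossLaneLarge[],
                                 int InLane[]) {
  int NumEltsPerLane = NumElts / NumLanes;
  int NumEltsPerSublane = NumElts / NumSublanes;
  int NumSublanesPerLane = NumSublanes / NumLanes;
  for (int i = 0; i < NumSublanes; ++i)
    CrossLaneLarge[i] = -1;
  for (int i = 0; i < NumElts; ++i)
    InLane[i] = -1;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int SrcSublane = M / NumEltsPerSublane;
    int DstLane = i / NumEltsPerLane;
    bool Found = false;
    int DstSubStart = DstLane * NumSublanesPerLane;
    for (int Dst = DstSubStart; Dst != DstSubStart + NumSublanesPerLane;
         ++Dst) {
      if (CrossLaneLarge[Dst] >= 0 && CrossLaneLarge[Dst] != SrcSublane)
        continue; // Slot already carries a different source sublane.
      CrossLaneLarge[Dst] = SrcSublane;
      InLane[i] = Dst * NumEltsPerSublane + M % NumEltsPerSublane;
      Found = true;
      break;
    }
    if (!Found)
      return false;
  }
  return true;
}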
15481
15482/// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
15483static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15484 SmallVector<int> &InLaneMask) {
15485 int Size = Mask.size();
15486 InLaneMask.assign(Mask.begin(), Mask.end());
15487 for (int i = 0; i < Size; ++i) {
15488 int &M = InLaneMask[i];
15489 if (M < 0)
15490 continue;
15491 if (((M % Size) / LaneSize) != (i / LaneSize))
15492 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15493 }
15494}
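
// Illustrative sketch, added for exposition and not part of the original
// file (the demo function is hypothetical): a concrete instance of the
// remapping above for an eight-element mask with LaneSize = 4. Cross-lane
// elements are redirected to the lane-flipped operand, i.e. their indices
// are offset by Mask.size().
static void sketchComputeInLaneShuffleMaskDemo() {
  const int Mask[8] = {4, 5, 0, 1, 2, 3, 6, 7};
  SmallVector<int> InLaneMask;
  computeInLaneShuffleMask(ArrayRef<int>(Mask), /*LaneSize=*/4, InLaneMask);
  // InLaneMask is now {8, 9, 0, 1, 14, 15, 6, 7}: positions 0,1 and 4,5 read
  // from the flipped operand (indices >= 8), the rest stay in place.
  (void)InLaneMask;
}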
15495
15496/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15497/// source with a lane permutation.
15498///
15499/// This lowering strategy results in four instructions in the worst case for a
15500/// single-input cross lane shuffle which is lower than any other fully general
15501/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15502/// shuffle pattern should be handled prior to trying this lowering.
15503 static SDValue lowerShuffleAsLanePermuteAndShuffle(
15504     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15505 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15506 // FIXME: This should probably be generalized for 512-bit vectors as well.
15507 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15508 int Size = Mask.size();
15509 int LaneSize = Size / 2;
15510
15511 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15512 // Only do this if the elements aren't all from the lower lane,
15513 // otherwise we're (probably) better off doing a split.
15514 if (VT == MVT::v4f64 &&
15515 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15516 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
15517
15518 // If there are only inputs from one 128-bit lane, splitting will in fact be
15519 // less expensive. The flags track whether the given lane contains an element
15520 // that crosses to another lane.
15521 bool AllLanes;
15522 if (!Subtarget.hasAVX2()) {
15523 bool LaneCrossing[2] = {false, false};
15524 for (int i = 0; i < Size; ++i)
15525 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15526 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15527 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15528 } else {
15529 bool LaneUsed[2] = {false, false};
15530 for (int i = 0; i < Size; ++i)
15531 if (Mask[i] >= 0)
15532 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15533 AllLanes = LaneUsed[0] && LaneUsed[1];
15534 }
15535
15536 // TODO - we could support shuffling V2 in the Flipped input.
15537 assert(V2.isUndef() &&
15538 "This last part of this routine only works on single input shuffles");
15539
15540 SmallVector<int> InLaneMask;
15541 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15542
15543 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15544 "In-lane shuffle mask expected");
15545
15546   // If we're not using both source lanes and the in-lane mask is not
15547   // repeating, then we're better off splitting.
15548 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
15549 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15550 /*SimpleOnly*/ false);
15551
15552 // Flip the lanes, and shuffle the results which should now be in-lane.
15553 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15554 SDValue Flipped = DAG.getBitcast(PVT, V1);
15555 Flipped =
15556 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15557 Flipped = DAG.getBitcast(VT, Flipped);
15558 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15559}
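// Illustrative trace for the routine above (hand-worked, not taken from a
// test): for a v8f32 single-input Mask = <0,1,6,7,4,5,2,3>:
//   InLaneMask = <0,1,10,11,4,5,14,15>
//   Flipped    = V1 with its 128-bit lanes swapped (the {2,3,0,1} shuffle on
//                v4f64/v4i64)
// The final shuffle(V1, Flipped, InLaneMask) then blends each lane of V1 with
// the matching lane of the flipped copy without any further lane crossing.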
15560
15561/// Handle lowering 2-lane 128-bit shuffles.
15562 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15563                                   SDValue V2, ArrayRef<int> Mask,
15564 const APInt &Zeroable,
15565 const X86Subtarget &Subtarget,
15566 SelectionDAG &DAG) {
15567 if (V2.isUndef()) {
15568 // Attempt to match VBROADCAST*128 subvector broadcast load.
15569 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
15570 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
15571 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
15573 MVT MemVT = VT.getHalfNumVectorElementsVT();
15574 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
15577 VT, MemVT, Ld, Ofs, DAG))
15578 return BcstLd;
15579 }
15580
15581 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15582 if (Subtarget.hasAVX2())
15583 return SDValue();
15584 }
15585
15586 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15587
15588 SmallVector<int, 4> WidenedMask;
15589 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15590 return SDValue();
15591
15592 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15593 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15594
15595 // Try to use an insert into a zero vector.
15596 if (WidenedMask[0] == 0 && IsHighZero) {
15597 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15598 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15599 DAG.getVectorIdxConstant(0, DL));
15600 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15601 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15602 DAG.getVectorIdxConstant(0, DL));
15603 }
15604
15605   // TODO: If minimizing size and one of the inputs is a zero vector and
15606 // the zero vector has only one use, we could use a VPERM2X128 to save the
15607 // instruction bytes needed to explicitly generate the zero vector.
15608
15609 // Blends are faster and handle all the non-lane-crossing cases.
15610 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15611 Subtarget, DAG))
15612 return Blend;
15613
15614 // If either input operand is a zero vector, use VPERM2X128 because its mask
15615 // allows us to replace the zero input with an implicit zero.
15616 if (!IsLowZero && !IsHighZero) {
15617 // Check for patterns which can be matched with a single insert of a 128-bit
15618 // subvector.
15619 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15620 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15621
15622 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15623 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15625 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15626 SDValue SubVec =
15627 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
15628 DAG.getVectorIdxConstant(0, DL));
15629 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15630 DAG.getVectorIdxConstant(2, DL));
15631 }
15632 }
15633
15634 // Try to use SHUF128 if possible.
15635 if (Subtarget.hasVLX()) {
15636 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15637 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15638 ((WidenedMask[1] % 2) << 1);
15639 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15640 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15641 }
15642 }
15643 }
15644
15645 // Otherwise form a 128-bit permutation. After accounting for undefs,
15646 // convert the 64-bit shuffle mask selection values into 128-bit
15647 // selection bits by dividing the indexes by 2 and shifting into positions
15648 // defined by a vperm2*128 instruction's immediate control byte.
15649
15650 // The immediate permute control byte looks like this:
15651 // [1:0] - select 128 bits from sources for low half of destination
15652 // [2] - ignore
15653 // [3] - zero low half of destination
15654 // [5:4] - select 128 bits from sources for high half of destination
15655 // [6] - ignore
15656 // [7] - zero high half of destination
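  // For example (hand-worked, assuming none of the earlier special cases above
  // fired): a v4f64 shuffle with Mask = <2,3,4,5> widens to WidenedMask = {1,2}
  // and gives PermMask = (1 << 0) | (2 << 4) = 0x21; with the upper half
  // zeroable, Mask = <2,3,z,z> widens to {1,z} and gives
  // PermMask = 0x01 | 0x80 = 0x81.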
15657
15658 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15659 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15660
15661 unsigned PermMask = 0;
15662 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15663 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15664
15665 // Check the immediate mask and replace unused sources with undef.
15666 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15667 V1 = DAG.getUNDEF(VT);
15668 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15669 V2 = DAG.getUNDEF(VT);
15670
15671 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15672 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15673}
15674
15675/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15676/// shuffling each lane.
15677///
15678/// This attempts to create a repeated lane shuffle where each lane uses one
15679/// or two of the lanes of the inputs. The lanes of the input vectors are
15680/// shuffled in one or two independent shuffles to get the lanes into the
15681/// position needed by the final shuffle.
15683 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15684 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15685 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15686
15687 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15688 return SDValue();
15689
15690 int NumElts = Mask.size();
15691 int NumLanes = VT.getSizeInBits() / 128;
15692 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15693 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15694 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15695
15696 // First pass will try to fill in the RepeatMask from lanes that need two
15697 // sources.
15698 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15699 int Srcs[2] = {-1, -1};
15700 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15701 for (int i = 0; i != NumLaneElts; ++i) {
15702 int M = Mask[(Lane * NumLaneElts) + i];
15703 if (M < 0)
15704 continue;
15705 // Determine which of the possible input lanes (NumLanes from each source)
15706 // this element comes from. Assign that as one of the sources for this
15707       // lane. We can assign up to 2 sources for this lane. If we run out of
15708 // sources we can't do anything.
15709 int LaneSrc = M / NumLaneElts;
15710 int Src;
15711 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15712 Src = 0;
15713 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15714 Src = 1;
15715 else
15716 return SDValue();
15717
15718 Srcs[Src] = LaneSrc;
15719 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15720 }
15721
15722 // If this lane has two sources, see if it fits with the repeat mask so far.
15723 if (Srcs[1] < 0)
15724 continue;
15725
15726 LaneSrcs[Lane][0] = Srcs[0];
15727 LaneSrcs[Lane][1] = Srcs[1];
15728
15729 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15730 assert(M1.size() == M2.size() && "Unexpected mask size");
15731 for (int i = 0, e = M1.size(); i != e; ++i)
15732 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15733 return false;
15734 return true;
15735 };
15736
15737 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15738 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15739 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15740 int M = Mask[i];
15741 if (M < 0)
15742 continue;
15743 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15744 "Unexpected mask element");
15745 MergedMask[i] = M;
15746 }
15747 };
15748
15749 if (MatchMasks(InLaneMask, RepeatMask)) {
15750 // Merge this lane mask into the final repeat mask.
15751 MergeMasks(InLaneMask, RepeatMask);
15752 continue;
15753 }
15754
15755 // Didn't find a match. Swap the operands and try again.
15756 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15758
15759 if (MatchMasks(InLaneMask, RepeatMask)) {
15760 // Merge this lane mask into the final repeat mask.
15761 MergeMasks(InLaneMask, RepeatMask);
15762 continue;
15763 }
15764
15765 // Couldn't find a match with the operands in either order.
15766 return SDValue();
15767 }
15768
15769 // Now handle any lanes with only one source.
15770 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15771 // If this lane has already been processed, skip it.
15772 if (LaneSrcs[Lane][0] >= 0)
15773 continue;
15774
15775 for (int i = 0; i != NumLaneElts; ++i) {
15776 int M = Mask[(Lane * NumLaneElts) + i];
15777 if (M < 0)
15778 continue;
15779
15780       // If RepeatMask isn't defined yet, we can define it ourselves.
15781 if (RepeatMask[i] < 0)
15782 RepeatMask[i] = M % NumLaneElts;
15783
15784 if (RepeatMask[i] < NumElts) {
15785 if (RepeatMask[i] != M % NumLaneElts)
15786 return SDValue();
15787 LaneSrcs[Lane][0] = M / NumLaneElts;
15788 } else {
15789 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15790 return SDValue();
15791 LaneSrcs[Lane][1] = M / NumLaneElts;
15792 }
15793 }
15794
15795 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15796 return SDValue();
15797 }
15798
15799 SmallVector<int, 16> NewMask(NumElts, -1);
15800 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15801 int Src = LaneSrcs[Lane][0];
15802 for (int i = 0; i != NumLaneElts; ++i) {
15803 int M = -1;
15804 if (Src >= 0)
15805 M = Src * NumLaneElts + i;
15806 NewMask[Lane * NumLaneElts + i] = M;
15807 }
15808 }
15809 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15810 // Ensure we didn't get back the shuffle we started with.
15811 // FIXME: This is a hack to make up for some splat handling code in
15812 // getVectorShuffle.
15813 if (isa<ShuffleVectorSDNode>(NewV1) &&
15814 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15815 return SDValue();
15816
15817 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15818 int Src = LaneSrcs[Lane][1];
15819 for (int i = 0; i != NumLaneElts; ++i) {
15820 int M = -1;
15821 if (Src >= 0)
15822 M = Src * NumLaneElts + i;
15823 NewMask[Lane * NumLaneElts + i] = M;
15824 }
15825 }
15826 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15827 // Ensure we didn't get back the shuffle we started with.
15828 // FIXME: This is a hack to make up for some splat handling code in
15829 // getVectorShuffle.
15830 if (isa<ShuffleVectorSDNode>(NewV2) &&
15831 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15832 return SDValue();
15833
15834 for (int i = 0; i != NumElts; ++i) {
15835 if (Mask[i] < 0) {
15836 NewMask[i] = -1;
15837 continue;
15838 }
15839 NewMask[i] = RepeatMask[i % NumLaneElts];
15840 if (NewMask[i] < 0)
15841 continue;
15842
15843 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15844 }
15845 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15846}
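// Illustrative, hand-worked example for the routine above: a two-input v8i32
// shuffle with Mask = <4,12,6,14,0,8,2,10> gives
//   RepeatMask = <0,8,2,10>            LaneSrcs = {{1,3},{0,2}}
//   NewV1  = shuffle(V1,V2,<4,5,6,7,0,1,2,3>)        (V1 with lanes swapped)
//   NewV2  = shuffle(V1,V2,<12,13,14,15,8,9,10,11>)  (V2 with lanes swapped)
//   result = shuffle(NewV1,NewV2,<0,8,2,10,4,12,6,14>)
// i.e. two whole-lane permutes plus a single 128-bit-lane-repeated shuffle
// that the rest of the lowering can handle in-lane.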
15847
15848/// If the input shuffle mask results in a vector that is undefined in all upper
15849/// or lower half elements and that mask accesses only 2 halves of the
15850/// shuffle's operands, return true. A mask of half the width with mask indexes
15851/// adjusted to access the extracted halves of the original shuffle operands is
15852 /// returned in HalfMask. HalfIdx1 and HalfIdx2 identify which half (lower or
15853 /// upper) of each input operand is accessed.
15854static bool
15855 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15856                    int &HalfIdx1, int &HalfIdx2) {
15857 assert((Mask.size() == HalfMask.size() * 2) &&
15858 "Expected input mask to be twice as long as output");
15859
15860 // Exactly one half of the result must be undef to allow narrowing.
15861 bool UndefLower = isUndefLowerHalf(Mask);
15862 bool UndefUpper = isUndefUpperHalf(Mask);
15863 if (UndefLower == UndefUpper)
15864 return false;
15865
15866 unsigned HalfNumElts = HalfMask.size();
15867 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15868 HalfIdx1 = -1;
15869 HalfIdx2 = -1;
15870 for (unsigned i = 0; i != HalfNumElts; ++i) {
15871 int M = Mask[i + MaskIndexOffset];
15872 if (M < 0) {
15873 HalfMask[i] = M;
15874 continue;
15875 }
15876
15877 // Determine which of the 4 half vectors this element is from.
15878 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15879 int HalfIdx = M / HalfNumElts;
15880
15881 // Determine the element index into its half vector source.
15882 int HalfElt = M % HalfNumElts;
15883
15884 // We can shuffle with up to 2 half vectors, set the new 'half'
15885 // shuffle mask accordingly.
15886 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15887 HalfMask[i] = HalfElt;
15888 HalfIdx1 = HalfIdx;
15889 continue;
15890 }
15891 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15892 HalfMask[i] = HalfElt + HalfNumElts;
15893 HalfIdx2 = HalfIdx;
15894 continue;
15895 }
15896
15897 // Too many half vectors referenced.
15898 return false;
15899 }
15900
15901 return true;
15902}
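// Illustrative example (hand-worked): for a v8 shuffle with
//   Mask = <u,u,u,u,1,9,3,11>
// the upper half is the only defined half, and the helper above produces
//   HalfMask = <1,5,3,7>, HalfIdx1 = 0 (lower half of V1),
//   HalfIdx2 = 2 (lower half of V2).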
15903
15904/// Given the output values from getHalfShuffleMask(), create a half width
15905/// shuffle of extracted vectors followed by an insert back to full width.
15906 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15907                                      ArrayRef<int> HalfMask, int HalfIdx1,
15908 int HalfIdx2, bool UndefLower,
15909 SelectionDAG &DAG, bool UseConcat = false) {
15910 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15911 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15912
15913 MVT VT = V1.getSimpleValueType();
15914 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15915 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15916
15917 auto getHalfVector = [&](int HalfIdx) {
15918 if (HalfIdx < 0)
15919 return DAG.getUNDEF(HalfVT);
15920 SDValue V = (HalfIdx < 2 ? V1 : V2);
15921 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15922 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15923 DAG.getVectorIdxConstant(HalfIdx, DL));
15924 };
15925
15926 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15927 SDValue Half1 = getHalfVector(HalfIdx1);
15928 SDValue Half2 = getHalfVector(HalfIdx2);
15929 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15930 if (UseConcat) {
15931 SDValue Op0 = V;
15932 SDValue Op1 = DAG.getUNDEF(HalfVT);
15933 if (UndefLower)
15934 std::swap(Op0, Op1);
15935 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15936 }
15937
15938 unsigned Offset = UndefLower ? HalfNumElts : 0;
15939 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15940                      DAG.getVectorIdxConstant(Offset, DL));
15941 }
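// Picking up the hand-worked example from above getShuffleHalfVectors
// (HalfMask = <1,5,3,7>, HalfIdx1 = 0, HalfIdx2 = 2, UndefLower), this builds
//   Half1  = extract_subvector(V1, 0), Half2 = extract_subvector(V2, 0)
//   V      = shuffle(Half1, Half2, <1,5,3,7>)
//   result = insert_subvector(undef, V, HalfNumElts)
// so only a half-width shuffle is needed, plus free subvector extract/insert.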
15942
15943 /// Lower shuffles where an entire half of a 256- or 512-bit vector is UNDEF.
15944/// This allows for fast cases such as subvector extraction/insertion
15945/// or shuffling smaller vector types which can lower more efficiently.
15947 SDValue V2, ArrayRef<int> Mask,
15948 const X86Subtarget &Subtarget,
15949 SelectionDAG &DAG) {
15950 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15951 "Expected 256-bit or 512-bit vector");
15952
15953 bool UndefLower = isUndefLowerHalf(Mask);
15954 if (!UndefLower && !isUndefUpperHalf(Mask))
15955 return SDValue();
15956
15957 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15958 "Completely undef shuffle mask should have been simplified already");
15959
15960 // Upper half is undef and lower half is whole upper subvector.
15961 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15962 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15963 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15964 if (!UndefLower &&
15965 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15966 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15967 DAG.getVectorIdxConstant(HalfNumElts, DL));
15968 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15969 DAG.getVectorIdxConstant(0, DL));
15970 }
15971
15972 // Lower half is undef and upper half is whole lower subvector.
15973 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15974 if (UndefLower &&
15975 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15976 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15977 DAG.getVectorIdxConstant(0, DL));
15978 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15979 DAG.getVectorIdxConstant(HalfNumElts, DL));
15980 }
15981
15982 int HalfIdx1, HalfIdx2;
15983 SmallVector<int, 8> HalfMask(HalfNumElts);
15984 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15985 return SDValue();
15986
15987 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15988
15989 // Only shuffle the halves of the inputs when useful.
15990 unsigned NumLowerHalves =
15991 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15992 unsigned NumUpperHalves =
15993 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15994 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15995
15996 // Determine the larger pattern of undef/halves, then decide if it's worth
15997 // splitting the shuffle based on subtarget capabilities and types.
15998 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15999 if (!UndefLower) {
16000 // XXXXuuuu: no insert is needed.
16001 // Always extract lowers when setting lower - these are all free subreg ops.
16002 if (NumUpperHalves == 0)
16003 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16004 UndefLower, DAG);
16005
16006 if (NumUpperHalves == 1) {
16007 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16008 if (Subtarget.hasAVX2()) {
16009         // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
16010 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16011 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
16012 (!isSingleSHUFPSMask(HalfMask) ||
16013 Subtarget.hasFastVariableCrossLaneShuffle()))
16014 return SDValue();
16015         // If this is a unary shuffle (assume that the 2nd operand is
16016 // canonicalized to undef), then we can use vpermpd. Otherwise, we
16017 // are better off extracting the upper half of 1 operand and using a
16018 // narrow shuffle.
16019 if (EltWidth == 64 && V2.isUndef())
16020 return SDValue();
16021         // If this is a unary vXi8 shuffle with in-place halves, then perform as
16022 // full width pshufb, and then merge.
16023 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
16024 return SDValue();
16025 }
16026 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16027 if (Subtarget.hasAVX512() && VT.is512BitVector())
16028 return SDValue();
16029 // Extract + narrow shuffle is better than the wide alternative.
16030 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16031 UndefLower, DAG);
16032 }
16033
16034 // Don't extract both uppers, instead shuffle and then extract.
16035 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16036 return SDValue();
16037 }
16038
16039 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16040 if (NumUpperHalves == 0) {
16041 // AVX2 has efficient 64-bit element cross-lane shuffles.
16042 // TODO: Refine to account for unary shuffle, splat, and other masks?
16043 if (Subtarget.hasAVX2() && EltWidth == 64)
16044 return SDValue();
16045 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16046 if (Subtarget.hasAVX512() && VT.is512BitVector())
16047 return SDValue();
16048 // Narrow shuffle + insert is better than the wide alternative.
16049 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16050 UndefLower, DAG);
16051 }
16052
16053 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16054 return SDValue();
16055}
16056
16057/// Handle case where shuffle sources are coming from the same 128-bit lane and
16058/// every lane can be represented as the same repeating mask - allowing us to
16059/// shuffle the sources with the repeating shuffle and then permute the result
16060/// to the destination lanes.
16062 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16063 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16064 int NumElts = VT.getVectorNumElements();
16065 int NumLanes = VT.getSizeInBits() / 128;
16066 int NumLaneElts = NumElts / NumLanes;
16067
16068 // On AVX2 we may be able to just shuffle the lowest elements and then
16069 // broadcast the result.
16070 if (Subtarget.hasAVX2()) {
16071 for (unsigned BroadcastSize : {16, 32, 64}) {
16072 if (BroadcastSize <= VT.getScalarSizeInBits())
16073 continue;
16074 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16075
16076 // Attempt to match a repeating pattern every NumBroadcastElts,
16077       // accounting for UNDEFs, but only referencing the lowest 128-bit
16078 // lane of the inputs.
16079 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16080 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16081 for (int j = 0; j != NumBroadcastElts; ++j) {
16082 int M = Mask[i + j];
16083 if (M < 0)
16084 continue;
16085 int &R = RepeatMask[j];
16086 if (0 != ((M % NumElts) / NumLaneElts))
16087 return false;
16088 if (0 <= R && R != M)
16089 return false;
16090 R = M;
16091 }
16092 return true;
16093 };
16094
16095 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16096 if (!FindRepeatingBroadcastMask(RepeatMask))
16097 continue;
16098
16099 // Shuffle the (lowest) repeated elements in place for broadcast.
16100 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16101
16102 // Shuffle the actual broadcast.
16103 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16104 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16105 for (int j = 0; j != NumBroadcastElts; ++j)
16106 BroadcastMask[i + j] = j;
16107
16108 // Avoid returning the same shuffle operation. For example,
16109 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
16110 if (BroadcastMask == Mask)
16111 return SDValue();
16112
16113 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16114 BroadcastMask);
16115 }
16116 }
16117
16118 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16119 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16120 return SDValue();
16121
16122 // Bail if we already have a repeated lane shuffle mask.
16123 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16124 return SDValue();
16125
16126   // Helper to look for a repeated mask in each split sublane, checking that
16127   // those sublanes can then be permuted into place.
16128 auto ShuffleSubLanes = [&](int SubLaneScale) {
16129 int NumSubLanes = NumLanes * SubLaneScale;
16130 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16131
16132 // Check that all the sources are coming from the same lane and see if we
16133 // can form a repeating shuffle mask (local to each sub-lane). At the same
16134 // time, determine the source sub-lane for each destination sub-lane.
16135 int TopSrcSubLane = -1;
16136 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16137 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
16138 SubLaneScale,
16139 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
16140
16141 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16142 // Extract the sub-lane mask, check that it all comes from the same lane
16143 // and normalize the mask entries to come from the first lane.
16144 int SrcLane = -1;
16145 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16146 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16147 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16148 if (M < 0)
16149 continue;
16150 int Lane = (M % NumElts) / NumLaneElts;
16151 if ((0 <= SrcLane) && (SrcLane != Lane))
16152 return SDValue();
16153 SrcLane = Lane;
16154 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16155 SubLaneMask[Elt] = LocalM;
16156 }
16157
16158 // Whole sub-lane is UNDEF.
16159 if (SrcLane < 0)
16160 continue;
16161
16162 // Attempt to match against the candidate repeated sub-lane masks.
16163 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16164 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16165 for (int i = 0; i != NumSubLaneElts; ++i) {
16166 if (M1[i] < 0 || M2[i] < 0)
16167 continue;
16168 if (M1[i] != M2[i])
16169 return false;
16170 }
16171 return true;
16172 };
16173
16174 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16175 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16176 continue;
16177
16178 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16179 for (int i = 0; i != NumSubLaneElts; ++i) {
16180 int M = SubLaneMask[i];
16181 if (M < 0)
16182 continue;
16183 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16184 "Unexpected mask element");
16185 RepeatedSubLaneMask[i] = M;
16186 }
16187
16188         // Track the topmost source sub-lane - by setting the remaining to
16189 // UNDEF we can greatly simplify shuffle matching.
16190 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16191 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16192 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16193 break;
16194 }
16195
16196 // Bail if we failed to find a matching repeated sub-lane mask.
16197 if (Dst2SrcSubLanes[DstSubLane] < 0)
16198 return SDValue();
16199 }
16200 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16201 "Unexpected source lane");
16202
16203 // Create a repeating shuffle mask for the entire vector.
16204 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16205 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16206 int Lane = SubLane / SubLaneScale;
16207 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16208 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16209 int M = RepeatedSubLaneMask[Elt];
16210 if (M < 0)
16211 continue;
16212 int Idx = (SubLane * NumSubLaneElts) + Elt;
16213 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16214 }
16215 }
16216
16217 // Shuffle each source sub-lane to its destination.
16218 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16219 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16220 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16221 if (SrcSubLane < 0)
16222 continue;
16223 for (int j = 0; j != NumSubLaneElts; ++j)
16224 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16225 }
16226
16227 // Avoid returning the same shuffle operation.
16228 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
16229 if (RepeatedMask == Mask || SubLaneMask == Mask)
16230 return SDValue();
16231
16232 SDValue RepeatedShuffle =
16233 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16234
16235 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16236 SubLaneMask);
16237 };
16238
16239 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16240 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
16241 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
16242 // Otherwise we can only permute whole 128-bit lanes.
16243 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16244 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
16245 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
16246 MinSubLaneScale = 2;
16247 MaxSubLaneScale =
16248 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
16249 }
16250 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16251 MinSubLaneScale = MaxSubLaneScale = 4;
16252
16253 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16254 if (SDValue Shuffle = ShuffleSubLanes(Scale))
16255 return Shuffle;
16256
16257 return SDValue();
16258}
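// Illustrative, hand-worked example for the routine above: a single-input
// v8i32 shuffle with Mask = <6,7,4,5,2,3,0,1>, using 64-bit sub-lanes
// (SubLaneScale = 2), splits into
//   RepeatedMask = <2,3,0,1,6,7,4,5>   (128-bit-lane repeated, e.g. vpshufd)
//   SubLaneMask  = <4,5,6,7,0,1,2,3>   (64-bit sub-lane permute, e.g. vpermq)
// avoiding a variable cross-lane vpermd.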
16259
16260 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16261                                    bool &ForceV1Zero, bool &ForceV2Zero,
16262 unsigned &ShuffleImm, ArrayRef<int> Mask,
16263 const APInt &Zeroable) {
16264 int NumElts = VT.getVectorNumElements();
16265 assert(VT.getScalarSizeInBits() == 64 &&
16266 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16267 "Unexpected data type for VSHUFPD");
16268 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16269 "Illegal shuffle mask");
16270
16271 bool ZeroLane[2] = { true, true };
16272 for (int i = 0; i < NumElts; ++i)
16273 ZeroLane[i & 1] &= Zeroable[i];
16274
16275 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
16276   // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
16277 bool IsSHUFPD = true;
16278 bool IsCommutable = true;
16279 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
16280 for (int i = 0; i < NumElts; ++i) {
16281 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16282 continue;
16283 if (Mask[i] < 0)
16284 return false;
16285 int Val = (i & 6) + NumElts * (i & 1);
16286 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16287 if (Mask[i] < Val || Mask[i] > Val + 1)
16288 IsSHUFPD = false;
16289 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16290 IsCommutable = false;
16291 SHUFPDMask[i] = Mask[i] % 2;
16292 }
16293
16294 if (!IsSHUFPD && !IsCommutable)
16295 return false;
16296
16297 if (!IsSHUFPD && IsCommutable)
16298 std::swap(V1, V2);
16299
16300 ForceV1Zero = ZeroLane[0];
16301 ForceV2Zero = ZeroLane[1];
16302 ShuffleImm = getSHUFPDImm(SHUFPDMask);
16303 return true;
16304}
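// Illustrative examples for the matcher above (hand-worked):
//   v4f64 Mask = <1,5,2,7> -> IsSHUFPD, SHUFPDMask = {1,1,0,1}, imm = 0xB
//   v4f64 Mask = <4,1,6,3> -> only the commuted form matches, so V1/V2 are
//                             swapped and SHUFPDMask = {0,1,0,1}, imm = 0xA
// A minimal sketch of the immediate packing performed by getSHUFPDImm,
// assuming the standard SHUFPD encoding of one select bit per element (this is
// not the helper used above, just an illustration):
static unsigned getSHUFPDImmSketch(ArrayRef<int> SHUFPDMask) {
  unsigned Imm = 0;
  for (int i = 0, e = (int)SHUFPDMask.size(); i != e; ++i)
    if (SHUFPDMask[i] > 0) // Undef (-1) and 0 both leave the bit clear.
      Imm |= 1u << i;
  return Imm;
}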
16305
16306 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16307                                       SDValue V2, ArrayRef<int> Mask,
16308 const APInt &Zeroable,
16309 const X86Subtarget &Subtarget,
16310 SelectionDAG &DAG) {
16311 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16312 "Unexpected data type for VSHUFPD");
16313
16314 unsigned Immediate = 0;
16315 bool ForceV1Zero = false, ForceV2Zero = false;
16316 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16317 Mask, Zeroable))
16318 return SDValue();
16319
16320 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16321 if (ForceV1Zero)
16322 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16323 if (ForceV2Zero)
16324 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16325
16326 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16327 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16328}
16329
16330 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16331// by zeroable elements in the remaining 24 elements. Turn this into two
16332// vmovqb instructions shuffled together.
16334 SDValue V1, SDValue V2,
16335 ArrayRef<int> Mask,
16336 const APInt &Zeroable,
16337 SelectionDAG &DAG) {
16338 assert(VT == MVT::v32i8 && "Unexpected type!");
16339
16340 // The first 8 indices should be every 8th element.
16341 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16342 return SDValue();
16343
16344 // Remaining elements need to be zeroable.
16345 if (Zeroable.countl_one() < (Mask.size() - 8))
16346 return SDValue();
16347
16348 V1 = DAG.getBitcast(MVT::v4i64, V1);
16349 V2 = DAG.getBitcast(MVT::v4i64, V2);
16350
16351 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16352 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16353
16354 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16355 // the upper bits of the result using an unpckldq.
16356 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16357 { 0, 1, 2, 3, 16, 17, 18, 19,
16358 4, 5, 6, 7, 20, 21, 22, 23 });
16359 // Insert the unpckldq into a zero vector to widen to v32i8.
16360 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16361 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16362 DAG.getVectorIdxConstant(0, DL));
16363}
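// Illustrative example for the routine above: a v32i8 shuffle whose first 8
// elements are <0,8,16,24,32,40,48,56> (the low byte of every qword of V1 and
// V2) and whose remaining 24 elements are zeroable becomes
//   T1 = VTRUNC v4i64 V1 -> v16i8   (truncated bytes in bytes 0-3, rest zero)
//   T2 = VTRUNC v4i64 V2 -> v16i8
//   U  = unpckldq-style shuffle of T1/T2 (4-byte interleave)
//   result = insert U into a zero v32i8
// so the zeros produced by the truncates provide the zeroable tail for free.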
16364
16365// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16366// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16367// =>
16368// ul = unpckl v1, v2
16369// uh = unpckh v1, v2
16370// a = vperm ul, uh
16371// b = vperm ul, uh
16372//
16373// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16374// and permute. We cannot directly match v3 because it is split into two
16375// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16376// pair of 256-bit shuffles and makes sure the masks are consecutive.
16377//
16378// Once unpck and permute nodes are created, the permute corresponding to this
16379// shuffle is returned, while the other permute replaces the other half of the
16380// shuffle in the selection dag.
16381 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
16382                                                  SDValue V1, SDValue V2,
16383 ArrayRef<int> Mask,
16384 SelectionDAG &DAG) {
16385 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16386 VT != MVT::v32i8)
16387 return SDValue();
16388 // <B0, B1, B0+1, B1+1, ..., >
16389 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16390 unsigned Begin1) {
16391 size_t Size = Mask.size();
16392 assert(Size % 2 == 0 && "Expected even mask size");
16393 for (unsigned I = 0; I < Size; I += 2) {
16394 if (Mask[I] != (int)(Begin0 + I / 2) ||
16395 Mask[I + 1] != (int)(Begin1 + I / 2))
16396 return false;
16397 }
16398 return true;
16399 };
16400   // Check which half of the interleave this shuffle node produces.
16401 int NumElts = VT.getVectorNumElements();
16402 size_t FirstQtr = NumElts / 2;
16403 size_t ThirdQtr = NumElts + NumElts / 2;
16404 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16405 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16406 if (!IsFirstHalf && !IsSecondHalf)
16407 return SDValue();
16408
16409 // Find the intersection between shuffle users of V1 and V2.
16410 SmallVector<SDNode *, 2> Shuffles;
16411 for (SDNode *User : V1->users())
16412 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16413 User->getOperand(1) == V2)
16414 Shuffles.push_back(User);
16415 // Limit user size to two for now.
16416 if (Shuffles.size() != 2)
16417 return SDValue();
16418   // Find out which half of the full 512-bit shuffle each smaller shuffle covers.
16419 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16420 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16421 SDNode *FirstHalf;
16422 SDNode *SecondHalf;
16423 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16424 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16425 FirstHalf = Shuffles[0];
16426 SecondHalf = Shuffles[1];
16427 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16428 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16429 FirstHalf = Shuffles[1];
16430 SecondHalf = Shuffles[0];
16431 } else {
16432 return SDValue();
16433 }
16434 // Lower into unpck and perm. Return the perm of this shuffle and replace
16435 // the other.
16436 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16437 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16438 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16439 DAG.getTargetConstant(0x20, DL, MVT::i8));
16440 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16441 DAG.getTargetConstant(0x31, DL, MVT::i8));
16442 if (IsFirstHalf) {
16443 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16444 return Perm1;
16445 }
16446 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16447 return Perm2;
16448}
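// Illustrative example (hand-worked): two v8i32 shuffles of the same V1/V2
// with masks <0,8,1,9,2,10,3,11> and <4,12,5,13,6,14,7,15> - the two halves of
// a full interleave - are rewritten as
//   Unpckl = unpckl(V1,V2), Unpckh = unpckh(V1,V2)
//   first  = vperm2x128(Unpckl, Unpckh, 0x20)   (low 128-bit halves)
//   second = vperm2x128(Unpckl, Unpckh, 0x31)   (high 128-bit halves)
// so both original shuffles are covered by two unpacks and two lane permutes.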
16449
16450/// Handle lowering of 4-lane 64-bit floating point shuffles.
16451///
16452/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16453/// isn't available.
16455 const APInt &Zeroable, SDValue V1, SDValue V2,
16456 const X86Subtarget &Subtarget,
16457 SelectionDAG &DAG) {
16458 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16459 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16460 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16461
16462 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16463 Subtarget, DAG))
16464 return V;
16465
16466 if (V2.isUndef()) {
16467 // Check for being able to broadcast a single element.
16468 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16469 Mask, Subtarget, DAG))
16470 return Broadcast;
16471
16472 // Use low duplicate instructions for masks that match their pattern.
16473 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16474 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16475
16476 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16477 // Non-half-crossing single input shuffles can be lowered with an
16478 // interleaved permutation.
16479 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16480 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16481 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16482 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16483 }
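    // For illustration (hand-worked): Mask = <1,0,3,2> sets bits 0 and 2 only,
    // giving VPERMILPMask = 0x5, i.e. vpermilpd $0x5 swaps the two doubles
    // within each 128-bit lane.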
16484
16485 // With AVX2 we have direct support for this permutation.
16486 if (Subtarget.hasAVX2())
16487 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16488 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16489
16490 // Try to create an in-lane repeating shuffle mask and then shuffle the
16491 // results into the target lanes.
16493 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16494 return V;
16495
16496 // Try to permute the lanes and then use a per-lane permute.
16497 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16498 Mask, DAG, Subtarget))
16499 return V;
16500
16501 // Otherwise, fall back.
16502 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16503 DAG, Subtarget);
16504 }
16505
16506 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16507 Zeroable, Subtarget, DAG))
16508 return Blend;
16509
16510 // Use dedicated unpack instructions for masks that match their pattern.
16511 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16512 return V;
16513
16514 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16515 Zeroable, Subtarget, DAG))
16516 return Op;
16517
16518 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16519 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16520 bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
16521 bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
16522
16523 // If we have lane crossing shuffles AND they don't all come from the lower
16524 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16525 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16526   // canonicalizes to a blend of splats, which isn't necessary for this combine.
16527 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16528 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16529 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16530 (V2.getOpcode() != ISD::BUILD_VECTOR) &&
16531 (!Subtarget.hasAVX2() ||
16532 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
16533 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16534
16535 // If we have one input in place, then we can permute the other input and
16536 // blend the result.
16537 if (V1IsInPlace || V2IsInPlace)
16538 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16539 Zeroable, Subtarget, DAG);
16540
16541 // Try to create an in-lane repeating shuffle mask and then shuffle the
16542 // results into the target lanes.
16544 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16545 return V;
16546
16547 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16548   // shuffle. However, if we have AVX2 and either input is already in place,
16549   // we will be able to shuffle the other input even across lanes in a single
16550   // instruction, so skip this pattern.
16551 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16553 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16554 return V;
16555
16556 // If we have VLX support, we can use VEXPAND.
16557 if (Subtarget.hasVLX())
16558 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16559 Zeroable, Subtarget, DAG))
16560 return V;
16561
16562   // If we have AVX2 then we always want to lower with a blend because at v4 we
16563 // can fully permute the elements.
16564 if (Subtarget.hasAVX2())
16565 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16566 Zeroable, Subtarget, DAG);
16567
16568 // Otherwise fall back on generic lowering.
16569 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16570 Subtarget, DAG);
16571}
16572
16573/// Handle lowering of 4-lane 64-bit integer shuffles.
16574///
16575/// This routine is only called when we have AVX2 and thus a reasonable
16576 /// instruction set for v4i64 shuffling.
16578 const APInt &Zeroable, SDValue V1, SDValue V2,
16579 const X86Subtarget &Subtarget,
16580 SelectionDAG &DAG) {
16581 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16582 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16583 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16584 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16585
16586 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16587 Subtarget, DAG))
16588 return V;
16589
16590 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16591 Zeroable, Subtarget, DAG))
16592 return Blend;
16593
16594 // Check for being able to broadcast a single element.
16595 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16596 Subtarget, DAG))
16597 return Broadcast;
16598
16599 // Try to use shift instructions if fast.
16600 if (Subtarget.preferLowerShuffleAsShift())
16601 if (SDValue Shift =
16602 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16603 Subtarget, DAG, /*BitwiseOnly*/ true))
16604 return Shift;
16605
16606 if (V2.isUndef()) {
16607 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16608 // can use lower latency instructions that will operate on both lanes.
16609 SmallVector<int, 2> RepeatedMask;
16610 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16611 SmallVector<int, 4> PSHUFDMask;
16612 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16613 return DAG.getBitcast(
16614 MVT::v4i64,
16615 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16616 DAG.getBitcast(MVT::v8i32, V1),
16617 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16618 }
16619
16620 // AVX2 provides a direct instruction for permuting a single input across
16621 // lanes.
16622 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16623 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16624 }
16625
16626 // Try to use shift instructions.
16627 if (SDValue Shift =
16628 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16629 DAG, /*BitwiseOnly*/ false))
16630 return Shift;
16631
16632 // If we have VLX support, we can use VALIGN or VEXPAND.
16633 if (Subtarget.hasVLX()) {
16634 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16635 Zeroable, Subtarget, DAG))
16636 return Rotate;
16637
16638 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16639 Zeroable, Subtarget, DAG))
16640 return V;
16641 }
16642
16643 // Try to use PALIGNR.
16644 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16645 Subtarget, DAG))
16646 return Rotate;
16647
16648 // Use dedicated unpack instructions for masks that match their pattern.
16649 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16650 return V;
16651
16652 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16653 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16654
16655 // If we have one input in place, then we can permute the other input and
16656 // blend the result.
16657 if (V1IsInPlace || V2IsInPlace)
16658 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16659 Zeroable, Subtarget, DAG);
16660
16661 // Try to create an in-lane repeating shuffle mask and then shuffle the
16662 // results into the target lanes.
16664 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16665 return V;
16666
16667 // Try to lower to PERMQ(BLENDD(V1,V2)).
16668 if (SDValue V =
16669 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16670 return V;
16671
16672 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16673   // shuffle. However, if we have AVX2 and either input is already in place,
16674   // we will be able to shuffle the other input even across lanes in a single
16675   // instruction, so skip this pattern.
16676 if (!V1IsInPlace && !V2IsInPlace)
16678 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16679 return Result;
16680
16681 // Otherwise fall back on generic blend lowering.
16682 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16683 Zeroable, Subtarget, DAG);
16684}
16685
16686/// Handle lowering of 8-lane 32-bit floating point shuffles.
16687///
16688/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16689/// isn't available.
16691 const APInt &Zeroable, SDValue V1, SDValue V2,
16692 const X86Subtarget &Subtarget,
16693 SelectionDAG &DAG) {
16694 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16695 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16696 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16697
16698 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16699 Zeroable, Subtarget, DAG))
16700 return Blend;
16701
16702 // Check for being able to broadcast a single element.
16703 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16704 Subtarget, DAG))
16705 return Broadcast;
16706
16707 if (!Subtarget.hasAVX2()) {
16708 SmallVector<int> InLaneMask;
16709 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16710
16711 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16712 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16713 /*SimpleOnly*/ true))
16714 return R;
16715 }
16716 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16717 Zeroable, Subtarget, DAG))
16718 return DAG.getBitcast(MVT::v8f32, ZExt);
16719
16720 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16721 // options to efficiently lower the shuffle.
16722 SmallVector<int, 4> RepeatedMask;
16723 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16724 assert(RepeatedMask.size() == 4 &&
16725 "Repeated masks must be half the mask width!");
16726
16727 // Use even/odd duplicate instructions for masks that match their pattern.
16728 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16729 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16730 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16731 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16732
16733 if (V2.isUndef())
16734 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16735 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16736
16737 // Use dedicated unpack instructions for masks that match their pattern.
16738 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16739 return V;
16740
16741 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16742 // have already handled any direct blends.
16743 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16744 }
16745
16746 // Try to create an in-lane repeating shuffle mask and then shuffle the
16747 // results into the target lanes.
16749 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16750 return V;
16751
16752 // If we have a single input shuffle with different shuffle patterns in the
16753 // two 128-bit lanes use the variable mask to VPERMILPS.
16754 if (V2.isUndef()) {
16755 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16756 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16757 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16758 }
16759 if (Subtarget.hasAVX2()) {
16760 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16761 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16762 }
16763 // Otherwise, fall back.
16764 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16765 DAG, Subtarget);
16766 }
16767
16768 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16769 // shuffle.
16771 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16772 return Result;
16773
16774 // If we have VLX support, we can use VEXPAND.
16775 if (Subtarget.hasVLX())
16776 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
16777 Zeroable, Subtarget, DAG))
16778 return V;
16779
16780 // Try to match an interleave of two v8f32s and lower them as unpck and
16781 // permutes using ymms. This needs to go before we try to split the vectors.
16782 // Don't attempt on AVX1 if we're likely to split vectors anyway.
16783 if ((Subtarget.hasAVX2() ||
16786 !Subtarget.hasAVX512())
16787 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16788 Mask, DAG))
16789 return V;
16790
16791   // For non-AVX512, if the mask is of 16-bit elements in each lane then try to
16792   // split, since after the split we get more efficient code using vpunpcklwd
16793   // and vpunpckhwd instructions than with vblend.
16794 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16795 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16796 Subtarget, DAG);
16797
16798 // If we have AVX2 then we always want to lower with a blend because at v8 we
16799 // can fully permute the elements.
16800 if (Subtarget.hasAVX2())
16801 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16802 Zeroable, Subtarget, DAG);
16803
16804 // Otherwise fall back on generic lowering.
16805 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16806 Subtarget, DAG);
16807}
16808
16809/// Handle lowering of 8-lane 32-bit integer shuffles.
16810///
16811/// This routine is only called when we have AVX2 and thus a reasonable
16812 /// instruction set for v8i32 shuffling.
16814 const APInt &Zeroable, SDValue V1, SDValue V2,
16815 const X86Subtarget &Subtarget,
16816 SelectionDAG &DAG) {
16817 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16818 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16819 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16820 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16821
16822 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16823
16824 // Whenever we can lower this as a zext, that instruction is strictly faster
16825 // than any alternative. It also allows us to fold memory operands into the
16826 // shuffle in many cases.
16827 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16828 Zeroable, Subtarget, DAG))
16829 return ZExt;
16830
16831 // Try to match an interleave of two v8i32s and lower them as unpck and
16832 // permutes using ymms. This needs to go before we try to split the vectors.
16833 if (!Subtarget.hasAVX512())
16834 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16835 Mask, DAG))
16836 return V;
16837
16838   // For non-AVX512, if the mask is of 16-bit elements in each lane then try to
16839   // split, since after the split we get more efficient code than vblend by
16840   // using vpunpcklwd and vpunpckhwd instructions.
16841 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16842 !Subtarget.hasAVX512())
16843 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16844 Subtarget, DAG);
16845
16846 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16847 Zeroable, Subtarget, DAG))
16848 return Blend;
16849
16850 // Check for being able to broadcast a single element.
16851 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16852 Subtarget, DAG))
16853 return Broadcast;
16854
16855 // Try to use shift instructions if fast.
16856 if (Subtarget.preferLowerShuffleAsShift()) {
16857 if (SDValue Shift =
16858 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16859 Subtarget, DAG, /*BitwiseOnly*/ true))
16860 return Shift;
16861 if (NumV2Elements == 0)
16862 if (SDValue Rotate =
16863 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16864 return Rotate;
16865 }
16866
16867 // If the shuffle mask is repeated in each 128-bit lane we can use more
16868 // efficient instructions that mirror the shuffles across the two 128-bit
16869 // lanes.
16870 SmallVector<int, 4> RepeatedMask;
16871 bool Is128BitLaneRepeatedShuffle =
16872 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16873 if (Is128BitLaneRepeatedShuffle) {
16874 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16875 if (V2.isUndef())
16876 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16877 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16878
16879 // Use dedicated unpack instructions for masks that match their pattern.
16880 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
16881 return V;
16882 }
16883
16884 // Try to use shift instructions.
16885 if (SDValue Shift =
16886 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16887 DAG, /*BitwiseOnly*/ false))
16888 return Shift;
16889
16890 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16891 if (SDValue Rotate =
16892 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16893 return Rotate;
16894
16895 // If we have VLX support, we can use VALIGN or EXPAND.
16896 if (Subtarget.hasVLX()) {
16897 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16898 Zeroable, Subtarget, DAG))
16899 return Rotate;
16900
16901 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
16902 Zeroable, Subtarget, DAG))
16903 return V;
16904 }
16905
16906 // Try to use byte rotation instructions.
16907 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16908 Subtarget, DAG))
16909 return Rotate;
16910
16911 // Try to create an in-lane repeating shuffle mask and then shuffle the
16912 // results into the target lanes.
16914 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16915 return V;
16916
16917 if (V2.isUndef()) {
16918 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16919 // because that should be faster than the variable permute alternatives.
16920 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
16921 return V;
16922
16923 // If the shuffle patterns aren't repeated but it's a single input, directly
16924 // generate a cross-lane VPERMD instruction.
16925 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16926 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16927 }
16928
16929 // Assume that a single SHUFPS is faster than an alternative sequence of
16930 // multiple instructions (even if the CPU has a domain penalty).
16931 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16932 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16933 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16934 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16935 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16936 CastV1, CastV2, DAG);
16937 return DAG.getBitcast(MVT::v8i32, ShufPS);
16938 }
16939
16940 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16941 // shuffle.
16942 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16943 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16944 return Result;
16945
16946 // Otherwise fall back on generic blend lowering.
16947 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16948 Zeroable, Subtarget, DAG);
16949}
16950
16951/// Handle lowering of 16-lane 16-bit integer shuffles.
16952///
16953/// This routine is only called when we have AVX2 and thus a reasonable
16954 /// instruction set for v16i16 shuffling.
16955 static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16956 const APInt &Zeroable, SDValue V1, SDValue V2,
16957 const X86Subtarget &Subtarget,
16958 SelectionDAG &DAG) {
16959 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16960 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16961 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16962 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16963
16964 // Whenever we can lower this as a zext, that instruction is strictly faster
16965 // than any alternative. It also allows us to fold memory operands into the
16966 // shuffle in many cases.
16967 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16968 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16969 return ZExt;
16970
16971 // Check for being able to broadcast a single element.
16972 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16973 Subtarget, DAG))
16974 return Broadcast;
16975
16976 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16977 Zeroable, Subtarget, DAG))
16978 return Blend;
16979
16980 // Use dedicated unpack instructions for masks that match their pattern.
16981 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
16982 return V;
16983
16984 // Use dedicated pack instructions for masks that match their pattern.
16985 if (SDValue V =
16986 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16987 return V;
16988
16989 // Try to lower using a truncation.
16990 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16991 Subtarget, DAG))
16992 return V;
16993
16994 // Try to use shift instructions.
16995 if (SDValue Shift =
16996 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16997 Subtarget, DAG, /*BitwiseOnly*/ false))
16998 return Shift;
16999
17000 // Try to use byte rotation instructions.
17001 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17002 Subtarget, DAG))
17003 return Rotate;
17004
17005 // Try to create an in-lane repeating shuffle mask and then shuffle the
17006 // results into the target lanes.
17007 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17008 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17009 return V;
17010
17011 if (V2.isUndef()) {
17012 // Try to use bit rotation instructions.
17013 if (SDValue Rotate =
17014 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17015 return Rotate;
17016
17017 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17018 // because that should be faster than the variable permute alternatives.
17019 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
17020 return V;
17021
17022 // There are no generalized cross-lane shuffle operations available on i16
17023 // element types.
17024 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17025 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17026 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17027 return V;
17028
17029 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17030 DAG, Subtarget);
17031 }
17032
17033 SmallVector<int, 8> RepeatedMask;
17034 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17035 // As this is a single-input shuffle, the repeated mask should be
17036 // a strictly valid v8i16 mask that we can pass through to the v8i16
17037 // lowering to handle even the v16 case.
17038 return lowerV8I16GeneralSingleInputShuffle(
17039 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17040 }
17041 }
17042
17043 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17044 Zeroable, Subtarget, DAG))
17045 return PSHUFB;
17046
17047 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17048 if (Subtarget.hasBWI())
17049 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17050
17051 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17052 // shuffle.
17053 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17054 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17055 return Result;
17056
17057 // Try to permute the lanes and then use a per-lane permute.
17058 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17059 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17060 return V;
17061
17062 // Try to match an interleave of two v16i16s and lower them as unpck and
17063 // permutes using ymms.
17064 if (!Subtarget.hasAVX512())
17065 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
17066 Mask, DAG))
17067 return V;
17068
17069 // Otherwise fall back on generic lowering.
17070 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17071 Subtarget, DAG);
17072}
17073
17074/// Handle lowering of 32-lane 8-bit integer shuffles.
17075///
17076/// This routine is only called when we have AVX2 and thus a reasonable
17077 /// instruction set for v32i8 shuffling.
17078 static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17079 const APInt &Zeroable, SDValue V1, SDValue V2,
17080 const X86Subtarget &Subtarget,
17081 SelectionDAG &DAG) {
17082 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17083 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17084 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17085 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17086
17087 // Whenever we can lower this as a zext, that instruction is strictly faster
17088 // than any alternative. It also allows us to fold memory operands into the
17089 // shuffle in many cases.
17090 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17091 Zeroable, Subtarget, DAG))
17092 return ZExt;
17093
17094 // Check for being able to broadcast a single element.
17095 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17096 Subtarget, DAG))
17097 return Broadcast;
17098
17099 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17100 Zeroable, Subtarget, DAG))
17101 return Blend;
17102
17103 // Use dedicated unpack instructions for masks that match their pattern.
17104 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
17105 return V;
17106
17107 // Use dedicated pack instructions for masks that match their pattern.
17108 if (SDValue V =
17109 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17110 return V;
17111
17112 // Try to lower using a truncation.
17113 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17114 Subtarget, DAG))
17115 return V;
17116
17117 // Try to use shift instructions.
17118 if (SDValue Shift =
17119 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
17120 DAG, /*BitwiseOnly*/ false))
17121 return Shift;
17122
17123 // Try to use byte rotation instructions.
17124 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17125 Subtarget, DAG))
17126 return Rotate;
17127
17128 // Try to use bit rotation instructions.
17129 if (V2.isUndef())
17130 if (SDValue Rotate =
17131 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17132 return Rotate;
17133
17134 // Try to create an in-lane repeating shuffle mask and then shuffle the
17135 // results into the target lanes.
17136 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17137 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17138 return V;
17139
17140 // There are no generalized cross-lane shuffle operations available on i8
17141 // element types.
17142 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17143 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17144 // because that should be faster than the variable permute alternatives.
17145 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
17146 return V;
17147
17148 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17149 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17150 return V;
17151
17152 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17153 DAG, Subtarget);
17154 }
17155
17156 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17157 Zeroable, Subtarget, DAG))
17158 return PSHUFB;
17159
17160 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17161 if (Subtarget.hasVBMI())
17162 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17163
17164 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17165 // shuffle.
17166 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17167 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17168 return Result;
17169
17170 // Try to permute the lanes and then use a per-lane permute.
17171 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17172 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17173 return V;
17174
17175 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17176 // by zeroable elements in the remaining 24 elements. Turn this into two
17177 // vmovqb instructions shuffled together.
17178 if (Subtarget.hasVLX())
17179 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17180 Mask, Zeroable, DAG))
17181 return V;
17182
17183 // Try to match an interleave of two v32i8s and lower them as unpck and
17184 // permutes using ymms.
17185 if (!Subtarget.hasAVX512())
17186 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
17187 Mask, DAG))
17188 return V;
17189
17190 // Otherwise fall back on generic lowering.
17191 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17192 Subtarget, DAG);
17193}
17194
17195/// High-level routine to lower various 256-bit x86 vector shuffles.
17196///
17197/// This routine either breaks down the specific type of a 256-bit x86 vector
17198/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17199/// together based on the available instructions.
17200 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17201 SDValue V1, SDValue V2, const APInt &Zeroable,
17202 const X86Subtarget &Subtarget,
17203 SelectionDAG &DAG) {
17204 // If we have a single input to the zero element, insert that into V1 if we
17205 // can do so cheaply.
17206 int NumElts = VT.getVectorNumElements();
17207 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17208
17209 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17210 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17211 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17212 return Insertion;
17213
17214 // Handle special cases where the lower or upper half is UNDEF.
17215 if (SDValue V =
17216 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17217 return V;
17218
17219 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17220 // can check for those subtargets here and avoid much of the subtarget
17221 // querying in the per-vector-type lowering routines. With AVX1 we have
17222 // essentially *zero* ability to manipulate a 256-bit vector with integer
17223 // types. Since we'll use floating point types there eventually, just
17224 // immediately cast everything to a float and operate entirely in that domain.
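// For example (illustrative), on AVX1 a v8i32 shuffle is bitcast to v8f32,
// lowered with the floating-point shuffle code, and bitcast back to v8i32;
// sub-32-bit element types are handled with bit ops or split into 128-bit
// halves instead.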
17225 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17226 int ElementBits = VT.getScalarSizeInBits();
17227 if (ElementBits < 32) {
17228 // No floating point type available, if we can't use the bit operations
17229 // for masking/blending then decompose into 128-bit vectors.
17230 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17231 Subtarget, DAG))
17232 return V;
17233 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17234 return V;
17235 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17236 }
17237
17238 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17239 VT.getVectorNumElements());
17240 V1 = DAG.getBitcast(FpVT, V1);
17241 V2 = DAG.getBitcast(FpVT, V2);
17242 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17243 }
17244
17245 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17246 V1 = DAG.getBitcast(MVT::v16i16, V1);
17247 V2 = DAG.getBitcast(MVT::v16i16, V2);
17248 return DAG.getBitcast(VT,
17249 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17250 }
17251
17252 switch (VT.SimpleTy) {
17253 case MVT::v4f64:
17254 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17255 case MVT::v4i64:
17256 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17257 case MVT::v8f32:
17258 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17259 case MVT::v8i32:
17260 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17261 case MVT::v16i16:
17262 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17263 case MVT::v32i8:
17264 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17265
17266 default:
17267 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17268 }
17269}
17270
17271 /// Try to lower a vector shuffle as a shuffle of 128-bit subvectors.
17272 static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17273 const APInt &Zeroable, SDValue V1, SDValue V2,
17274 const X86Subtarget &Subtarget,
17275 SelectionDAG &DAG) {
17276 assert(VT.getScalarSizeInBits() == 64 &&
17277 "Unexpected element type size for 128bit shuffle.");
17278
17279 // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle() is
17280 // most probably a better solution for that case.
17281 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17282
17283 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17284 SmallVector<int, 4> Widened128Mask;
17285 if (!canWidenShuffleElements(Mask, Widened128Mask))
17286 return SDValue();
17287 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17288
17289 // Try to use an insert into a zero vector.
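// For example (illustrative), a v8i64 shuffle that keeps elements {0,1} of V1
// and zeroes elements 2..7 is lowered as an INSERT_SUBVECTOR of the low v2i64
// of V1 into an all-zero vector.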
17290 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17291 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17292 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17293 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17294 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17295 DAG.getVectorIdxConstant(0, DL));
17296 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17297 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17298 DAG.getVectorIdxConstant(0, DL));
17299 }
17300
17301 // Check for patterns which can be matched with a single insert of a 256-bit
17302 // subvector.
17303 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17304 if (OnlyUsesV1 ||
17305 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17306 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17307 SDValue SubVec =
17308 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17309 DAG.getVectorIdxConstant(0, DL));
17310 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17311 DAG.getVectorIdxConstant(4, DL));
17312 }
17313
17314 // See if this is an insertion of the lower 128-bits of V2 into V1.
17315 bool IsInsert = true;
17316 int V2Index = -1;
17317 for (int i = 0; i < 4; ++i) {
17318 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17319 if (Widened128Mask[i] < 0)
17320 continue;
17321
17322 // Make sure all V1 subvectors are in place.
17323 if (Widened128Mask[i] < 4) {
17324 if (Widened128Mask[i] != i) {
17325 IsInsert = false;
17326 break;
17327 }
17328 } else {
17329 // Make sure we only have a single V2 index and that it's the lowest 128 bits.
17330 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17331 IsInsert = false;
17332 break;
17333 }
17334 V2Index = i;
17335 }
17336 }
17337 if (IsInsert && V2Index >= 0) {
17338 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17339 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17340 DAG.getVectorIdxConstant(0, DL));
17341 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17342 }
17343
17344 // See if we can widen to a 256-bit lane shuffle; we're going to lose the
17345 // 128-bit lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by
17346 // widening where possible we at least ensure the lanes stay sequential to
17347 // help later combines.
17348 SmallVector<int, 2> Widened256Mask;
17349 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17350 Widened128Mask.clear();
17351 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17352 }
17353
17354 // Try to lower to vshuf64x2/vshuf32x4.
17355 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17356 int PermMask[4] = {-1, -1, -1, -1};
17357 // Ensure elements came from the same Op.
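// Each 256-bit half of the result must be built from a single source. For
// example (illustrative), the widened 128-bit mask <0,1,5,7> uses V1 for lanes
// 0-1 and V2 for lanes 2-3, giving Ops = {V1, V2} and PermMask = {0, 1, 1, 3}.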
17358 for (int i = 0; i < 4; ++i) {
17359 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17360 if (Widened128Mask[i] < 0)
17361 continue;
17362
17363 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17364 unsigned OpIndex = i / 2;
17365 if (Ops[OpIndex].isUndef())
17366 Ops[OpIndex] = Op;
17367 else if (Ops[OpIndex] != Op)
17368 return SDValue();
17369
17370 PermMask[i] = Widened128Mask[i] % 4;
17371 }
17372
17373 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17374 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17375}
17376
17377/// Handle lowering of 8-lane 64-bit floating point shuffles.
17378 static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17379 const APInt &Zeroable, SDValue V1, SDValue V2,
17380 const X86Subtarget &Subtarget,
17381 SelectionDAG &DAG) {
17382 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17383 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17384 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17385
17386 if (V2.isUndef()) {
17387 // Use low duplicate instructions for masks that match their pattern.
17388 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17389 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17390
17391 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17392 // Non-half-crossing single input shuffles can be lowered with an
17393 // interleaved permutation.
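// The immediate has one bit per element: bit i is set iff Mask[i] selects the
// odd (upper) element of its 128-bit lane. For example (illustrative), the
// within-lane swap mask {1,0,3,2,5,4,7,6} encodes as 0x55 and the identity
// mask encodes as 0xAA.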
17394 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17395 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17396 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17397 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17398 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17399 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17400 }
17401
17402 SmallVector<int, 4> RepeatedMask;
17403 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17404 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17405 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17406 }
17407
17408 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17409 V2, Subtarget, DAG))
17410 return Shuf128;
17411
17412 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17413 return Unpck;
17414
17415 // Check if the blend happens to exactly fit that of SHUFPD.
17416 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17417 Zeroable, Subtarget, DAG))
17418 return Op;
17419
17420 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17421 Subtarget, DAG))
17422 return V;
17423
17424 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17425 Zeroable, Subtarget, DAG))
17426 return Blend;
17427
17428 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17429}
17430
17431/// Handle lowering of 16-lane 32-bit floating point shuffles.
17432 static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17433 const APInt &Zeroable, SDValue V1, SDValue V2,
17434 const X86Subtarget &Subtarget,
17435 SelectionDAG &DAG) {
17436 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17437 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17438 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17439
17440 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17441 // options to efficiently lower the shuffle.
17442 SmallVector<int, 4> RepeatedMask;
17443 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17444 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17445
17446 // Use even/odd duplicate instructions for masks that match their pattern.
17447 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17448 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17449 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17450 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17451
17452 if (V2.isUndef())
17453 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17454 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17455
17456 // Use dedicated unpack instructions for masks that match their pattern.
17457 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17458 return V;
17459
17460 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17461 Zeroable, Subtarget, DAG))
17462 return Blend;
17463
17464 // Otherwise, fall back to a SHUFPS sequence.
17465 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17466 }
17467
17468 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17469 Zeroable, Subtarget, DAG))
17470 return Blend;
17471
17472 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17473 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17474 return DAG.getBitcast(MVT::v16f32, ZExt);
17475
17476 // Try to create an in-lane repeating shuffle mask and then shuffle the
17477 // results into the target lanes.
17478 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17479 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17480 return V;
17481
17482 // If we have a single input shuffle with different shuffle patterns in the
17483 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17484 if (V2.isUndef() &&
17485 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17486 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17487 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17488 }
17489
17490 // If we have AVX512F support, we can use VEXPAND.
17491 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17492 Zeroable, Subtarget, DAG))
17493 return V;
17494
17495 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17496}
17497
17498/// Handle lowering of 8-lane 64-bit integer shuffles.
17499 static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17500 const APInt &Zeroable, SDValue V1, SDValue V2,
17501 const X86Subtarget &Subtarget,
17502 SelectionDAG &DAG) {
17503 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17504 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17505 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17506
17507 // Try to use shift instructions if fast.
17508 if (Subtarget.preferLowerShuffleAsShift())
17509 if (SDValue Shift =
17510 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17511 Subtarget, DAG, /*BitwiseOnly*/ true))
17512 return Shift;
17513
17514 if (V2.isUndef()) {
17515 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17516 // can use lower latency instructions that will operate on all four
17517 // 128-bit lanes.
17518 SmallVector<int, 2> Repeated128Mask;
17519 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17520 SmallVector<int, 4> PSHUFDMask;
17521 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17522 return DAG.getBitcast(
17523 MVT::v8i64,
17524 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17525 DAG.getBitcast(MVT::v16i32, V1),
17526 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17527 }
17528
17529 SmallVector<int, 4> Repeated256Mask;
17530 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17531 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17532 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17533 }
17534
17535 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17536 V2, Subtarget, DAG))
17537 return Shuf128;
17538
17539 // Try to use shift instructions.
17540 if (SDValue Shift =
17541 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17542 DAG, /*BitwiseOnly*/ false))
17543 return Shift;
17544
17545 // Try to use VALIGN.
17546 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17547 Zeroable, Subtarget, DAG))
17548 return Rotate;
17549
17550 // Try to use PALIGNR.
17551 if (Subtarget.hasBWI())
17552 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17553 Subtarget, DAG))
17554 return Rotate;
17555
17556 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17557 return Unpck;
17558
17559 // If we have AVX512F support, we can use VEXPAND.
17560 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17561 Subtarget, DAG))
17562 return V;
17563
17564 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17565 Zeroable, Subtarget, DAG))
17566 return Blend;
17567
17568 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17569}
17570
17571/// Handle lowering of 16-lane 32-bit integer shuffles.
17572 static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17573 const APInt &Zeroable, SDValue V1, SDValue V2,
17574 const X86Subtarget &Subtarget,
17575 SelectionDAG &DAG) {
17576 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17577 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17578 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17579
17580 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17581
17582 // Whenever we can lower this as a zext, that instruction is strictly faster
17583 // than any alternative. It also allows us to fold memory operands into the
17584 // shuffle in many cases.
17585 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17586 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17587 return ZExt;
17588
17589 // Try to use shift instructions if fast.
17590 if (Subtarget.preferLowerShuffleAsShift()) {
17591 if (SDValue Shift =
17592 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17593 Subtarget, DAG, /*BitwiseOnly*/ true))
17594 return Shift;
17595 if (NumV2Elements == 0)
17596 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17597 Subtarget, DAG))
17598 return Rotate;
17599 }
17600
17601 // If the shuffle mask is repeated in each 128-bit lane we can use more
17602 // efficient instructions that mirror the shuffles across the four 128-bit
17603 // lanes.
17604 SmallVector<int, 4> RepeatedMask;
17605 bool Is128BitLaneRepeatedShuffle =
17606 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17607 if (Is128BitLaneRepeatedShuffle) {
17608 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17609 if (V2.isUndef())
17610 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17611 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17612
17613 // Use dedicated unpack instructions for masks that match their pattern.
17614 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17615 return V;
17616 }
17617
17618 // Try to use shift instructions.
17619 if (SDValue Shift =
17620 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17621 Subtarget, DAG, /*BitwiseOnly*/ false))
17622 return Shift;
17623
17624 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17625 if (SDValue Rotate =
17626 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17627 return Rotate;
17628
17629 // Try to use VALIGN.
17630 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17631 Zeroable, Subtarget, DAG))
17632 return Rotate;
17633
17634 // Try to use byte rotation instructions.
17635 if (Subtarget.hasBWI())
17636 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17637 Subtarget, DAG))
17638 return Rotate;
17639
17640 // Assume that a single SHUFPS is faster than using a permv shuffle.
17641 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17642 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17643 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17644 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17645 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17646 CastV1, CastV2, DAG);
17647 return DAG.getBitcast(MVT::v16i32, ShufPS);
17648 }
17649
17650 // Try to create an in-lane repeating shuffle mask and then shuffle the
17651 // results into the target lanes.
17652 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17653 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17654 return V;
17655
17656 // If we have AVX512F support, we can use VEXPAND.
17657 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17658 Zeroable, Subtarget, DAG))
17659 return V;
17660
17661 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17662 Zeroable, Subtarget, DAG))
17663 return Blend;
17664
17665 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17666}
17667
17668/// Handle lowering of 32-lane 16-bit integer shuffles.
17669 static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17670 const APInt &Zeroable, SDValue V1, SDValue V2,
17671 const X86Subtarget &Subtarget,
17672 SelectionDAG &DAG) {
17673 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17674 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17675 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17676 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17677
17678 // Whenever we can lower this as a zext, that instruction is strictly faster
17679 // than any alternative. It also allows us to fold memory operands into the
17680 // shuffle in many cases.
17681 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17682 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17683 return ZExt;
17684
17685 // Use dedicated unpack instructions for masks that match their pattern.
17686 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17687 return V;
17688
17689 // Use dedicated pack instructions for masks that match their pattern.
17690 if (SDValue V =
17691 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17692 return V;
17693
17694 // Try to use shift instructions.
17695 if (SDValue Shift =
17696 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17697 Subtarget, DAG, /*BitwiseOnly*/ false))
17698 return Shift;
17699
17700 // Try to use byte rotation instructions.
17701 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17702 Subtarget, DAG))
17703 return Rotate;
17704
17705 if (V2.isUndef()) {
17706 // Try to use bit rotation instructions.
17707 if (SDValue Rotate =
17708 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17709 return Rotate;
17710
17711 SmallVector<int, 8> RepeatedMask;
17712 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17713 // As this is a single-input shuffle, the repeated mask should be
17714 // a strictly valid v8i16 mask that we can pass through to the v8i16
17715 // lowering to handle even the v32 case.
17716 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17717 RepeatedMask, Subtarget, DAG);
17718 }
17719 }
17720
17721 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17722 Zeroable, Subtarget, DAG))
17723 return Blend;
17724
17725 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17726 Zeroable, Subtarget, DAG))
17727 return PSHUFB;
17728
17729 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17730 // shuffle.
17731 if (!V2.isUndef())
17732 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17733 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17734 return Result;
17735
17736 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17737}
17738
17739/// Handle lowering of 64-lane 8-bit integer shuffles.
17740 static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17741 const APInt &Zeroable, SDValue V1, SDValue V2,
17742 const X86Subtarget &Subtarget,
17743 SelectionDAG &DAG) {
17744 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17745 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17746 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17747 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17748
17749 // Whenever we can lower this as a zext, that instruction is strictly faster
17750 // than any alternative. It also allows us to fold memory operands into the
17751 // shuffle in many cases.
17752 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17753 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17754 return ZExt;
17755
17756 // Use dedicated unpack instructions for masks that match their pattern.
17757 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17758 return V;
17759
17760 // Use dedicated pack instructions for masks that match their pattern.
17761 if (SDValue V =
17762 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17763 return V;
17764
17765 // Try to use shift instructions.
17766 if (SDValue Shift =
17767 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17768 DAG, /*BitwiseOnly*/ false))
17769 return Shift;
17770
17771 // Try to use byte rotation instructions.
17772 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17773 Subtarget, DAG))
17774 return Rotate;
17775
17776 // Try to use bit rotation instructions.
17777 if (V2.isUndef())
17778 if (SDValue Rotate =
17779 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17780 return Rotate;
17781
17782 // Lower as AND if possible.
17783 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17784 Zeroable, Subtarget, DAG))
17785 return Masked;
17786
17787 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17788 Zeroable, Subtarget, DAG))
17789 return PSHUFB;
17790
17791 // Try to create an in-lane repeating shuffle mask and then shuffle the
17792 // results into the target lanes.
17793 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17794 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17795 return V;
17796
17797 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17798 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17799 return Result;
17800
17801 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17802 Zeroable, Subtarget, DAG))
17803 return Blend;
17804
17805 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17806 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17807 // PALIGNR will be cheaper than the second PSHUFB+OR.
17808 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17809 Mask, Subtarget, DAG))
17810 return V;
17811
17812 // If we can't directly blend but can use PSHUFB, that will be better as it
17813 // can both shuffle and set up the inefficient blend.
17814 bool V1InUse, V2InUse;
17815 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17816 DAG, V1InUse, V2InUse);
17817 }
17818
17819 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17820 // shuffle.
17821 if (!V2.isUndef())
17822 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17823 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17824 return Result;
17825
17826 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17827 if (Subtarget.hasVBMI())
17828 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17829
17830 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17831}
17832
17833/// High-level routine to lower various 512-bit x86 vector shuffles.
17834///
17835/// This routine either breaks down the specific type of a 512-bit x86 vector
17836/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17837/// together based on the available instructions.
17838 static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17839 MVT VT, SDValue V1, SDValue V2,
17840 const APInt &Zeroable,
17841 const X86Subtarget &Subtarget,
17842 SelectionDAG &DAG) {
17843 assert(Subtarget.hasAVX512() &&
17844 "Cannot lower 512-bit vectors w/ basic ISA!");
17845
17846 // If we have a single input to the zero element, insert that into V1 if we
17847 // can do so cheaply.
17848 int NumElts = Mask.size();
17849 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17850
17851 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17852 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17853 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17854 return Insertion;
17855
17856 // Handle special cases where the lower or upper half is UNDEF.
17857 if (SDValue V =
17858 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17859 return V;
17860
17861 // Check for being able to broadcast a single element.
17862 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17863 Subtarget, DAG))
17864 return Broadcast;
17865
17866 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17867 // Try using bit ops for masking and blending before falling back to
17868 // splitting.
17869 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17870 Subtarget, DAG))
17871 return V;
17872 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17873 return V;
17874
17875 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17876 }
17877
17878 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17879 if (!Subtarget.hasBWI())
17880 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17881 /*SimpleOnly*/ false);
17882
17883 V1 = DAG.getBitcast(MVT::v32i16, V1);
17884 V2 = DAG.getBitcast(MVT::v32i16, V2);
17885 return DAG.getBitcast(VT,
17886 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17887 }
17888
17889 // Dispatch to each element type for lowering. If we don't have support for
17890 // specific element type shuffles at 512 bits, immediately split them and
17891 // lower them. Each lowering routine of a given type is allowed to assume that
17892 // the requisite ISA extensions for that element type are available.
17893 switch (VT.SimpleTy) {
17894 case MVT::v8f64:
17895 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17896 case MVT::v16f32:
17897 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17898 case MVT::v8i64:
17899 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17900 case MVT::v16i32:
17901 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17902 case MVT::v32i16:
17903 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17904 case MVT::v64i8:
17905 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17906
17907 default:
17908 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17909 }
17910}
17911
17912 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17913 MVT VT, SDValue V1, SDValue V2,
17914 const X86Subtarget &Subtarget,
17915 SelectionDAG &DAG) {
17916 // Shuffle should be unary.
17917 if (!V2.isUndef())
17918 return SDValue();
17919
17920 int ShiftAmt = -1;
17921 int NumElts = Mask.size();
17922 for (int i = 0; i != NumElts; ++i) {
17923 int M = Mask[i];
17924 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17925 "Unexpected mask index.");
17926 if (M < 0)
17927 continue;
17928
17929 // The first non-undef element determines our shift amount.
17930 if (ShiftAmt < 0) {
17931 ShiftAmt = M - i;
17932 // Need to be shifting right.
17933 if (ShiftAmt <= 0)
17934 return SDValue();
17935 }
17936 // All non-undef elements must shift by the same amount.
17937 if (ShiftAmt != M - i)
17938 return SDValue();
17939 }
17940 assert(ShiftAmt >= 0 && "All undef?");
17941
17942 // Great, we found a shift right.
17943 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17944 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17945 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17946 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17947 DAG.getVectorIdxConstant(0, DL));
17948}
17949
17950// Determine if this shuffle can be implemented with a KSHIFT instruction.
17951// Returns the shift amount if possible or -1 if not. This is a simplified
17952// version of matchShuffleAsShift.
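// For example (illustrative), with Size == 8 the mask {2,3,4,5,6,7,U,U} whose
// top two elements are zeroable matches a KSHIFTR by 2, while
// {Z,Z,0,1,2,3,4,5} with the bottom two elements zeroable matches a KSHIFTL
// by 2.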
17953static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17954 int MaskOffset, const APInt &Zeroable) {
17955 int Size = Mask.size();
17956
17957 auto CheckZeros = [&](int Shift, bool Left) {
17958 for (int j = 0; j < Shift; ++j)
17959 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17960 return false;
17961
17962 return true;
17963 };
17964
17965 auto MatchShift = [&](int Shift, bool Left) {
17966 unsigned Pos = Left ? Shift : 0;
17967 unsigned Low = Left ? 0 : Shift;
17968 unsigned Len = Size - Shift;
17969 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17970 };
17971
17972 for (int Shift = 1; Shift != Size; ++Shift)
17973 for (bool Left : {true, false})
17974 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17975 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17976 return Shift;
17977 }
17978
17979 return -1;
17980}
17981
17982
17983// Lower vXi1 vector shuffles.
17984 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17985 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17986 // vector, shuffle it, and then truncate it back.
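// For example (illustrative), a v8i1 shuffle is sign-extended to v8i32 (with
// VLX) or v8i64, shuffled as an ordinary integer vector, and converted back
// to a mask with a signed compare against zero.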
17987 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17988 MVT VT, SDValue V1, SDValue V2,
17989 const APInt &Zeroable,
17990 const X86Subtarget &Subtarget,
17991 SelectionDAG &DAG) {
17992 assert(Subtarget.hasAVX512() &&
17993 "Cannot lower 512-bit vectors w/o basic ISA!");
17994
17995 int NumElts = Mask.size();
17996 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17997
17998 // Try to recognize shuffles that are just padding a subvector with zeros.
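// For example (illustrative), a v8i1 shuffle with mask {0,1,2,3,Z,Z,Z,Z} is
// lowered by extracting the low v4i1 of V1 and inserting it into an all-zero
// v8i1 vector.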
17999 int SubvecElts = 0;
18000 int Src = -1;
18001 for (int i = 0; i != NumElts; ++i) {
18002 if (Mask[i] >= 0) {
18003 // Grab the source from the first valid mask element. All subsequent
18004 // elements need to use this same source.
18005 if (Src < 0)
18006 Src = Mask[i] / NumElts;
18007 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18008 break;
18009 }
18010
18011 ++SubvecElts;
18012 }
18013 assert(SubvecElts != NumElts && "Identity shuffle?");
18014
18015 // Clip to a power of 2.
18016 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
18017
18018 // Make sure the number of zeroable bits in the top at least covers the bits
18019 // not covered by the subvector.
18020 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
18021 assert(Src >= 0 && "Expected a source!");
18022 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18023 SDValue Extract =
18024 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
18025 DAG.getVectorIdxConstant(0, DL));
18026 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18027 DAG.getConstant(0, DL, VT), Extract,
18028 DAG.getVectorIdxConstant(0, DL));
18029 }
18030
18031 // Try a simple shift right with undef elements. Later we'll try with zeros.
18032 if (SDValue Shift =
18033 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
18034 return Shift;
18035
18036 // Try to match KSHIFTs.
18037 unsigned Offset = 0;
18038 for (SDValue V : {V1, V2}) {
18039 unsigned Opcode;
18040 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18041 if (ShiftAmt >= 0) {
18042 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
18043 MVT WideVT = Res.getSimpleValueType();
18044 // Widened right shifts need two shifts to ensure we shift in zeroes.
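// For example (illustrative), if a v8i1 source is widened to v16i1, a right
// shift by 2 becomes a KSHIFTL by 8 (placing the original bits in the MSBs)
// followed by a KSHIFTR by 10, so zeroes rather than stale widened bits are
// shifted in.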
18045 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18046 int WideElts = WideVT.getVectorNumElements();
18047 // Shift left to put the original vector in the MSBs of the new size.
18048 Res =
18049 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18050 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18051 // Increase the shift amount to account for the left shift.
18052 ShiftAmt += WideElts - NumElts;
18053 }
18054
18055 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18056 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18057 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18058 DAG.getVectorIdxConstant(0, DL));
18059 }
18060 Offset += NumElts; // Increment for next iteration.
18061 }
18062
18063 // If we're performing a unary shuffle on a SETCC result, try to shuffle the
18064 // ops instead.
18065 // TODO: What other unary shuffles would benefit from this?
18066 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
18067 SDValue Op0 = V1.getOperand(0);
18068 SDValue Op1 = V1.getOperand(1);
18069 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
18070 EVT OpVT = Op0.getValueType();
18071 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
18072 return DAG.getSetCC(
18073 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18074 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18075 }
18076
18077 MVT ExtVT;
18078 switch (VT.SimpleTy) {
18079 default:
18080 llvm_unreachable("Expected a vector of i1 elements");
18081 case MVT::v2i1:
18082 ExtVT = MVT::v2i64;
18083 break;
18084 case MVT::v4i1:
18085 ExtVT = MVT::v4i32;
18086 break;
18087 case MVT::v8i1:
18088 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
18089 // shuffle.
18090 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18091 break;
18092 case MVT::v16i1:
18093 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18094 // 256-bit operation available.
18095 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18096 break;
18097 case MVT::v32i1:
18098 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18099 // 256-bit operation available.
18100 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18101 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18102 break;
18103 case MVT::v64i1:
18104 // Fall back to scalarization. FIXME: We can do better if the shuffle
18105 // can be partitioned cleanly.
18106 if (!Subtarget.useBWIRegs())
18107 return SDValue();
18108 ExtVT = MVT::v64i8;
18109 break;
18110 }
18111
18112 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18113 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18114
18115 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18116 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
18117 int NumElems = VT.getVectorNumElements();
18118 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18119 (Subtarget.hasDQI() && (NumElems < 32)))
18120 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18121 Shuffle, ISD::SETGT);
18122
18123 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18124}
18125
18126/// Helper function that returns true if the shuffle mask should be
18127/// commuted to improve canonicalization.
18128 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18129 int NumElements = Mask.size();
18130
18131 int NumV1Elements = 0, NumV2Elements = 0;
18132 for (int M : Mask)
18133 if (M < 0)
18134 continue;
18135 else if (M < NumElements)
18136 ++NumV1Elements;
18137 else
18138 ++NumV2Elements;
18139
18140 // Commute the shuffle as needed such that more elements come from V1 than
18141 // V2. This allows us to match the shuffle pattern strictly on how many
18142 // elements come from V1 without handling the symmetric cases.
18143 if (NumV2Elements > NumV1Elements)
18144 return true;
18145
18146 assert(NumV1Elements > 0 && "No V1 indices");
18147
18148 if (NumV2Elements == 0)
18149 return false;
18150
18151 // When the number of V1 and V2 elements are the same, try to minimize the
18152 // number of uses of V2 in the low half of the vector. When that is tied,
18153 // ensure that the sum of indices for V1 is equal to or lower than the sum of
18154 // indices for V2. When those are equal, try to ensure that the number of odd
18155 // indices for V1 is lower than the number of odd indices for V2.
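// For example (illustrative), for a v4 shuffle with mask {4,5,0,1} both inputs
// contribute two elements, but the low half uses only V2, so the shuffle is
// commuted to the equivalent mask {0,1,4,5}.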
18156 if (NumV1Elements == NumV2Elements) {
18157 int LowV1Elements = 0, LowV2Elements = 0;
18158 for (int M : Mask.slice(0, NumElements / 2))
18159 if (M >= NumElements)
18160 ++LowV2Elements;
18161 else if (M >= 0)
18162 ++LowV1Elements;
18163 if (LowV2Elements > LowV1Elements)
18164 return true;
18165 if (LowV2Elements == LowV1Elements) {
18166 int SumV1Indices = 0, SumV2Indices = 0;
18167 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18168 if (Mask[i] >= NumElements)
18169 SumV2Indices += i;
18170 else if (Mask[i] >= 0)
18171 SumV1Indices += i;
18172 if (SumV2Indices < SumV1Indices)
18173 return true;
18174 if (SumV2Indices == SumV1Indices) {
18175 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18176 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18177 if (Mask[i] >= NumElements)
18178 NumV2OddIndices += i % 2;
18179 else if (Mask[i] >= 0)
18180 NumV1OddIndices += i % 2;
18181 if (NumV2OddIndices < NumV1OddIndices)
18182 return true;
18183 }
18184 }
18185 }
18186
18187 return false;
18188}
18189
18190 static bool canCombineAsMaskOperation(SDValue V,
18191 const X86Subtarget &Subtarget) {
18192 if (!Subtarget.hasAVX512())
18193 return false;
18194
18195 if (!V.getValueType().isSimple())
18196 return false;
18197
18198 MVT VT = V.getSimpleValueType().getScalarType();
18199 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
18200 return false;
18201
18202 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
18203 // are preferable to blendw/blendvb/masked-mov.
18204 if ((VT == MVT::i16 || VT == MVT::i8) &&
18205 V.getSimpleValueType().getSizeInBits() < 512)
18206 return false;
18207
18208 auto HasMaskOperation = [&](SDValue V) {
18209 // TODO: Currently we only check a limited set of opcodes. We could probably
18210 // extend it to all binary operations by checking TLI.isBinOp().
18211 switch (V->getOpcode()) {
18212 default:
18213 return false;
18214 case ISD::ADD:
18215 case ISD::SUB:
18216 case ISD::AND:
18217 case ISD::XOR:
18218 case ISD::OR:
18219 case ISD::SMAX:
18220 case ISD::SMIN:
18221 case ISD::UMAX:
18222 case ISD::UMIN:
18223 case ISD::ABS:
18224 case ISD::SHL:
18225 case ISD::SRL:
18226 case ISD::SRA:
18227 case ISD::MUL:
18228 break;
18229 }
18230 if (!V->hasOneUse())
18231 return false;
18232
18233 return true;
18234 };
18235
18236 if (HasMaskOperation(V))
18237 return true;
18238
18239 return false;
18240}
18241
18242// Forward declaration.
18243 static SDValue canonicalizeShuffleMaskWithHorizOp(
18244 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
18245 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18246 const X86Subtarget &Subtarget);
18247
18248 /// Top-level lowering for x86 vector shuffles.
18249///
18250/// This handles decomposition, canonicalization, and lowering of all x86
18251/// vector shuffles. Most of the specific lowering strategies are encapsulated
18252/// above in helper routines. The canonicalization attempts to widen shuffles
18253/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18254/// s.t. only one of the two inputs needs to be tested, etc.
18255 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18256 SelectionDAG &DAG) {
18257 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18258 ArrayRef<int> OrigMask = SVOp->getMask();
18259 SDValue V1 = Op.getOperand(0);
18260 SDValue V2 = Op.getOperand(1);
18261 MVT VT = Op.getSimpleValueType();
18262 int NumElements = VT.getVectorNumElements();
18263 SDLoc DL(Op);
18264 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18265
18266 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18267 "Can't lower MMX shuffles");
18268
18269 bool V1IsUndef = V1.isUndef();
18270 bool V2IsUndef = V2.isUndef();
18271 if (V1IsUndef && V2IsUndef)
18272 return DAG.getUNDEF(VT);
18273
18274 // When we create a shuffle node we put the UNDEF node to second operand,
18275 // but in some cases the first operand may be transformed to UNDEF.
18276 // In this case we should just commute the node.
18277 if (V1IsUndef)
18278 return DAG.getCommutedVectorShuffle(*SVOp);
18279
18280 // Check for non-undef masks pointing at an undef vector and make the masks
18281 // undef as well. This makes it easier to match the shuffle based solely on
18282 // the mask.
18283 if (V2IsUndef &&
18284 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18285 SmallVector<int, 8> NewMask(OrigMask);
18286 for (int &M : NewMask)
18287 if (M >= NumElements)
18288 M = -1;
18289 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18290 }
18291
18292 // Check for illegal shuffle mask element index values.
18293 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18294 (void)MaskUpperLimit;
18295 assert(llvm::all_of(OrigMask,
18296 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18297 "Out of bounds shuffle index");
18298
18299 // We actually see shuffles that are entirely re-arrangements of a set of
18300 // zero inputs. This mostly happens while decomposing complex shuffles into
18301 // simple ones. Directly lower these as a buildvector of zeros.
18302 APInt KnownUndef, KnownZero;
18303 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18304
18305 APInt Zeroable = KnownUndef | KnownZero;
18306 if (Zeroable.isAllOnes())
18307 return getZeroVector(VT, Subtarget, DAG, DL);
18308
18309 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18310
18311 // Try to collapse shuffles into using a vector type with fewer elements but
18312 // wider element types. We cap this to not form integers or floating point
18313 // elements wider than 64 bits. It does not seem beneficial to form i128
18314 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
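// For example (illustrative), a v8i32 shuffle with mask {0,1,4,5,8,9,12,13}
// is widened to a v4i64 shuffle with mask {0,2,4,6}.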
18315 SmallVector<int, 16> WidenedMask;
18316 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18317 !canCombineAsMaskOperation(V1, Subtarget) &&
18318 !canCombineAsMaskOperation(V2, Subtarget) &&
18319 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18320 // Shuffle mask widening should not interfere with a broadcast opportunity
18321 // by obfuscating the operands with bitcasts.
18322 // TODO: Avoid lowering directly from this top-level function: make this
18323 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18324 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18325 Subtarget, DAG))
18326 return Broadcast;
18327
18328 MVT NewEltVT = VT.isFloatingPoint()
18329 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18330 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18331 int NewNumElts = NumElements / 2;
18332 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18333 // Make sure that the new vector type is legal. For example, v2f64 isn't
18334 // legal on SSE1.
18335 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18336 if (V2IsZero) {
18337 // Modify the new Mask to take all zeros from the all-zero vector.
18338 // Choose indices that are blend-friendly.
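// For example (illustrative), a widened mask {0, SM_SentinelZero, 2,
// SM_SentinelZero} becomes {0, 5, 2, 7} with V2 replaced by an all-zero
// vector, which matches a blend pattern.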
18339 bool UsedZeroVector = false;
18340 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18341 "V2's non-undef elements are used?!");
18342 for (int i = 0; i != NewNumElts; ++i)
18343 if (WidenedMask[i] == SM_SentinelZero) {
18344 WidenedMask[i] = i + NewNumElts;
18345 UsedZeroVector = true;
18346 }
18347 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18348 // some elements to be undef.
18349 if (UsedZeroVector)
18350 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18351 }
18352 V1 = DAG.getBitcast(NewVT, V1);
18353 V2 = DAG.getBitcast(NewVT, V2);
18354 return DAG.getBitcast(
18355 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18356 }
18357 }
18358
18359 SmallVector<SDValue> Ops = {V1, V2};
18360 SmallVector<int> Mask(OrigMask);
18361
18362 // Canonicalize the shuffle with any horizontal ops inputs.
18363 // NOTE: This may update Ops and Mask.
18364 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18365 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18366 return DAG.getBitcast(VT, HOp);
18367
18368 V1 = DAG.getBitcast(VT, Ops[0]);
18369 V2 = DAG.getBitcast(VT, Ops[1]);
18370 assert(NumElements == (int)Mask.size() &&
18371 "canonicalizeShuffleMaskWithHorizOp "
18372 "shouldn't alter the shuffle mask size");
18373
18374 // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
18375 // These will be materialized uniformly anyway, so make splat matching easier.
18376 // TODO: Allow all int constants?
18377 auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
18378 if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
18379 BitVector Undefs;
18380 if (SDValue Splat = BV->getSplatValue(&Undefs)) {
18381 if (Undefs.any() &&
18382 (isNullConstant(Splat) || isNullFPConstant(Splat) ||
18383 isAllOnesConstant(Splat) || isa<ConstantFPSDNode>(Splat))) {
18384 V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
18385 }
18386 }
18387 }
18388 return V;
18389 };
18390 V1 = CanonicalizeConstant(V1);
18391 V2 = CanonicalizeConstant(V2);
18392
18393 // Commute the shuffle if it will improve canonicalization.
18394 if (canonicalizeShuffleMaskWithCommute(Mask)) {
18395 ShuffleVectorSDNode::commuteMask(Mask);
18396 std::swap(V1, V2);
18397 }
18398
18399 // For each vector width, delegate to a specialized lowering routine.
18400 if (VT.is128BitVector())
18401 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18402
18403 if (VT.is256BitVector())
18404 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18405
18406 if (VT.is512BitVector())
18407 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18408
18409 if (Is1BitVector)
18410 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18411
18412 llvm_unreachable("Unimplemented!");
18413}
18414
18415// As legal vpcompress instructions depend on various AVX512 extensions, try to
18416// convert illegal vector sizes to legal ones to avoid expansion.
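// For example, with only AVX512F available a v8i32 VECTOR_COMPRESS can be
// widened to v16i32 (with the mask widened by zeros), compressed with the
// 512-bit instruction, and the low 256 bits extracted as the result.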
18417 static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget,
18418 SelectionDAG &DAG) {
18419 assert(Subtarget.hasAVX512() &&
18420 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18421
18422 SDLoc DL(Op);
18423 SDValue Vec = Op.getOperand(0);
18424 SDValue Mask = Op.getOperand(1);
18425 SDValue Passthru = Op.getOperand(2);
18426
18427 EVT VecVT = Vec.getValueType();
18428 EVT ElementVT = VecVT.getVectorElementType();
18429 unsigned NumElements = VecVT.getVectorNumElements();
18430 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18431 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18432
18433 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18434 // compressed as 512-bit vectors in AVX512F.
18435 if (NumVecBits != 128 && NumVecBits != 256)
18436 return SDValue();
18437
18438 if (NumElementBits == 32 || NumElementBits == 64) {
18439 unsigned NumLargeElements = 512 / NumElementBits;
18440 MVT LargeVecVT =
18441 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18442 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18443
18444 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18445 DAG, DL);
18446 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18447 Subtarget, DAG, DL);
18448 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18449 : widenSubVector(LargeVecVT, Passthru,
18450 /*ZeroNewElements=*/false,
18451 Subtarget, DAG, DL);
18452
18453 SDValue Compressed =
18454 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18455 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18456 DAG.getConstant(0, DL, MVT::i64));
18457 }
18458
18459 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18460 VecVT == MVT::v16i16) {
18461 MVT LargeElementVT = MVT::getIntegerVT(512 / NumElements);
18462 EVT LargeVecVT = MVT::getVectorVT(LargeElementVT, NumElements);
18463
18464 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18465 Passthru = Passthru.isUndef()
18466 ? DAG.getUNDEF(LargeVecVT)
18467 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18468
18469 SDValue Compressed =
18470 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18471 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18472 }
18473
18474 return SDValue();
18475}
18476
18477/// Try to lower a VSELECT instruction to a vector shuffle.
18478 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18479 const X86Subtarget &Subtarget,
18480 SelectionDAG &DAG) {
18481 SDValue Cond = Op.getOperand(0);
18482 SDValue LHS = Op.getOperand(1);
18483 SDValue RHS = Op.getOperand(2);
18484 MVT VT = Op.getSimpleValueType();
18485
18486 // Only non-legal VSELECTs reach this lowering. Convert those into generic
18487 // shuffles and re-use the shuffle lowering path for blends.
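// For example, a v4i32 vselect whose condition constant is <-1,0,0,-1>
// becomes a vector_shuffle of LHS/RHS with the mask <0,5,6,3>.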
18488 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18489 SmallVector<int, 32> Mask;
18490 if (createShuffleMaskFromVSELECT(Mask, Cond))
18491 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18492 }
18493
18494 return SDValue();
18495}
18496
18497SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18498 SDValue Cond = Op.getOperand(0);
18499 SDValue LHS = Op.getOperand(1);
18500 SDValue RHS = Op.getOperand(2);
18501
18502 SDLoc dl(Op);
18503 MVT VT = Op.getSimpleValueType();
18504 if (isSoftF16(VT, Subtarget)) {
18505 MVT NVT = VT.changeVectorElementTypeToInteger();
18506 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18507 DAG.getBitcast(NVT, LHS),
18508 DAG.getBitcast(NVT, RHS)));
18509 }
18510
18511 // A vselect where all conditions and data are constants can be optimized into
18512 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18513 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18514 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18515 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18516 return SDValue();
18517
18518 // Try to lower this to a blend-style vector shuffle. This can handle all
18519 // constant condition cases.
18520 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18521 return BlendOp;
18522
18523 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18524 // with patterns on the mask registers on AVX-512.
18525 MVT CondVT = Cond.getSimpleValueType();
18526 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18527 if (CondEltSize == 1)
18528 return Op;
18529
18530 // Variable blends are only legal from SSE4.1 onward.
18531 if (!Subtarget.hasSSE41())
18532 return SDValue();
18533
18534 unsigned EltSize = VT.getScalarSizeInBits();
18535 unsigned NumElts = VT.getVectorNumElements();
18536
18537 // Expand v32i16/v64i8 without BWI.
18538 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18539 return SDValue();
18540
18541 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18542 // into an i1 condition so that we can use the mask-based 512-bit blend
18543 // instructions.
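// For example, a v16i32 select with a v16i32 condition is rebuilt as
// (setne Cond, 0), giving a v16i1 mask that the masked blend patterns match.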
18544 if (VT.getSizeInBits() == 512) {
18545 // Build a mask by testing the condition against zero.
18546 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18547 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18548 DAG.getConstant(0, dl, CondVT),
18549 ISD::SETNE);
18550 // Now return a new VSELECT using the mask.
18551 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18552 }
18553
18554 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18555 if (CondEltSize != EltSize) {
18556 // If we don't have a sign splat, rely on the expansion.
18557 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18558 return SDValue();
18559
18560 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18561 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18562 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18563 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18564 }
18565
18566 // For v16i16/v32i8 selects without AVX2, if the condition and another operand
18567 // are free to split, then it is better to split before expanding the
18568 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18569 // TODO: This is very similar to narrowVectorSelect.
18570 // TODO: Add Load splitting to isFreeToSplitVector ?
18571 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18572 !Subtarget.hasXOP()) {
18573 bool FreeCond = isFreeToSplitVector(Cond, DAG);
18574 bool FreeLHS = isFreeToSplitVector(LHS, DAG) ||
18575 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18576 bool FreeRHS = isFreeToSplitVector(RHS, DAG) ||
18577 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18578 if (FreeCond && (FreeLHS || FreeRHS))
18579 return splitVectorOp(Op, DAG, dl);
18580 }
18581
18582 // Only some types will be legal on some subtargets. If we can emit a legal
18583 // VSELECT-matching blend, return Op; if we need to expand instead, return
18584 // a null value.
18585 switch (VT.SimpleTy) {
18586 default:
18587 // Most of the vector types have blends past SSE4.1.
18588 return Op;
18589
18590 case MVT::v32i8:
18591 // The byte blends for AVX vectors were introduced only in AVX2.
18592 if (Subtarget.hasAVX2())
18593 return Op;
18594
18595 return SDValue();
18596
18597 case MVT::v8i16:
18598 case MVT::v16i16:
18599 case MVT::v8f16:
18600 case MVT::v16f16: {
18601 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18602 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18603 Cond = DAG.getBitcast(CastVT, Cond);
18604 LHS = DAG.getBitcast(CastVT, LHS);
18605 RHS = DAG.getBitcast(CastVT, RHS);
18606 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18607 return DAG.getBitcast(VT, Select);
18608 }
18609 }
18610}
18611
18612 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18613 MVT VT = Op.getSimpleValueType();
18614 SDValue Vec = Op.getOperand(0);
18615 SDValue Idx = Op.getOperand(1);
18616 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18617 SDLoc dl(Op);
18618
18619 if (!Vec.getSimpleValueType().is128BitVector())
18620 return SDValue();
18621
18622 if (VT.getSizeInBits() == 8) {
18623 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18624 // we're going to zero extend the register or fold the store.
18625 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
18626 !X86::mayFoldIntoStore(Op))
18627 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18628 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18629 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18630
18631 unsigned IdxVal = Idx->getAsZExtVal();
18632 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18633 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18634 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18635 }
18636
18637 if (VT == MVT::f32) {
18638 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18639 // the result back to FR32 register. It's only worth matching if the
18640 // result has a single use which is a store or a bitcast to i32. And in
18641 // the case of a store, it's not worth it if the index is a constant 0,
18642 // because a MOVSSmr can be used instead, which is smaller and faster.
18643 if (!Op.hasOneUse())
18644 return SDValue();
18645 SDNode *User = *Op.getNode()->user_begin();
18646 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18647 (User->getOpcode() != ISD::BITCAST ||
18648 User->getValueType(0) != MVT::i32))
18649 return SDValue();
18650 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18651 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18652 return DAG.getBitcast(MVT::f32, Extract);
18653 }
18654
18655 if (VT == MVT::i32 || VT == MVT::i64)
18656 return Op;
18657
18658 return SDValue();
18659}
18660
18661/// Extract one bit from mask vector, like v16i1 or v8i1.
18662/// AVX-512 feature.
18663 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18664 const X86Subtarget &Subtarget) {
18665 SDValue Vec = Op.getOperand(0);
18666 SDLoc dl(Vec);
18667 MVT VecVT = Vec.getSimpleValueType();
18668 SDValue Idx = Op.getOperand(1);
18669 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18670 MVT EltVT = Op.getSimpleValueType();
18671
18672 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18673 "Unexpected vector type in ExtractBitFromMaskVector");
18674
18675 // A variable index can't be handled in mask registers,
18676 // so extend the vector to VR512/VR128.
18677 if (!IdxC) {
18678 unsigned NumElts = VecVT.getVectorNumElements();
18679 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
18680 // than extending to 128/256-bit.
18681 if (NumElts == 1) {
18682 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18683 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
18684 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18685 }
18686 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18687 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18688 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18689 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18690 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18691 }
18692
18693 unsigned IdxVal = IdxC->getZExtValue();
18694 if (IdxVal == 0) // the operation is legal
18695 return Op;
18696
18697 // Extend to natively supported kshift.
18698 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18699
18700 // Use kshiftr instruction to move to the lower element.
18701 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18702 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18703
18704 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18705 DAG.getVectorIdxConstant(0, dl));
18706}
18707
18708// Helper to find all the extracted elements from a vector.
18709 static APInt getExtractedDemandedElts(SDNode *N) {
18710 MVT VT = N->getSimpleValueType(0);
18711 unsigned NumElts = VT.getVectorNumElements();
18712 APInt DemandedElts = APInt::getZero(NumElts);
18713 for (SDNode *User : N->users()) {
18714 switch (User->getOpcode()) {
18715 case X86ISD::PEXTRB:
18716 case X86ISD::PEXTRW:
18719 DemandedElts.setAllBits();
18720 return DemandedElts;
18721 }
18722 DemandedElts.setBit(User->getConstantOperandVal(1));
18723 break;
18724 case ISD::BITCAST: {
18725 if (!User->getValueType(0).isSimple() ||
18726 !User->getValueType(0).isVector()) {
18727 DemandedElts.setAllBits();
18728 return DemandedElts;
18729 }
18730 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18731 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18732 break;
18733 }
18734 default:
18735 DemandedElts.setAllBits();
18736 return DemandedElts;
18737 }
18738 }
18739 return DemandedElts;
18740}
18741
18742SDValue
18743X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18744 SelectionDAG &DAG) const {
18745 SDLoc dl(Op);
18746 SDValue Vec = Op.getOperand(0);
18747 MVT VecVT = Vec.getSimpleValueType();
18748 SDValue Idx = Op.getOperand(1);
18749 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18750
18751 if (VecVT.getVectorElementType() == MVT::i1)
18752 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18753
18754 if (!IdxC) {
18755 // It's more profitable to go through memory (1 cycle throughput)
18756 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
18757 // The IACA tool was used to get the performance estimates
18758 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18759 //
18760 // example : extractelement <16 x i8> %a, i32 %i
18761 //
18762 // Block Throughput: 3.00 Cycles
18763 // Throughput Bottleneck: Port5
18764 //
18765 // | Num Of | Ports pressure in cycles | |
18766 // | Uops | 0 - DV | 5 | 6 | 7 | |
18767 // ---------------------------------------------
18768 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18769 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18770 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18771 // Total Num Of Uops: 4
18772 //
18773 //
18774 // Block Throughput: 1.00 Cycles
18775 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18776 //
18777 // | | Ports pressure in cycles | |
18778 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18779 // ---------------------------------------------------------
18780 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18781 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18782 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18783 // Total Num Of Uops: 4
18784
18785 return SDValue();
18786 }
18787
18788 unsigned IdxVal = IdxC->getZExtValue();
18789
18790 // If this is a 256-bit vector result, first extract the 128-bit vector and
18791 // then extract the element from the 128-bit vector.
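// For example, extracting element 9 from a v16i32 extracts the 128-bit chunk
// holding elements 8-11 and then extracts element 1 (9 & 3) from that v4i32.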
18792 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18793 // Get the 128-bit vector.
18794 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18795 MVT EltVT = VecVT.getVectorElementType();
18796
18797 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18798 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18799
18800 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18801 // this can be done with a mask.
18802 IdxVal &= ElemsPerChunk - 1;
18803 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18804 DAG.getVectorIdxConstant(IdxVal, dl));
18805 }
18806
18807 assert(VecVT.is128BitVector() && "Unexpected vector length");
18808
18809 MVT VT = Op.getSimpleValueType();
18810
18811 if (VT == MVT::i16) {
18812 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18813 // we're going to zero extend the register or fold the store (SSE41 only).
18814 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18815 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18816 if (Subtarget.hasFP16())
18817 return Op;
18818
18819 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18820 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18821 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18822 }
18823
18824 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18825 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18826 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18827 }
18828
18829 if (Subtarget.hasSSE41())
18830 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18831 return Res;
18832
18833 // Only extract a single element from a v16i8 source - determine the common
18834 // DWORD/WORD that all extractions share, and extract the sub-byte.
18835 // TODO: Add QWORD MOVQ extraction?
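// For example, if element 3 is the only byte demanded, we can extract dword 0
// as an i32, shift it right by 24 bits and truncate to i8.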
18836 if (VT == MVT::i8) {
18837 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18838 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18839
18840 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18841 int DWordIdx = IdxVal / 4;
18842 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18843 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18844 DAG.getBitcast(MVT::v4i32, Vec),
18845 DAG.getVectorIdxConstant(DWordIdx, dl));
18846 int ShiftVal = (IdxVal % 4) * 8;
18847 if (ShiftVal != 0)
18848 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18849 DAG.getConstant(ShiftVal, dl, MVT::i8));
18850 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18851 }
18852
18853 int WordIdx = IdxVal / 2;
18854 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18855 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18856 DAG.getBitcast(MVT::v8i16, Vec),
18857 DAG.getVectorIdxConstant(WordIdx, dl));
18858 int ShiftVal = (IdxVal % 2) * 8;
18859 if (ShiftVal != 0)
18860 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18861 DAG.getConstant(ShiftVal, dl, MVT::i8));
18862 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18863 }
18864 }
18865
18866 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18867 if (IdxVal == 0)
18868 return Op;
18869
18870 // Shuffle the element to the lowest element, then movss or movsh.
18871 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18872 Mask[0] = static_cast<int>(IdxVal);
18873 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18874 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18875 DAG.getVectorIdxConstant(0, dl));
18876 }
18877
18878 if (VT.getSizeInBits() == 64) {
18879 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18880 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18881 // to match extract_elt for f64.
18882 if (IdxVal == 0)
18883 return Op;
18884
18885 // UNPCKHPD the element to the lowest double word, then movsd.
18886 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18887 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18888 int Mask[2] = { 1, -1 };
18889 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18890 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18891 DAG.getVectorIdxConstant(0, dl));
18892 }
18893
18894 return SDValue();
18895}
18896
18897/// Insert one bit to mask vector, like v16i1 or v8i1.
18898/// AVX-512 feature.
18899 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18900 const X86Subtarget &Subtarget) {
18901 SDLoc dl(Op);
18902 SDValue Vec = Op.getOperand(0);
18903 SDValue Elt = Op.getOperand(1);
18904 SDValue Idx = Op.getOperand(2);
18905 MVT VecVT = Vec.getSimpleValueType();
18906
18907 if (!isa<ConstantSDNode>(Idx)) {
18908 // Non-constant index: extend source and destination,
18909 // insert the element, and then truncate the result.
18910 unsigned NumElts = VecVT.getVectorNumElements();
18911 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18912 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18913 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18914 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18915 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18916 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18917 }
18918
18919 // Copy into a k-register, extract to v1i1 and insert_subvector.
18920 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18921 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18922}
18923
18924SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18925 SelectionDAG &DAG) const {
18926 MVT VT = Op.getSimpleValueType();
18927 MVT EltVT = VT.getVectorElementType();
18928 unsigned NumElts = VT.getVectorNumElements();
18929 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18930
18931 if (EltVT == MVT::i1)
18932 return InsertBitToMaskVector(Op, DAG, Subtarget);
18933
18934 SDLoc dl(Op);
18935 SDValue N0 = Op.getOperand(0);
18936 SDValue N1 = Op.getOperand(1);
18937 SDValue N2 = Op.getOperand(2);
18938 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18939
18940 if (EltVT == MVT::bf16) {
18941 MVT IVT = VT.changeVectorElementTypeToInteger();
18942 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18943 DAG.getBitcast(IVT, N0),
18944 DAG.getBitcast(MVT::i16, N1), N2);
18945 return DAG.getBitcast(VT, Res);
18946 }
18947
18948 if (!N2C) {
18949 // For variable insertion indices we're usually better off spilling to stack,
18950 // but AVX512 can use a variable compare+select by comparing against all
18951 // possible vector indices, and FP insertion has less gpr->simd traffic.
18952 if (!(Subtarget.hasBWI() ||
18953 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18954 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18955 return SDValue();
18956
18957 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18958 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18959 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18960 return SDValue();
18961
18962 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18963 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18964 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18965
18966 SmallVector<SDValue, 16> RawIndices;
18967 for (unsigned I = 0; I != NumElts; ++I)
18968 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18969 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18970
18971 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18972 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18973 ISD::CondCode::SETEQ);
18974 }
18975
18976 if (N2C->getAPIntValue().uge(NumElts))
18977 return SDValue();
18978 uint64_t IdxVal = N2C->getZExtValue();
18979
18980 bool IsZeroElt = X86::isZeroNode(N1);
18981 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18982
18983 if (IsZeroElt || IsAllOnesElt) {
18984 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18985 // We don't deal with i8 0 since it appears to be handled elsewhere.
18986 if (IsAllOnesElt &&
18987 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18988 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18989 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18990 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18991 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18992 CstVectorElts[IdxVal] = OnesCst;
18993 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18994 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18995 }
18996 // See if we can do this more efficiently with a blend shuffle with a
18997 // rematerializable vector.
18998 if (Subtarget.hasSSE41() &&
18999 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
19000 SmallVector<int, 8> BlendMask;
19001 for (unsigned i = 0; i != NumElts; ++i)
19002 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19003 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19004 : getOnesVector(VT, DAG, dl);
19005 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19006 }
19007 }
19008
19009 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19010 // into that, and then insert the subvector back into the result.
19011 if (VT.is256BitVector() || VT.is512BitVector()) {
19012 // With a 256-bit vector, we can insert into the zero element efficiently
19013 // using a blend if we have AVX or AVX2 and the right data type.
19014 if (VT.is256BitVector() && IdxVal == 0) {
19015 // TODO: It is worthwhile to cast integer to floating point and back
19016 // and incur a domain crossing penalty if that's what we'll end up
19017 // doing anyway after extracting to a 128-bit vector.
19018 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19019 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
19020 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19021 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19022 DAG.getTargetConstant(1, dl, MVT::i8));
19023 }
19024 }
19025
19026 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19027 assert(isPowerOf2_32(NumEltsIn128) &&
19028 "Vectors will always have power-of-two number of elements.");
19029
19030 // If we are not inserting into the low 128-bit vector chunk,
19031 // then prefer the broadcast+blend sequence.
19032 // FIXME: relax the profitability check iff all N1 uses are insertions.
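// For example, inserting into element 5 of a v8f32 on AVX2 becomes a
// broadcast of the scalar followed by a blend whose mask takes element 5
// from the splat and all other elements from the original vector.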
19033 if (IdxVal >= NumEltsIn128 &&
19034 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19035 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19036 X86::mayFoldLoad(N1, Subtarget)))) {
19037 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19038 SmallVector<int, 8> BlendMask;
19039 for (unsigned i = 0; i != NumElts; ++i)
19040 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19041 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19042 }
19043
19044 // Get the desired 128-bit vector chunk.
19045 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19046
19047 // Insert the element into the desired chunk.
19048 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19049 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19050
19051 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19052 DAG.getVectorIdxConstant(IdxIn128, dl));
19053
19054 // Insert the changed part back into the bigger vector
19055 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19056 }
19057 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19058
19059 // This will be just movw/movd/movq/movsh/movss/movsd.
19060 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19061 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19062 EltVT == MVT::f16 || EltVT == MVT::i64) {
19063 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19064 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19065 }
19066
19067 // We can't directly insert an i8 or i16 into a vector, so zero extend
19068 // it to i32 first.
19069 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19070 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19071 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19072 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19073 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19074 return DAG.getBitcast(VT, N1);
19075 }
19076 }
19077
19078 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
19079 // argument. SSE41 required for pinsrb.
19080 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19081 unsigned Opc;
19082 if (VT == MVT::v8i16) {
19083 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19084 Opc = X86ISD::PINSRW;
19085 } else {
19086 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19087 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19088 Opc = X86ISD::PINSRB;
19089 }
19090
19091 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19092 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19093 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19094 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19095 }
19096
19097 if (Subtarget.hasSSE41()) {
19098 if (EltVT == MVT::f32) {
19099 // Bits [7:6] of the constant are the source select. This will always be
19100 // zero here. The DAG Combiner may combine an extract_elt index into
19101 // these bits. For example (insert (extract, 3), 2) could be matched by
19102 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19103 // Bits [5:4] of the constant are the destination select. This is the
19104 // value of the incoming immediate.
19105 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19106 // combine either bitwise AND or insert of float 0.0 to set these bits.
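// For example, inserting into element 2 of a v4f32 uses the immediate 0x20:
// source select 0 in bits [7:6], destination select 2 in bits [5:4], and an
// all-zero zero mask in bits [3:0].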
19107
19108 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19109 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19110 // If this is an insertion of 32-bits into the low 32-bits of
19111 // a vector, we prefer to generate a blend with immediate rather
19112 // than an insertps. Blends are simpler operations in hardware and so
19113 // will always have equal or better performance than insertps.
19114 // But if optimizing for size and there's a load folding opportunity,
19115 // generate insertps because blendps does not have a 32-bit memory
19116 // operand form.
19117 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19118 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19119 DAG.getTargetConstant(1, dl, MVT::i8));
19120 }
19121 // Create this as a scalar to vector.
19122 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19123 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19124 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19125 }
19126
19127 // PINSR* works with constant index.
19128 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19129 return Op;
19130 }
19131
19132 return SDValue();
19133}
19134
19135 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19136 SelectionDAG &DAG) {
19137 SDLoc dl(Op);
19138 MVT OpVT = Op.getSimpleValueType();
19139
19140 // It's always cheaper to replace an xor+movd with xorps, and it simplifies
19141 // further combines.
19142 if (X86::isZeroNode(Op.getOperand(0)))
19143 return getZeroVector(OpVT, Subtarget, DAG, dl);
19144
19145 // If this is a 256-bit vector result, first insert into a 128-bit
19146 // vector and then insert into the 256-bit vector.
19147 if (!OpVT.is128BitVector()) {
19148 // Insert into a 128-bit vector.
19149 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19150 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19151 OpVT.getVectorNumElements() / SizeFactor);
19152
19153 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19154
19155 // Insert the 128-bit vector.
19156 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19157 }
19158 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19159 "Expected an SSE type!");
19160
19161 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
19162 // tblgen.
19163 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19164 return Op;
19165
19166 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19167 return DAG.getBitcast(
19168 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19169}
19170
19171// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19172// simple superregister reference or explicit instructions to insert
19173// the upper bits of a vector.
19174 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19175 SelectionDAG &DAG) {
19176 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19177
19178 return insert1BitVector(Op, DAG, Subtarget);
19179}
19180
19181 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19182 SelectionDAG &DAG) {
19183 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19184 "Only vXi1 extract_subvectors need custom lowering");
19185
19186 SDLoc dl(Op);
19187 SDValue Vec = Op.getOperand(0);
19188 uint64_t IdxVal = Op.getConstantOperandVal(1);
19189
19190 if (IdxVal == 0) // the operation is legal
19191 return Op;
19192
19193 // Extend to natively supported kshift.
19194 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19195
19196 // Shift to the LSB.
19197 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
19198 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19199
19200 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19201 DAG.getVectorIdxConstant(0, dl));
19202}
19203
19204// Returns the appropriate wrapper opcode for a global reference.
19205unsigned X86TargetLowering::getGlobalWrapperKind(
19206 const GlobalValue *GV, const unsigned char OpFlags) const {
19207 // References to absolute symbols are never PC-relative.
19208 if (GV && GV->isAbsoluteSymbolRef())
19209 return X86ISD::Wrapper;
19210
19211 // The following OpFlags under RIP-rel PIC use RIP.
19212 if (Subtarget.isPICStyleRIPRel() &&
19213 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
19214 OpFlags == X86II::MO_DLLIMPORT))
19215 return X86ISD::WrapperRIP;
19216
19217 // GOTPCREL references must always use RIP.
19218 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19219 return X86ISD::WrapperRIP;
19220
19221 return X86ISD::Wrapper;
19222}
19223
19224// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19225// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
19226 // one of the above-mentioned nodes. It has to be wrapped because otherwise
19227 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19228 // be used to form an addressing mode. These wrapped nodes will be selected
19229// into MOV32ri.
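// For example, in non-PIC 32-bit code a reference to a global @g is built as
// (X86ISD::Wrapper (TargetGlobalAddress @g)), which selection can then turn
// into a MOV32ri of the symbol or fold into an addressing mode.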
19230SDValue
19231X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19232 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19233
19234 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19235 // global base reg.
19236 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19237
19238 auto PtrVT = getPointerTy(DAG.getDataLayout());
19239 SDValue Result = DAG.getTargetConstantPool(
19240 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19241 SDLoc DL(CP);
19242 Result =
19243 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19244 // With PIC, the address is actually $g + Offset.
19245 if (OpFlag) {
19246 Result =
19247 DAG.getNode(ISD::ADD, DL, PtrVT,
19248 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19249 }
19250
19251 return Result;
19252}
19253
19254SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19255 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19256
19257 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19258 // global base reg.
19259 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19260
19261 EVT PtrVT = Op.getValueType();
19262 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19263 SDLoc DL(JT);
19264 Result =
19265 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19266
19267 // With PIC, the address is actually $g + Offset.
19268 if (OpFlag)
19269 Result =
19270 DAG.getNode(ISD::ADD, DL, PtrVT,
19271 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19272
19273 return Result;
19274}
19275
19276SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19277 SelectionDAG &DAG) const {
19278 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19279}
19280
19281SDValue
19282X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19283 // Create the TargetBlockAddressAddress node.
19284 unsigned char OpFlags =
19285 Subtarget.classifyBlockAddressReference();
19286 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19287 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19288 SDLoc dl(Op);
19289 EVT PtrVT = Op.getValueType();
19290 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19291 Result =
19292 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
19293
19294 // With PIC, the address is actually $g + Offset.
19295 if (isGlobalRelativeToPICBase(OpFlags)) {
19296 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19297 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19298 }
19299
19300 return Result;
19301}
19302
19303/// Creates target global address or external symbol nodes for calls or
19304/// other uses.
19305SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19306 bool ForCall,
19307 bool *IsImpCall) const {
19308 // Unpack the global address or external symbol.
19309 SDLoc dl(Op);
19310 const GlobalValue *GV = nullptr;
19311 int64_t Offset = 0;
19312 const char *ExternalSym = nullptr;
19313 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19314 GV = G->getGlobal();
19315 Offset = G->getOffset();
19316 } else {
19317 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19318 ExternalSym = ES->getSymbol();
19319 }
19320
19321 // Calculate some flags for address lowering.
19322 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19323 unsigned char OpFlags;
19324 if (ForCall)
19325 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19326 else
19327 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19328 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19329 bool NeedsLoad = isGlobalStubReference(OpFlags);
19330
19332 EVT PtrVT = Op.getValueType();
19333 SDValue Result;
19334
19335 if (GV) {
19336 // Create a target global address if this is a global. If possible, fold the
19337 // offset into the global address reference. Otherwise, ADD it on later.
19338 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19339 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19340 // relocation will compute to a negative value, which is invalid.
19341 int64_t GlobalOffset = 0;
19342 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19344 std::swap(GlobalOffset, Offset);
19345 }
19346 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19347 } else {
19348 // If this is not a global address, this must be an external symbol.
19349 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19350 }
19351
19352 // If this is a direct call, avoid the wrapper if we don't need to do any
19353 // loads or adds. This allows SDAG ISel to match direct calls.
19354 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19355 return Result;
19356
19357 // If Import Call Optimization is enabled and this is an imported function
19358 // then make a note of it and return the global address without wrapping.
19359 if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
19360 Mod.getModuleFlag("import-call-optimization")) {
19361 assert(ForCall && "Should only enable import call optimization if we are "
19362 "lowering a call");
19363 *IsImpCall = true;
19364 return Result;
19365 }
19366
19367 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19368
19369 // With PIC, the address is actually $g + Offset.
19370 if (HasPICReg) {
19371 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19372 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19373 }
19374
19375 // For globals that require a load from a stub to get the address, emit the
19376 // load.
19377 if (NeedsLoad)
19378 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19379 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19380
19381 // If there was a non-zero offset that we didn't fold, create an explicit
19382 // addition for it.
19383 if (Offset != 0)
19384 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19385 DAG.getSignedConstant(Offset, dl, PtrVT));
19386
19387 return Result;
19388}
19389
19390SDValue
19391X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19392 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19393}
19394
19395 static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
19396 const EVT PtrVT, unsigned ReturnReg,
19397 unsigned char OperandFlags,
19398 bool LoadGlobalBaseReg = false,
19399 bool LocalDynamic = false) {
19400 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19401 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19402 SDLoc dl(GA);
19403 SDValue TGA;
19404 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19405 SDValue Chain = DAG.getEntryNode();
19406 SDValue Ret;
19407 if (LocalDynamic && UseTLSDESC) {
19408 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19409 // Reuse existing GetTLSADDR node if we can find it.
19410 if (TGA->hasOneUse()) {
19411 // TLSDESC uses TGA.
19412 SDNode *TLSDescOp = *TGA->user_begin();
19413 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19414 "Unexpected TLSDESC DAG");
19415 // CALLSEQ_END uses TGA via a chain and glue.
19416 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19417 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19418 "Unexpected TLSDESC DAG");
19419 // CopyFromReg uses CALLSEQ_END via a chain and glue.
19420 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19421 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19422 "Unexpected TLSDESC DAG");
19423 Ret = SDValue(CopyFromRegOp, 0);
19424 }
19425 } else {
19426 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19427 GA->getOffset(), OperandFlags);
19428 }
19429
19430 if (!Ret) {
19431 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
19432 : LocalDynamic ? X86ISD::TLSBASEADDR
19433 : X86ISD::TLSADDR;
19434
19435 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19436 if (LoadGlobalBaseReg) {
19437 SDValue InGlue;
19438 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19439 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19440 InGlue);
19441 InGlue = Chain.getValue(1);
19442 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19443 } else {
19444 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19445 }
19446 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19447
19448 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
19449 MFI.setHasCalls(true);
19450
19451 SDValue Glue = Chain.getValue(1);
19452 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19453 }
19454
19455 if (!UseTLSDESC)
19456 return Ret;
19457
19458 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19459 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
19460
19462 SDValue Offset =
19463 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19465 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19466}
19467
19468// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19469static SDValue
19470 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19471 const EVT PtrVT) {
19472 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19473 /*LoadGlobalBaseReg=*/true);
19474}
19475
19476// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19477static SDValue
19478 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19479 const EVT PtrVT) {
19480 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19481}
19482
19483// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19484static SDValue
19485 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19486 const EVT PtrVT) {
19487 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19488}
19489
19490 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19491 SelectionDAG &DAG, const EVT PtrVT,
19492 bool Is64Bit, bool Is64BitLP64) {
19493 SDLoc dl(GA);
19494
19495 // Get the start address of the TLS block for this module.
19496 X86MachineFunctionInfo *MFI =
19497 DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
19498 MFI->incNumLocalDynamicTLSAccesses();
19499
19500 SDValue Base;
19501 if (Is64Bit) {
19502 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19503 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19504 /*LoadGlobalBaseReg=*/false,
19505 /*LocalDynamic=*/true);
19506 } else {
19507 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19508 /*LoadGlobalBaseReg=*/true,
19509 /*LocalDynamic=*/true);
19510 }
19511
19512 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19513 // of Base.
19514
19515 // Build x@dtpoff.
19516 unsigned char OperandFlags = X86II::MO_DTPOFF;
19517 unsigned WrapperKind = X86ISD::Wrapper;
19518 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19519 GA->getValueType(0),
19520 GA->getOffset(), OperandFlags);
19521 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19522
19523 // Add x@dtpoff with the base.
19524 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19525}
19526
19527// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19528 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19529 const EVT PtrVT, TLSModel::Model model,
19530 bool is64Bit, bool isPIC) {
19531 SDLoc dl(GA);
19532
19533 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19534 Value *Ptr = Constant::getNullValue(PointerType::get(
19535 *DAG.getContext(), is64Bit ? X86AS::FS : X86AS::GS));
19536
19537 SDValue ThreadPointer =
19538 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19539 MachinePointerInfo(Ptr));
19540
19541 unsigned char OperandFlags = 0;
19542 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19543 // initialexec.
19544 unsigned WrapperKind = X86ISD::Wrapper;
19545 if (model == TLSModel::LocalExec) {
19546 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19547 } else if (model == TLSModel::InitialExec) {
19548 if (is64Bit) {
19549 OperandFlags = X86II::MO_GOTTPOFF;
19550 WrapperKind = X86ISD::WrapperRIP;
19551 } else {
19552 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19553 }
19554 } else {
19555 llvm_unreachable("Unexpected model");
19556 }
19557
19558 // emit "addl x@ntpoff,%eax" (local exec)
19559 // or "addl x@indntpoff,%eax" (initial exec)
19560 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19561 SDValue TGA =
19562 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19563 GA->getOffset(), OperandFlags);
19564 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19565
19566 if (model == TLSModel::InitialExec) {
19567 if (isPIC && !is64Bit) {
19568 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19569 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19570 Offset);
19571 }
19572
19573 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19575 }
19576
19577 // The address of the thread local variable is the add of the thread
19578 // pointer with the offset of the variable.
19579 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19580}
19581
19582SDValue
19583X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19584
19585 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19586
19587 if (DAG.getTarget().useEmulatedTLS())
19588 return LowerToTLSEmulatedModel(GA, DAG);
19589
19590 const GlobalValue *GV = GA->getGlobal();
19591 EVT PtrVT = Op.getValueType();
19592 bool PositionIndependent = isPositionIndependent();
19593
19594 if (Subtarget.isTargetELF()) {
19595 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19596 switch (model) {
19597 case TLSModel::GeneralDynamic:
19598 if (Subtarget.is64Bit()) {
19599 if (Subtarget.isTarget64BitLP64())
19600 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19601 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19602 }
19603 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19604 case TLSModel::LocalDynamic:
19605 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19606 Subtarget.isTarget64BitLP64());
19607 case TLSModel::InitialExec:
19608 case TLSModel::LocalExec:
19609 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19610 PositionIndependent);
19611 }
19612 llvm_unreachable("Unknown TLS model.");
19613 }
19614
19615 if (Subtarget.isTargetDarwin()) {
19616 // Darwin only has one model of TLS. Lower to that.
19617 unsigned char OpFlag = 0;
19618 unsigned WrapperKind = 0;
19619
19620 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19621 // global base reg.
19622 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19623 if (PIC32) {
19624 OpFlag = X86II::MO_TLVP_PIC_BASE;
19625 WrapperKind = X86ISD::Wrapper;
19626 } else {
19627 OpFlag = X86II::MO_TLVP;
19628 WrapperKind = X86ISD::WrapperRIP;
19629 }
19630 SDLoc DL(Op);
19631 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19632 GA->getValueType(0),
19633 GA->getOffset(), OpFlag);
19634 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19635
19636 // With PIC32, the address is actually $g + Offset.
19637 if (PIC32)
19638 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19639 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19640 Offset);
19641
19642 // Lowering the machine isd will make sure everything is in the right
19643 // location.
19644 SDValue Chain = DAG.getEntryNode();
19645 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19646 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19647 SDValue Args[] = { Chain, Offset };
19648 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19649 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19650
19651 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
19652 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19653 MFI.setAdjustsStack(true);
19654
19655 // And our return value (tls address) is in the standard call return value
19656 // location.
19657 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19658 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19659 }
19660
19661 if (Subtarget.isOSWindows()) {
19662 // Just use the implicit TLS architecture
19663 // Need to generate something similar to:
19664 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19665 // ; from TEB
19666 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19667 // mov rcx, qword [rdx+rcx*8]
19668 // mov eax, .tls$:tlsvar
19669 // [rax+rcx] contains the address
19670 // Windows 64bit: gs:0x58
19671 // Windows 32bit: fs:__tls_array
19672
19673 SDLoc dl(GA);
19674 SDValue Chain = DAG.getEntryNode();
19675
19676 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19677 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19678 // use its literal value of 0x2C.
19679 Value *Ptr = Constant::getNullValue(
19680 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
19681 : PointerType::get(*DAG.getContext(), X86AS::FS));
19682
19683 SDValue TlsArray = Subtarget.is64Bit()
19684 ? DAG.getIntPtrConstant(0x58, dl)
19685 : (Subtarget.isTargetWindowsGNU()
19686 ? DAG.getIntPtrConstant(0x2C, dl)
19687 : DAG.getExternalSymbol("_tls_array", PtrVT));
19688
19689 SDValue ThreadPointer =
19690 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19691
19692 SDValue res;
19693 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19694 res = ThreadPointer;
19695 } else {
19696 // Load the _tls_index variable
19697 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19698 if (Subtarget.is64Bit())
19699 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19700 MachinePointerInfo(), MVT::i32);
19701 else
19702 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19703
19704 const DataLayout &DL = DAG.getDataLayout();
19705 SDValue Scale =
19706 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19707 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19708
19709 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19710 }
19711
19712 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19713
19714 // Get the offset of start of .tls section
19715 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19716 GA->getValueType(0),
19717 GA->getOffset(), X86II::MO_SECREL);
19718 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19719
19720 // The address of the thread local variable is the add of the thread
19721 // pointer with the offset of the variable.
19722 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19723 }
19724
19725 llvm_unreachable("TLS not implemented for this target.");
19726}
19727
19728 bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
19729 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19730 const TargetMachine &TM = getTargetMachine();
19731 TLSModel::Model Model = TM.getTLSModel(&GV);
19732 switch (Model) {
19733 case TLSModel::LocalExec:
19734 case TLSModel::InitialExec:
19735 // We can include the %fs segment register in addressing modes.
19736 return true;
19737 case TLSModel::GeneralDynamic:
19738 case TLSModel::LocalDynamic:
19739 // These models do not result in %fs relative addresses unless
19740 // TLS descriptors are used.
19741 //
19742 // Even in the case of TLS descriptors we currently have no way to model
19743 // the difference between the %fs access and the computations needed for
19744 // the offset, and returning `true` for TLS-desc currently duplicates both,
19745 // which is detrimental :-/
19746 return false;
19747 }
19748 }
19749 return false;
19750}
19751
19752/// Lower SRA_PARTS and friends, which return two i32 values
19753/// and take a 2 x i32 value to shift plus a shift amount.
19754/// TODO: Can this be moved to general expansion code?
19755 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19756 SDValue Lo, Hi;
19757 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19758 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19759}
19760
19761// Try to use a packed vector operation to handle i64 on 32-bit targets when
19762// AVX512DQ is enabled.
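// For example, on a 32-bit target with AVX512DQ+VLX, (f64 (sint_to_fp i64 X))
// is built as a v4i64 SCALAR_TO_VECTOR, converted with the packed vcvtqq2pd
// form, and element 0 of the v4f64 result is extracted.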
19763 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
19764 SelectionDAG &DAG,
19765 const X86Subtarget &Subtarget) {
19766 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19767 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19768 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19769 Op.getOpcode() == ISD::UINT_TO_FP) &&
19770 "Unexpected opcode!");
19771 bool IsStrict = Op->isStrictFPOpcode();
19772 unsigned OpNo = IsStrict ? 1 : 0;
19773 SDValue Src = Op.getOperand(OpNo);
19774 MVT SrcVT = Src.getSimpleValueType();
19775 MVT VT = Op.getSimpleValueType();
19776
19777 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19778 (VT != MVT::f32 && VT != MVT::f64))
19779 return SDValue();
19780
19781 // Pack the i64 into a vector, do the operation and extract.
19782
19783 // Using 256-bit to ensure result is 128-bits for f32 case.
19784 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19785 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19786 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19787
19788 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19789 if (IsStrict) {
19790 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19791 {Op.getOperand(0), InVec});
19792 SDValue Chain = CvtVec.getValue(1);
19793 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19794 DAG.getVectorIdxConstant(0, dl));
19795 return DAG.getMergeValues({Value, Chain}, dl);
19796 }
19797
19798 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19799
19800 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19801 DAG.getVectorIdxConstant(0, dl));
19802}
19803
19804// Try to use a packed vector operation to handle i64 on 32-bit targets.
19805 static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19806 const X86Subtarget &Subtarget) {
19807 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19808 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19809 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19810 Op.getOpcode() == ISD::UINT_TO_FP) &&
19811 "Unexpected opcode!");
19812 bool IsStrict = Op->isStrictFPOpcode();
19813 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19814 MVT SrcVT = Src.getSimpleValueType();
19815 MVT VT = Op.getSimpleValueType();
19816
19817 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19818 return SDValue();
19819
19820 // Pack the i64 into a vector, do the operation and extract.
19821
19822 assert(Subtarget.hasFP16() && "Expected FP16");
19823
19824 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19825 if (IsStrict) {
19826 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19827 {Op.getOperand(0), InVec});
19828 SDValue Chain = CvtVec.getValue(1);
19829 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19830 DAG.getVectorIdxConstant(0, dl));
19831 return DAG.getMergeValues({Value, Chain}, dl);
19832 }
19833
19834 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19835
19836 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19837 DAG.getVectorIdxConstant(0, dl));
19838}
19839
19840static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19841 const X86Subtarget &Subtarget) {
19842 switch (Opcode) {
19843 case ISD::SINT_TO_FP:
19844 // TODO: Handle wider types with AVX/AVX512.
19845 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19846 return false;
19847 // CVTDQ2PS or (V)CVTDQ2PD
19848 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19849
19850 case ISD::UINT_TO_FP:
19851 // TODO: Handle wider types and i64 elements.
19852 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19853 return false;
19854 // VCVTUDQ2PS or VCVTUDQ2PD
19855 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19856
19857 default:
19858 return false;
19859 }
19860}
19861
19862/// Given a scalar cast operation that is extracted from a vector, try to
19863/// vectorize the cast op followed by extraction. This will avoid an expensive
19864/// round-trip between XMM and GPR.
19866 SelectionDAG &DAG,
19867 const X86Subtarget &Subtarget) {
19868 // TODO: This could be enhanced to handle smaller integer types by peeking
19869 // through an extend.
19870 SDValue Extract = Cast.getOperand(0);
19871 MVT DestVT = Cast.getSimpleValueType();
19872 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19873 !isa<ConstantSDNode>(Extract.getOperand(1)))
19874 return SDValue();
19875
19876 // See if we have a 128-bit vector cast op for this type of cast.
19877 SDValue VecOp = Extract.getOperand(0);
19878 MVT FromVT = VecOp.getSimpleValueType();
19879 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19880 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19881 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19882 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19883 return SDValue();
19884
19885 // If we are extracting from a non-zero element, first shuffle the source
19886 // vector to allow extracting from element zero.
19887 if (!isNullConstant(Extract.getOperand(1))) {
19888 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19889 Mask[0] = Extract.getConstantOperandVal(1);
19890 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19891 }
19892 // If the source vector is wider than 128-bits, extract the low part. Do not
19893 // create an unnecessarily wide vector cast op.
19894 if (FromVT != Vec128VT)
19895 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19896
19897 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19898 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19899 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19900 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19901 DAG.getVectorIdxConstant(0, DL));
19902}
19903
19904/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19905/// try to vectorize the cast ops. This will avoid an expensive round-trip
19906/// between XMM and GPR.
19907static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19908 SelectionDAG &DAG,
19909 const X86Subtarget &Subtarget) {
19910 // TODO: Allow FP_TO_UINT.
19911 SDValue CastToInt = CastToFP.getOperand(0);
19912 MVT VT = CastToFP.getSimpleValueType();
19913 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19914 return SDValue();
19915
19916 MVT IntVT = CastToInt.getSimpleValueType();
19917 SDValue X = CastToInt.getOperand(0);
19918 MVT SrcVT = X.getSimpleValueType();
19919 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19920 return SDValue();
19921
19922 // See if we have 128-bit vector cast instructions for this type of cast.
19923 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19924 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19925 IntVT != MVT::i32)
19926 return SDValue();
19927
19928 unsigned SrcSize = SrcVT.getSizeInBits();
19929 unsigned IntSize = IntVT.getSizeInBits();
19930 unsigned VTSize = VT.getSizeInBits();
19931 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19932 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19933 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19934
19935 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19936 unsigned ToIntOpcode =
19937 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19938 unsigned ToFPOpcode =
19939 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19940
19941 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19942 //
19943 // We are not defining the high elements (for example, by zeroing them) because
19944 // that could nullify any performance advantage that we hoped to gain from
19945 // this vector op hack. We do not expect any adverse effects (like denorm
19946 // penalties) with cast ops.
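 // For example (illustrative): for f32 -> i32 -> f32 this replaces the scalar
 // cvttss2si (XMM -> GPR) + cvtsi2ss (GPR -> XMM) round-trip with a
 // cvttps2dq + cvtdq2ps pair that stays entirely in XMM registers.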
19947 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19948 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19949 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19950 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19951 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19952}
19953
19954static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19955 SelectionDAG &DAG,
19956 const X86Subtarget &Subtarget) {
19957 bool IsStrict = Op->isStrictFPOpcode();
19958 MVT VT = Op->getSimpleValueType(0);
19959 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19960
19961 if (Subtarget.hasDQI()) {
19962 assert(!Subtarget.hasVLX() && "Unexpected features");
19963
19964 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19965 Src.getSimpleValueType() == MVT::v4i64) &&
19966 "Unsupported custom type");
19967
19968 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19969 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19970 "Unexpected VT!");
19971 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19972
19973 // Need to concat with zero vector for strict fp to avoid spurious
19974 // exceptions.
19975 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19976 : DAG.getUNDEF(MVT::v8i64);
19977 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19978 DAG.getVectorIdxConstant(0, DL));
19979 SDValue Res, Chain;
19980 if (IsStrict) {
19981 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19982 {Op->getOperand(0), Src});
19983 Chain = Res.getValue(1);
19984 } else {
19985 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19986 }
19987
19988 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19989 DAG.getVectorIdxConstant(0, DL));
19990
19991 if (IsStrict)
19992 return DAG.getMergeValues({Res, Chain}, DL);
19993 return Res;
19994 }
19995
19996 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19997 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19998 if (VT != MVT::v4f32 || IsSigned)
19999 return SDValue();
20000
20001 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20002 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
20003 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20004 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20005 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20006 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20007 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
20008 SmallVector<SDValue, 4> SignCvts(4);
20009 SmallVector<SDValue, 4> Chains(4);
20010 for (int i = 0; i != 4; ++i) {
20011 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20012 DAG.getVectorIdxConstant(i, DL));
20013 if (IsStrict) {
20014 SignCvts[i] =
20015 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20016 {Op.getOperand(0), Elt});
20017 Chains[i] = SignCvts[i].getValue(1);
20018 } else {
20019 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20020 }
20021 }
20022 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20023
20024 SDValue Slow, Chain;
20025 if (IsStrict) {
20026 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20027 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20028 {Chain, SignCvt, SignCvt});
20029 Chain = Slow.getValue(1);
20030 } else {
20031 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20032 }
20033
20034 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20035 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20036
20037 if (IsStrict)
20038 return DAG.getMergeValues({Cvt, Chain}, DL);
20039
20040 return Cvt;
20041}
20042
20043static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
20044 SelectionDAG &DAG) {
20045 bool IsStrict = Op->isStrictFPOpcode();
20046 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20047 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20048 MVT VT = Op.getSimpleValueType();
20049 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20050
20051 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
20052 if (IsStrict)
20053 return DAG.getNode(
20054 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
20055 {Chain,
20056 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20057 Rnd});
20058 return DAG.getNode(ISD::FP_ROUND, dl, VT,
20059 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
20060}
20061
20062static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
20063 const X86Subtarget &Subtarget) {
20064 if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20065 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
20066 return true;
20067 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
20068 return true;
20069 }
20070 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20071 return true;
20072 if (Subtarget.useAVX512Regs()) {
20073 if (VT == MVT::v16i32)
20074 return true;
20075 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20076 return true;
20077 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20078 return true;
20079 }
20080 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20081 (VT == MVT::v2i64 || VT == MVT::v4i64))
20082 return true;
20083 return false;
20084}
20085
20086SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20087 SelectionDAG &DAG) const {
20088 bool IsStrict = Op->isStrictFPOpcode();
20089 unsigned OpNo = IsStrict ? 1 : 0;
20090 SDValue Src = Op.getOperand(OpNo);
20091 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20092 MVT SrcVT = Src.getSimpleValueType();
20093 MVT VT = Op.getSimpleValueType();
20094 SDLoc dl(Op);
20095
20096 if (isSoftF16(VT, Subtarget))
20097 return promoteXINT_TO_FP(Op, dl, DAG);
20098 else if (isLegalConversion(SrcVT, VT, true, Subtarget))
20099 return Op;
20100
20101 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20102 return LowerWin64_INT128_TO_FP(Op, DAG);
20103
20104 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20105 return Extract;
20106
20107 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20108 return R;
20109
20110 if (SrcVT.isVector()) {
20111 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20112 // Note: Since v2f64 is a legal type, we don't need to zero extend the
20113 // source for strict FP.
20114 if (IsStrict)
20115 return DAG.getNode(
20116 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20117 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20118 DAG.getUNDEF(SrcVT))});
20119 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20120 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20121 DAG.getUNDEF(SrcVT)));
20122 }
20123 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20124 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20125
20126 return SDValue();
20127 }
20128
20129 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20130 "Unknown SINT_TO_FP to lower!");
20131
20132 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20133
20134 // These are really Legal; return the operand so the caller accepts it as
20135 // Legal.
20136 if (SrcVT == MVT::i32 && UseSSEReg)
20137 return Op;
20138 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20139 return Op;
20140
20141 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20142 return V;
20143 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20144 return V;
20145
20146 // SSE doesn't have an i16 conversion so we need to promote.
20147 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20148 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20149 if (IsStrict)
20150 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20151 {Chain, Ext});
20152
20153 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20154 }
20155
20156 if (VT == MVT::f128 || !Subtarget.hasX87())
20157 return SDValue();
20158
20159 SDValue ValueToStore = Src;
20160 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20161 // Bitcasting to f64 here allows us to do a single 64-bit store from
20162 // an SSE register, avoiding the store forwarding penalty that would come
20163 // with two 32-bit stores.
20164 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20165
20166 unsigned Size = SrcVT.getStoreSize();
20167 Align Alignment(Size);
20168 MachineFunction &MF = DAG.getMachineFunction();
20169 auto PtrVT = getPointerTy(MF.getDataLayout());
20170 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20171 MachinePointerInfo MPI =
20172 MachinePointerInfo::getFixedStack(MF, SSFI);
20173 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20174 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20175 std::pair<SDValue, SDValue> Tmp =
20176 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20177
20178 if (IsStrict)
20179 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20180
20181 return Tmp.first;
20182}
20183
20184std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20185 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20186 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20187 // Build the FILD
20188 SDVTList Tys;
20189 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20190 if (useSSE)
20191 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20192 else
20193 Tys = DAG.getVTList(DstVT, MVT::Other);
20194
20195 SDValue FILDOps[] = {Chain, Pointer};
20196 SDValue Result =
20197 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20198 Alignment, MachineMemOperand::MOLoad);
20199 Chain = Result.getValue(1);
20200
20201 if (useSSE) {
20202 MachineFunction &MF = DAG.getMachineFunction();
20203 unsigned SSFISize = DstVT.getStoreSize();
20204 int SSFI =
20205 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20206 auto PtrVT = getPointerTy(MF.getDataLayout());
20207 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20208 Tys = DAG.getVTList(MVT::Other);
20209 SDValue FSTOps[] = {Chain, Result, StackSlot};
20210 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
20211 MachinePointerInfo::getFixedStack(MF, SSFI),
20212 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20213
20214 Chain =
20215 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20216 Result = DAG.getLoad(
20217 DstVT, DL, Chain, StackSlot,
20218 MachinePointerInfo::getFixedStack(MF, SSFI));
20219 Chain = Result.getValue(1);
20220 }
20221
20222 return { Result, Chain };
20223}
20224
20225/// Horizontal vector math instructions may be slower than normal math with
20226/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20227/// implementation, and likely shuffle complexity of the alternate sequence.
20228static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20229 const X86Subtarget &Subtarget) {
20230 bool IsOptimizingSize = DAG.shouldOptForSize();
20231 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20232 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20233}
20234
20235/// 64-bit unsigned integer to double expansion.
20236static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
20237 SelectionDAG &DAG,
20238 const X86Subtarget &Subtarget) {
20239 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20240 // when converting 0 while rounding toward negative infinity. The caller will
20241 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
20242 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20243 // This algorithm is not obvious. Here is what we're trying to output:
20244 /*
20245 movq %rax, %xmm0
20246 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20247 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20248 #ifdef __SSE3__
20249 haddpd %xmm0, %xmm0
20250 #else
20251 pshufd $0x4e, %xmm0, %xmm1
20252 addpd %xmm1, %xmm0
20253 #endif
20254 */
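 // Why the magic constants work (illustrative sketch): after the punpckldq,
 // the two doubles in %xmm0 have the bit patterns 0x43300000'<lo32> and
 // 0x45300000'<hi32>, i.e. the values 2^52 + lo32 and 2^84 + hi32 * 2^32
 // (0x4330000000000000 is 2^52, 0x4530000000000000 is 2^84). Subtracting
 // c1 = { 2^52, 2^84 } leaves { lo32, hi32 * 2^32 }, and the horizontal add
 // yields lo32 + hi32 * 2^32, the original u64 (with a single rounding if it
 // needs more than 53 bits). E.g. 0x0000000100000005 becomes
 // 5.0 + 1.0 * 2^32 = 4294967301.0.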
20255
20256 LLVMContext *Context = DAG.getContext();
20257
20258 // Build some magic constants.
20259 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20260 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20261 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20262 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20263
20264 SmallVector<Constant*,2> CV1;
20265 CV1.push_back(
20266 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20267 APInt(64, 0x4330000000000000ULL))));
20268 CV1.push_back(
20269 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20270 APInt(64, 0x4530000000000000ULL))));
20271 Constant *C1 = ConstantVector::get(CV1);
20272 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20273
20274 // Load the 64-bit value into an XMM register.
20275 SDValue XR1 =
20276 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20277 SDValue CLod0 = DAG.getLoad(
20278 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20279 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20280 SDValue Unpck1 =
20281 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20282
20283 SDValue CLod1 = DAG.getLoad(
20284 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20285 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20286 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20287 // TODO: Are there any fast-math-flags to propagate here?
20288 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20289 SDValue Result;
20290
20291 if (Subtarget.hasSSE3() &&
20292 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20293 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20294 } else {
20295 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20296 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20297 }
20298 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20299 DAG.getVectorIdxConstant(0, dl));
20300 return Result;
20301}
20302
20303/// 32-bit unsigned integer to float expansion.
20304static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
20305 SelectionDAG &DAG,
20306 const X86Subtarget &Subtarget) {
20307 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20308 // FP constant to bias correct the final result.
20309 SDValue Bias = DAG.getConstantFP(
20310 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
20311
20312 // Load the 32-bit value into an XMM register.
20313 SDValue Load =
20314 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20315
20316 // Zero out the upper parts of the register.
20317 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20318
20319 // Or the load with the bias.
20320 SDValue Or = DAG.getNode(
20321 ISD::OR, dl, MVT::v2i64,
20322 DAG.getBitcast(MVT::v2i64, Load),
20323 DAG.getBitcast(MVT::v2i64,
20324 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20325 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20326 DAG.getBitcast(MVT::v2f64, Or),
20327 DAG.getVectorIdxConstant(0, dl));
20328
20329 if (Op.getNode()->isStrictFPOpcode()) {
20330 // Subtract the bias.
20331 // TODO: Are there any fast-math-flags to propagate here?
20332 SDValue Chain = Op.getOperand(0);
20333 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20334 {Chain, Or, Bias});
20335
20336 if (Op.getValueType() == Sub.getValueType())
20337 return Sub;
20338
20339 // Handle final rounding.
20340 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20341 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20342
20343 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20344 }
20345
20346 // Subtract the bias.
20347 // TODO: Are there any fast-math-flags to propagate here?
20348 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20349
20350 // Handle final rounding.
20351 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20352}
20353
20354static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
20355 SelectionDAG &DAG,
20356 const X86Subtarget &Subtarget) {
20357 if (Op.getSimpleValueType() != MVT::v2f64)
20358 return SDValue();
20359
20360 bool IsStrict = Op->isStrictFPOpcode();
20361
20362 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20363 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20364
20365 if (Subtarget.hasAVX512()) {
20366 if (!Subtarget.hasVLX()) {
20367 // Let generic type legalization widen this.
20368 if (!IsStrict)
20369 return SDValue();
20370 // Otherwise pad the integer input with 0s and widen the operation.
20371 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20372 DAG.getConstant(0, DL, MVT::v2i32));
20373 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20374 {Op.getOperand(0), N0});
20375 SDValue Chain = Res.getValue(1);
20376 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20377 DAG.getVectorIdxConstant(0, DL));
20378 return DAG.getMergeValues({Res, Chain}, DL);
20379 }
20380
20381 // Legalize to v4i32 type.
20382 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20383 DAG.getUNDEF(MVT::v2i32));
20384 if (IsStrict)
20385 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20386 {Op.getOperand(0), N0});
20387 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20388 }
20389
20390 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20391 // This gives us the floating point equivalent of 2^52 + the i32 integer
20392 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20393 // point leaving just our i32 integers in double format.
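 // For example (illustrative): a lane holding 7 zero-extends to
 // 0x0000000000000007; OR'ing in 0x4330000000000000 yields the bit pattern of
 // the double 2^52 + 7, and subtracting 2^52.0 leaves exactly 7.0.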
20394 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20395 SDValue VBias = DAG.getConstantFP(
20396 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20397 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20398 DAG.getBitcast(MVT::v2i64, VBias));
20399 Or = DAG.getBitcast(MVT::v2f64, Or);
20400
20401 if (IsStrict)
20402 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20403 {Op.getOperand(0), Or, VBias});
20404 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20405}
20406
20407static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
20408 SelectionDAG &DAG,
20409 const X86Subtarget &Subtarget) {
20410 bool IsStrict = Op->isStrictFPOpcode();
20411 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20412 MVT VecIntVT = V.getSimpleValueType();
20413 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20414 "Unsupported custom type");
20415
20416 if (Subtarget.hasAVX512()) {
20417 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20418 assert(!Subtarget.hasVLX() && "Unexpected features");
20419 MVT VT = Op->getSimpleValueType(0);
20420
20421 // v8i32->v8f64 is legal with AVX512 so just return it.
20422 if (VT == MVT::v8f64)
20423 return Op;
20424
20425 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
20426 VT == MVT::v8f16) &&
20427 "Unexpected VT!");
20428 MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
20429 MVT WideIntVT = MVT::v16i32;
20430 if (VT == MVT::v4f64) {
20431 WideVT = MVT::v8f64;
20432 WideIntVT = MVT::v8i32;
20433 }
20434
20435 // Need to concat with zero vector for strict fp to avoid spurious
20436 // exceptions.
20437 SDValue Tmp =
20438 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20439 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20440 DAG.getVectorIdxConstant(0, DL));
20441 SDValue Res, Chain;
20442 if (IsStrict) {
20443 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20444 {Op->getOperand(0), V});
20445 Chain = Res.getValue(1);
20446 } else {
20447 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20448 }
20449
20450 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20451 DAG.getVectorIdxConstant(0, DL));
20452
20453 if (IsStrict)
20454 return DAG.getMergeValues({Res, Chain}, DL);
20455 return Res;
20456 }
20457
20458 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20459 Op->getSimpleValueType(0) == MVT::v4f64) {
20460 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20461 Constant *Bias = ConstantFP::get(
20462 *DAG.getContext(),
20463 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20464 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20465 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20466 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20467 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20468 SDValue VBias = DAG.getMemIntrinsicNode(
20469 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20470 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20471 MachineMemOperand::MOLoad);
20472
20473 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20474 DAG.getBitcast(MVT::v4i64, VBias));
20475 Or = DAG.getBitcast(MVT::v4f64, Or);
20476
20477 if (IsStrict)
20478 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20479 {Op.getOperand(0), Or, VBias});
20480 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20481 }
20482
20483 // The algorithm is the following:
20484 // #ifdef __SSE4_1__
20485 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20486 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20487 // (uint4) 0x53000000, 0xaa);
20488 // #else
20489 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20490 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20491 // #endif
20492 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20493 // return (float4) lo + fhi;
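 // Worked example (illustrative) for a lane v = 0x87654321:
 // lo = 0x4b004321, the float 2^23 + 0x4321
 // hi = 0x53008765, the float 2^39 + 0x8765 * 2^16
 // fhi = hi - (2^39 + 2^23) = 0x8765 * 2^16 - 2^23
 // lo + fhi = 0x4321 + 0x8765 * 2^16 = 0x87654321, rounded once to float.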
20494
20495 bool Is128 = VecIntVT == MVT::v4i32;
20496 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20497 // If we convert to something else than the supported type, e.g., to v4f64,
20498 // abort early.
20499 if (VecFloatVT != Op->getSimpleValueType(0))
20500 return SDValue();
20501
20502 // In the #ifdef/#else code, we have in common:
20503 // - The vector of constants:
20504 // -- 0x4b000000
20505 // -- 0x53000000
20506 // - A shift:
20507 // -- v >> 16
20508
20509 // Create the splat vector for 0x4b000000.
20510 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20511 // Create the splat vector for 0x53000000.
20512 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20513
20514 // Create the right shift.
20515 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20516 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20517
20518 SDValue Low, High;
20519 if (Subtarget.hasSSE41()) {
20520 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20521 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20522 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20523 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20524 // Low will be bitcasted right away, so do not bother bitcasting back to its
20525 // original type.
20526 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20527 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20528 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20529 // (uint4) 0x53000000, 0xaa);
20530 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20531 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20532 // High will be bitcasted right away, so do not bother bitcasting back to
20533 // its original type.
20534 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20535 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20536 } else {
20537 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20538 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20539 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20540 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20541
20542 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20543 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20544 }
20545
20546 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
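 // (0x53000080 is the IEEE-754 single-precision encoding of 2^39 + 2^23:
 // exponent field 166 with the mantissa bit of weight 2^-16 set.)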
20547 SDValue VecCstFSub = DAG.getConstantFP(
20548 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20549
20550 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20551 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20552 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20553 // enabled. See PR24512.
20554 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20555 // TODO: Are there any fast-math-flags to propagate here?
20556 // (float4) lo;
20557 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20558 // return (float4) lo + fhi;
20559 if (IsStrict) {
20560 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20561 {Op.getOperand(0), HighBitcast, VecCstFSub});
20562 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20563 {FHigh.getValue(1), LowBitcast, FHigh});
20564 }
20565
20566 SDValue FHigh =
20567 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20568 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20569}
20570
20571static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20572 const X86Subtarget &Subtarget) {
20573 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20574 SDValue N0 = Op.getOperand(OpNo);
20575 MVT SrcVT = N0.getSimpleValueType();
20576
20577 switch (SrcVT.SimpleTy) {
20578 default:
20579 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20580 case MVT::v2i32:
20581 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20582 case MVT::v4i32:
20583 case MVT::v8i32:
20584 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20585 case MVT::v2i64:
20586 case MVT::v4i64:
20587 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20588 }
20589}
20590
20591SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20592 SelectionDAG &DAG) const {
20593 bool IsStrict = Op->isStrictFPOpcode();
20594 unsigned OpNo = IsStrict ? 1 : 0;
20595 SDValue Src = Op.getOperand(OpNo);
20596 SDLoc dl(Op);
20597 auto PtrVT = getPointerTy(DAG.getDataLayout());
20598 MVT SrcVT = Src.getSimpleValueType();
20599 MVT DstVT = Op->getSimpleValueType(0);
20600 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20601
20602 // Bail out when we don't have native conversion instructions.
20603 if (DstVT == MVT::f128)
20604 return SDValue();
20605
20606 if (isSoftF16(DstVT, Subtarget))
20607 return promoteXINT_TO_FP(Op, dl, DAG);
20608 else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
20609 return Op;
20610
20611 if (DstVT.isVector())
20612 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20613
20614 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20615 return LowerWin64_INT128_TO_FP(Op, DAG);
20616
20617 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20618 return Extract;
20619
20620 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20621 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20622 // Conversions from unsigned i32 to f32/f64 are legal,
20623 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20624 return Op;
20625 }
20626
20627 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20628 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20629 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20630 if (IsStrict)
20631 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20632 {Chain, Src});
20633 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20634 }
20635
20636 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20637 return V;
20638 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20639 return V;
20640
20641 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20642 // infinity. It produces -0.0, so disable under strictfp.
20643 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20644 !IsStrict)
20645 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20646 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20647 // negative infinity, so disable it under strictfp and use FILD instead.
20648 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20649 !IsStrict)
20650 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20651 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20652 (DstVT == MVT::f32 || DstVT == MVT::f64))
20653 return SDValue();
20654
20655 // Make a 64-bit buffer, and use it to build an FILD.
20656 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20657 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20658 Align SlotAlign(8);
20659 MachinePointerInfo MPI =
20660 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20661 if (SrcVT == MVT::i32) {
20662 SDValue OffsetSlot =
20663 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
20664 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20665 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20666 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20667 std::pair<SDValue, SDValue> Tmp =
20668 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20669 if (IsStrict)
20670 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20671
20672 return Tmp.first;
20673 }
20674
20675 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20676 SDValue ValueToStore = Src;
20677 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20678 // Bitcasting to f64 here allows us to do a single 64-bit store from
20679 // an SSE register, avoiding the store forwarding penalty that would come
20680 // with two 32-bit stores.
20681 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20682 }
20683 SDValue Store =
20684 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20685 // For i64 source, we need to add the appropriate power of 2 if the input
20686 // was negative. We must be careful to do the computation in x87 extended
20687 // precision, not in SSE.
20688 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20689 SDValue Ops[] = {Store, StackSlot};
20690 SDValue Fild =
20691 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20692 SlotAlign, MachineMemOperand::MOLoad);
20693 Chain = Fild.getValue(1);
20694
20695 // Check whether the sign bit is set.
20696 SDValue SignSet = DAG.getSetCC(
20697 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20698 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20699
20700 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20701 APInt FF(64, 0x5F80000000000000ULL);
20702 SDValue FudgePtr =
20703 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20704 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20705
20706 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20707 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20708 SDValue Four = DAG.getIntPtrConstant(4, dl);
20709 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20710 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20711
20712 // Load the value out, extending it from f32 to f80.
20713 SDValue Fudge = DAG.getExtLoad(
20714 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20715 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20716 CPAlignment);
20717 Chain = Fudge.getValue(1);
20718 // Extend everything to 80 bits to force it to be done on x87.
20719 // TODO: Are there any fast-math-flags to propagate here?
20720 if (IsStrict) {
20721 unsigned Opc = ISD::STRICT_FADD;
20722 // Windows needs the precision control changed to 80bits around this add.
20723 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20724 Opc = X86ISD::STRICT_FP80_ADD;
20725
20726 SDValue Add =
20727 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
20728 // STRICT_FP_ROUND can't handle equal types.
20729 if (DstVT == MVT::f80)
20730 return Add;
20731 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20732 {Add.getValue(1), Add,
20733 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
20734 }
20735 unsigned Opc = ISD::FADD;
20736 // Windows needs the precision control changed to 80bits around this add.
20737 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20738 Opc = X86ISD::FP80_ADD;
20739
20740 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
20741 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20742 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
20743}
20744
20745// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20746// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20747// just return an SDValue().
20748// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20749// to i16, i32 or i64, and we lower it to a legal sequence and return the
20750// result.
20751SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20752 bool IsSigned,
20753 SDValue &Chain) const {
20754 bool IsStrict = Op->isStrictFPOpcode();
20755 SDLoc DL(Op);
20756
20757 EVT DstTy = Op.getValueType();
20758 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20759 EVT TheVT = Value.getValueType();
20760 auto PtrVT = getPointerTy(DAG.getDataLayout());
20761
20762 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20763 // f16 must be promoted before using the lowering in this routine.
20764 // fp128 does not use this lowering.
20765 return SDValue();
20766 }
20767
20768 // If using FIST to compute an unsigned i64, we'll need some fixup
20769 // to handle values above the maximum signed i64. A FIST is always
20770 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20771 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20772
20773 // FIXME: This does not generate an invalid exception if the input does not
20774 // fit in i32. PR44019
20775 if (!IsSigned && DstTy != MVT::i64) {
20776 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20777 // The low 32 bits of the fist result will have the correct uint32 result.
20778 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20779 DstTy = MVT::i64;
20780 }
20781
20782 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20783 DstTy.getSimpleVT() >= MVT::i16 &&
20784 "Unknown FP_TO_INT to lower!");
20785
20786 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20787 // stack slot.
20788 MachineFunction &MF = DAG.getMachineFunction();
20789 unsigned MemSize = DstTy.getStoreSize();
20790 int SSFI =
20791 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20792 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20793
20794 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20795
20796 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20797
20798 if (UnsignedFixup) {
20799 //
20800 // Conversion to unsigned i64 is implemented with a select,
20801 // depending on whether the source value fits in the range
20802 // of a signed i64. Let Thresh be the FP equivalent of
20803 // 0x8000000000000000ULL.
20804 //
20805 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
20806 // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
20807 // FistSrc = (Value - FltOfs);
20808 // Fist-to-mem64 FistSrc
20809 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20810 // to XOR'ing the high 32 bits with Adjust.
20811 //
20812 // Being a power of 2, Thresh is exactly representable in all FP formats.
20813 // For X87 we'd like to use the smallest FP type for this constant, but
20814 // for DAG type consistency we have to match the FP operand type.
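 //
 // Worked example (illustrative): Value = 1.5 * 2^63, i.e. 0xC000000000000000
 // as a u64. Value >= Thresh, so FltOfs = 2^63 and FistSrc = 2^62; the FIST
 // stores 0x4000000000000000, and XOR'ing with Adjust = 0x8000000000000000
 // restores 0xC000000000000000.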
20815
20816 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20817 APFloat::opStatus Status = APFloat::opOK;
20818 bool LosesInfo = false;
20819 if (TheVT == MVT::f64)
20820 // The rounding mode is irrelevant as the conversion should be exact.
20821 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20822 &LosesInfo);
20823 else if (TheVT == MVT::f80)
20824 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20825 APFloat::rmNearestTiesToEven, &LosesInfo);
20826
20827 assert(Status == APFloat::opOK && !LosesInfo &&
20828 "FP conversion should have been exact");
20829
20830 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20831
20832 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20833 *DAG.getContext(), TheVT);
20834 SDValue Cmp;
20835 if (IsStrict) {
20836 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20837 /*IsSignaling*/ true);
20838 Chain = Cmp.getValue(1);
20839 } else {
20840 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20841 }
20842
20843 // Our preferred lowering of
20844 //
20845 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20846 //
20847 // is
20848 //
20849 // (Value >= Thresh) << 63
20850 //
20851 // but since we can get here after LegalOperations, DAGCombine might do the
20852 // wrong thing if we create a select. So, directly create the preferred
20853 // version.
20854 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20855 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20856 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20857
20858 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20859 DAG.getConstantFP(0.0, DL, TheVT));
20860
20861 if (IsStrict) {
20862 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20863 { Chain, Value, FltOfs });
20864 Chain = Value.getValue(1);
20865 } else
20866 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20867 }
20868
20869 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20870
20871 // FIXME This causes a redundant load/store if the SSE-class value is already
20872 // in memory, such as if it is on the callstack.
20873 if (isScalarFPTypeInSSEReg(TheVT)) {
20874 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20875 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20876 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20877 SDValue Ops[] = { Chain, StackSlot };
20878
20879 unsigned FLDSize = TheVT.getStoreSize();
20880 assert(FLDSize <= MemSize && "Stack slot not big enough");
20881 MachineMemOperand *MMO = MF.getMachineMemOperand(
20882 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20883 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20884 Chain = Value.getValue(1);
20885 }
20886
20887 // Build the FP_TO_INT*_IN_MEM
20888 MachineMemOperand *MMO = MF.getMachineMemOperand(
20889 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20890 SDValue Ops[] = { Chain, Value, StackSlot };
20891 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20892 DAG.getVTList(MVT::Other),
20893 Ops, DstTy, MMO);
20894
20895 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20896 Chain = Res.getValue(1);
20897
20898 // If we need an unsigned fixup, XOR the result with adjust.
20899 if (UnsignedFixup)
20900 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20901
20902 return Res;
20903}
20904
20905static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20906 const X86Subtarget &Subtarget) {
20907 MVT VT = Op.getSimpleValueType();
20908 SDValue In = Op.getOperand(0);
20909 MVT InVT = In.getSimpleValueType();
20910 unsigned Opc = Op.getOpcode();
20911
20912 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20914 "Unexpected extension opcode");
20916 "Expected same number of elements");
20917 assert((VT.getVectorElementType() == MVT::i16 ||
20918 VT.getVectorElementType() == MVT::i32 ||
20919 VT.getVectorElementType() == MVT::i64) &&
20920 "Unexpected element type");
20921 assert((InVT.getVectorElementType() == MVT::i8 ||
20922 InVT.getVectorElementType() == MVT::i16 ||
20923 InVT.getVectorElementType() == MVT::i32) &&
20924 "Unexpected element type");
20925
20926 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20927
20928 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20929 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20930 return splitVectorIntUnary(Op, DAG, dl);
20931 }
20932
20933 if (Subtarget.hasInt256())
20934 return Op;
20935
20936 // Optimize vectors in AVX mode:
20937 //
20938 // v8i16 -> v8i32
20939 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20940 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20941 // Concat upper and lower parts.
20942 //
20943 // v4i32 -> v4i64
20944 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20945 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20946 // Concat upper and lower parts.
20947 //
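 // For example (illustrative): zero-extending v8i16 <a,b,c,d,e,f,g,h> to
 // v8i32 gives OpLo = <a,b,c,d> via the in-reg extend, while unpacking the
 // high half with a zero vector gives <e,0,f,0,g,0,h,0>, which reinterprets
 // as the v4i32 <e,f,g,h>; the two halves are then concatenated.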
20948 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20949 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20950
20951 // Short-circuit if we can determine that each 128-bit half is the same value.
20952 // Otherwise, this is difficult to match and optimize.
20953 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20954 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20955 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20956
20957 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20958 SDValue Undef = DAG.getUNDEF(InVT);
20959 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20960 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20961 OpHi = DAG.getBitcast(HalfVT, OpHi);
20962
20963 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20964}
20965
20966// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20967static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20968 const SDLoc &dl, SelectionDAG &DAG) {
20969 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20970 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20971 DAG.getVectorIdxConstant(0, dl));
20972 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20973 DAG.getVectorIdxConstant(8, dl));
20974 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20975 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20976 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20977 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20978}
20979
20980static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
20981 const X86Subtarget &Subtarget,
20982 SelectionDAG &DAG) {
20983 MVT VT = Op->getSimpleValueType(0);
20984 SDValue In = Op->getOperand(0);
20985 MVT InVT = In.getSimpleValueType();
20986 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20987 unsigned NumElts = VT.getVectorNumElements();
20988
20989 // For all vectors, but vXi8 we can just emit a sign_extend and a shift. This
20990 // avoids a constant pool load.
20991 if (VT.getVectorElementType() != MVT::i8) {
20992 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20993 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20994 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20995 }
20996
20997 // Extend VT if BWI is not supported.
20998 MVT ExtVT = VT;
20999 if (!Subtarget.hasBWI()) {
21000 // If v16i32 is to be avoided, we'll need to split and concatenate.
21001 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
21002 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
21003
21004 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
21005 }
21006
21007 // Widen to 512-bits if VLX is not supported.
21008 MVT WideVT = ExtVT;
21009 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
21010 NumElts *= 512 / ExtVT.getSizeInBits();
21011 InVT = MVT::getVectorVT(MVT::i1, NumElts);
21012 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
21013 DAG.getVectorIdxConstant(0, DL));
21014 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
21015 }
21016
21017 SDValue One = DAG.getConstant(1, DL, WideVT);
21018 SDValue Zero = DAG.getConstant(0, DL, WideVT);
21019
21020 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21021
21022 // Truncate if we had to extend above.
21023 if (VT != ExtVT) {
21024 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21025 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21026 }
21027
21028 // Extract back to 128/256-bit if we widened.
21029 if (WideVT != VT)
21030 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21031 DAG.getVectorIdxConstant(0, DL));
21032
21033 return SelectedVal;
21034}
21035
21036static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21037 SelectionDAG &DAG) {
21038 SDValue In = Op.getOperand(0);
21039 MVT SVT = In.getSimpleValueType();
21040 SDLoc DL(Op);
21041
21042 if (SVT.getVectorElementType() == MVT::i1)
21043 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
21044
21045 assert(Subtarget.hasAVX() && "Expected AVX support");
21046 return LowerAVXExtend(Op, DL, DAG, Subtarget);
21047}
21048
21049/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21050/// It makes use of the fact that vectors with enough leading sign/zero bits
21051/// prevent the PACKSS/PACKUS from saturating the results.
21052/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
21053/// within each 128-bit lane.
21054static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
21055 const SDLoc &DL, SelectionDAG &DAG,
21056 const X86Subtarget &Subtarget) {
21057 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
21058 "Unexpected PACK opcode");
21059 assert(DstVT.isVector() && "VT not a vector?");
21060
21061 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
21062 if (!Subtarget.hasSSE2())
21063 return SDValue();
21064
21065 EVT SrcVT = In.getValueType();
21066
21067 // No truncation required, we might get here due to recursive calls.
21068 if (SrcVT == DstVT)
21069 return In;
21070
21071 unsigned NumElems = SrcVT.getVectorNumElements();
21072 if (NumElems < 2 || !isPowerOf2_32(NumElems))
21073 return SDValue();
21074
21075 unsigned DstSizeInBits = DstVT.getSizeInBits();
21076 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
21077 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
21078 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
21079
21080 LLVMContext &Ctx = *DAG.getContext();
21081 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
21082 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21083
21084 // Pack to the largest type possible:
21085 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
21086 EVT InVT = MVT::i16, OutVT = MVT::i8;
21087 if (SrcVT.getScalarSizeInBits() > 16 &&
21088 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
21089 InVT = MVT::i32;
21090 OutVT = MVT::i16;
21091 }
21092
21093 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
21094 // On pre-AVX512, pack the src in both halves to help value tracking.
21095 if (SrcSizeInBits <= 128) {
21096 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21097 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21098 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
21099 SDValue LHS = DAG.getBitcast(InVT, In);
21100 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
21101 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
21102 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
21103 Res = DAG.getBitcast(PackedVT, Res);
21104 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21105 }
21106
21107 // Split lower/upper subvectors.
21108 SDValue Lo, Hi;
21109 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21110
21111 // If Hi is undef, then don't bother packing it and widen the result instead.
21112 if (Hi.isUndef()) {
21113 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
21114 if (SDValue Res =
21115 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
21116 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
21117 }
21118
21119 unsigned SubSizeInBits = SrcSizeInBits / 2;
21120 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21121 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21122
21123 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21124 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21125 Lo = DAG.getBitcast(InVT, Lo);
21126 Hi = DAG.getBitcast(InVT, Hi);
21127 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21128 return DAG.getBitcast(DstVT, Res);
21129 }
21130
21131 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21132 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21133 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21134 Lo = DAG.getBitcast(InVT, Lo);
21135 Hi = DAG.getBitcast(InVT, Hi);
21136 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21137
21138 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21139 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21140 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21141 SmallVector<int, 64> Mask;
21142 int Scale = 64 / OutVT.getScalarSizeInBits();
21143 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21144 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21145
21146 if (DstVT.is256BitVector())
21147 return DAG.getBitcast(DstVT, Res);
21148
21149 // If 512bit -> 128bit truncate another stage.
21150 Res = DAG.getBitcast(PackedVT, Res);
21151 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21152 }
21153
21154 // Recursively pack lower/upper subvectors, concat result and pack again.
21155 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21156
21157 if (PackedVT.is128BitVector()) {
21158 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
21159 // type legalization.
21160 SDValue Res =
21161 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
21162 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21163 }
21164
21165 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21166 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
21167 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
21168 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21169 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21170}
21171
21172/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
21173/// e.g. trunc <8 x i32> X to <8 x i16> -->
21174/// MaskX = X & 0xffff (clear high bits to prevent saturation)
21175/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
21176static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
21177 const X86Subtarget &Subtarget,
21178 SelectionDAG &DAG) {
21179 In = DAG.getZeroExtendInReg(In, DL, DstVT);
21180 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
21181}
21182
21183/// Truncate using inreg sign extension and X86ISD::PACKSS.
21184static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
21185 const X86Subtarget &Subtarget,
21186 SelectionDAG &DAG) {
21187 EVT SrcVT = In.getValueType();
21188 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
21189 DAG.getValueType(DstVT));
21190 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
21191}
21192
21193/// Helper to determine if \p In truncated to \p DstVT has the necessary
21194/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
21195/// possibly by converting a SRL node to SRA for sign extension.
21196static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
21197 SDValue In, const SDLoc &DL,
21198 SelectionDAG &DAG,
21199 const X86Subtarget &Subtarget,
21200 const SDNodeFlags Flags = SDNodeFlags()) {
21201 // Requires SSE2.
21202 if (!Subtarget.hasSSE2())
21203 return SDValue();
21204
21205 EVT SrcVT = In.getValueType();
21206 EVT DstSVT = DstVT.getVectorElementType();
21207 EVT SrcSVT = SrcVT.getVectorElementType();
21208 unsigned NumDstEltBits = DstSVT.getSizeInBits();
21209 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
21210
21211 // Check we have a truncation suited for PACKSS/PACKUS.
21212 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21213 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21214 return SDValue();
21215
21216 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
21217 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
21218
21219 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
21220 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
21221 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
21222 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
21223 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
21224 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
21225 return SDValue();
21226
21227 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
21228 // split this for packing.
21229 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
21230 !isFreeToSplitVector(In, DAG) &&
21231 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
21232 return SDValue();
21233
21234 // Don't truncate AVX512 targets as multiple PACK nodes stages.
21235 if (Subtarget.hasAVX512() && NumStages > 1)
21236 return SDValue();
21237
21238 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
21239 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21240
21241 // Truncate with PACKUS if we are truncating a vector with leading zero
21242 // bits that extend all the way to the packed/truncated value.
21243 // e.g. Masks, zext_in_reg, etc.
21244 // Pre-SSE41 we can only use PACKUSWB.
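 // For instance (illustrative): truncating (and v8i32 X, 255) to v8i16 can use
 // PACKUSDW because every element already fits in 16 unsigned bits, so the
 // pack cannot saturate.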
21245 KnownBits Known = DAG.computeKnownBits(In);
21246 if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
21247 (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
21248 PackOpcode = X86ISD::PACKUS;
21249 return In;
21250 }
21251
21252 // Truncate with PACKSS if we are truncating a vector with sign-bits
21253 // that extend all the way to the packed/truncated value.
21254 // e.g. Comparison result, sext_in_reg, etc.
21255 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
21256
21257 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
21258 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
21259 // see through BITCASTs later on and combines/simplifications can't then use
21260 // it.
21261 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
21262 !Subtarget.hasAVX512())
21263 return SDValue();
21264
21265 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
21266 if ((Flags.hasNoSignedWrap() && DstSVT != MVT::i32) ||
21267 MinSignBits < NumSignBits) {
21268 PackOpcode = X86ISD::PACKSS;
21269 return In;
21270 }
21271
21272 // If we have a srl that only generates signbits that we will discard in
21273 // the truncation then we can use PACKSS by converting the srl to a sra.
21274 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
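// For example, (trunc (srl X, 8)) from i16 to i8 keeps only bits [15:8] of X.
// Rewriting the srl as (sra X, 8) changes only the bits that the truncation
// discards, and the sign-extended value now lies in [-128, 127], so PACKSS
// yields the same low byte.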
21275 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
21276 if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) {
21277 if (*ShAmt == MinSignBits) {
21278 PackOpcode = X86ISD::PACKSS;
21279 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
21280 }
21281 }
21282
21283 return SDValue();
21284}
21285
21286 /// This function lowers a vector truncation of 'extended sign-bits' or
21287 /// 'extended zero-bits' values from vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32
21288 /// using X86ISD::PACKSS/PACKUS operations.
21289 static SDValue LowerTruncateVecPackWithSignBits(
21290 MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
21291 SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
21292 MVT SrcVT = In.getSimpleValueType();
21293 MVT DstSVT = DstVT.getVectorElementType();
21294 MVT SrcSVT = SrcVT.getVectorElementType();
21295 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21296 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21297 return SDValue();
21298
21299 // If the upper half of the source is undef, then attempt to split and
21300 // only truncate the lower half.
21301 if (DstVT.getSizeInBits() >= 128) {
21302 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21303 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21304 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
21305 Subtarget, DAG))
21306 return widenSubVector(Res, false, Subtarget, DAG, DL,
21307 DstVT.getSizeInBits());
21308 }
21309 }
21310
21311 unsigned PackOpcode;
21312 if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
21313 Subtarget, Flags))
21314 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
21315
21316 return SDValue();
21317}
21318
21319 /// This function lowers a vector truncation from vXi16/vXi32/vXi64 to
21320 /// vXi8/vXi16 into X86ISD::PACKUS/X86ISD::PACKSS operations.
21321 static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
21322 const X86Subtarget &Subtarget,
21323 SelectionDAG &DAG) {
21324 MVT SrcVT = In.getSimpleValueType();
21325 MVT DstSVT = DstVT.getVectorElementType();
21326 MVT SrcSVT = SrcVT.getVectorElementType();
21327 unsigned NumElems = DstVT.getVectorNumElements();
21328 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21329 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
21330 NumElems >= 8))
21331 return SDValue();
21332
21333 // SSSE3's pshufb results in fewer instructions in the cases below.
21334 if (Subtarget.hasSSSE3() && NumElems == 8) {
21335 if (SrcSVT == MVT::i16)
21336 return SDValue();
21337 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
21338 return SDValue();
21339 }
21340
21341 // If the upper half of the source is undef, then attempt to split and
21342 // only truncate the lower half.
21343 if (DstVT.getSizeInBits() >= 128) {
21344 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21345 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21346 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
21347 return widenSubVector(Res, false, Subtarget, DAG, DL,
21348 DstVT.getSizeInBits());
21349 }
21350 }
21351
21352 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
21353 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
21354 // truncate 2 x v4i32 to v8i16.
21355 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
21356 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
21357
21358 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
21359 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
21360
21361 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
21362 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
21363 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
21364 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
21365 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
21366 }
21367
21368 return SDValue();
21369}
21370
21371 static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
21372 SelectionDAG &DAG,
21373 const X86Subtarget &Subtarget) {
21374 MVT VT = Op.getSimpleValueType();
21375 SDValue In = Op.getOperand(0);
21376 MVT InVT = In.getSimpleValueType();
21377 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21378
21379 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
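// Truncation to i1 keeps only bit 0 of each element, so shifting that bit
// into the sign position lets a single signed compare against zero
// (0 > x, i.e. "sign bit set") read it back out as a mask.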
21380 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21381 if (InVT.getScalarSizeInBits() <= 16) {
21382 if (Subtarget.hasBWI()) {
21383 // legal, will go to VPMOVB2M, VPMOVW2M
21384 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21385 // We need to shift to get the lsb into sign position.
21386 // Shift packed bytes not supported natively, bitcast to word
21387 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21388 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21389 DAG.getBitcast(ExtVT, In),
21390 DAG.getConstant(ShiftInx, DL, ExtVT));
21391 In = DAG.getBitcast(InVT, In);
21392 }
21393 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21394 In, ISD::SETGT);
21395 }
21396 // Use TESTD/Q, extended vector to packed dword/qword.
21397 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21398 "Unexpected vector type.");
21399 unsigned NumElts = InVT.getVectorNumElements();
21400 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21401 // We need to change to a wider element type that we have support for.
21402 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21403 // For 16 element vectors we extend to v16i32 unless we are explicitly
21404 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21405 // we need to split into two 8 element vectors which we can extend to v8i32,
21406 // truncate and concat the results. There's an additional complication if
21407 // the original type is v16i8. In that case we can't split the v16i8
21408 // directly, so we need to shuffle high elements to low and use
21409 // sign_extend_vector_inreg.
21410 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21411 SDValue Lo, Hi;
21412 if (InVT == MVT::v16i8) {
21413 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21414 Hi = DAG.getVectorShuffle(
21415 InVT, DL, In, In,
21416 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21417 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21418 } else {
21419 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21420 Lo = extract128BitVector(In, 0, DAG, DL);
21421 Hi = extract128BitVector(In, 8, DAG, DL);
21422 }
21423 // We're split now, just emit two truncates and a concat. The two
21424 // truncates will trigger legalization to come back to this function.
21425 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21426 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21427 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21428 }
21429 // We either have 8 elements or we're allowed to use 512-bit vectors.
21430 // If we have VLX, we want to use the narrowest vector that can get the
21431 // job done so we use vXi32.
21432 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21433 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21434 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21435 InVT = ExtVT;
21436 ShiftInx = InVT.getScalarSizeInBits() - 1;
21437 }
21438
21439 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21440 // We need to shift to get the lsb into sign position.
21441 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21442 DAG.getConstant(ShiftInx, DL, InVT));
21443 }
21444 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21445 if (Subtarget.hasDQI())
21446 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21447 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21448}
21449
21450SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21451 SDLoc DL(Op);
21452 MVT VT = Op.getSimpleValueType();
21453 SDValue In = Op.getOperand(0);
21454 MVT InVT = In.getSimpleValueType();
21456 "Invalid TRUNCATE operation");
21457
21458 // If we're called by the type legalizer, handle a few cases.
21459 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21460 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
21461 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21462 VT.is128BitVector() && Subtarget.hasAVX512()) {
21463 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21464 "Unexpected subtarget!");
21465 // The default behavior is to truncate one step, concatenate, and then
21466 // truncate the remainder. We'd rather produce two 64-bit results and
21467 // concatenate those.
21468 SDValue Lo, Hi;
21469 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21470
21471 EVT LoVT, HiVT;
21472 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21473
21474 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21475 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21476 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21477 }
21478
21479 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21480 if (!Subtarget.hasAVX512() ||
21481 (InVT.is512BitVector() && VT.is256BitVector()))
21482 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21483 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21484 return SignPack;
21485
21486 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21487 if (!Subtarget.hasAVX512())
21488 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
21489
21490 // Otherwise let default legalization handle it.
21491 return SDValue();
21492 }
21493
21494 if (VT.getVectorElementType() == MVT::i1)
21495 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
21496
21497 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
21498 // concat from subvectors to use VPTRUNC etc.
21499 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In, DAG))
21500 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21501 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21502 return SignPack;
21503
21504 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21505 if (Subtarget.hasAVX512()) {
21506 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21507 assert(VT == MVT::v32i8 && "Unexpected VT!");
21508 return splitVectorIntUnary(Op, DAG, DL);
21509 }
21510
21511 // Word to byte is only legal under BWI. Otherwise we have to promote to v16i32
21512 // and then truncate that. But we should only do that if we haven't been
21513 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21514 // handled by isel patterns.
21515 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21516 Subtarget.canExtendTo512DQ())
21517 return Op;
21518 }
21519
21520 // Handle truncation of V256 to V128 using shuffles.
21521 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21522
21523 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21524 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21525 if (Subtarget.hasInt256()) {
21526 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21527 In = DAG.getBitcast(MVT::v8i32, In);
21528 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21529 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21530 DAG.getVectorIdxConstant(0, DL));
21531 }
21532
21533 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21534 DAG.getVectorIdxConstant(0, DL));
21535 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21536 DAG.getVectorIdxConstant(2, DL));
21537 static const int ShufMask[] = {0, 2, 4, 6};
21538 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
21539 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
21540 }
21541
21542 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21543 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21544 if (Subtarget.hasInt256()) {
21545 // The PSHUFB mask:
21546 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21547 -1, -1, -1, -1, -1, -1, -1, -1,
21548 16, 17, 20, 21, 24, 25, 28, 29,
21549 -1, -1, -1, -1, -1, -1, -1, -1 };
21550 In = DAG.getBitcast(MVT::v32i8, In);
21551 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21552 In = DAG.getBitcast(MVT::v4i64, In);
21553
21554 static const int ShufMask2[] = {0, 2, -1, -1};
21555 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21556 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21557 DAG.getVectorIdxConstant(0, DL));
21558 return DAG.getBitcast(MVT::v8i16, In);
21559 }
21560
21561 return Subtarget.hasSSE41()
21562 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
21563 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
21564 }
21565
21566 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
21567 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
21568
21569 llvm_unreachable("All 256->128 cases should have been handled above!");
21570}
21571
21572// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21573// behaves on out of range inputs to generate optimized conversions.
21574 static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21575 SelectionDAG &DAG,
21576 const X86Subtarget &Subtarget) {
21577 MVT SrcVT = Src.getSimpleValueType();
21578 unsigned DstBits = VT.getScalarSizeInBits();
21579 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21580
21581 // Calculate the converted result for values in the range 0 to
21582 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21583 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21584 SDValue Big =
21585 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21586 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21587 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21588
21589 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21590 // and only if the value was out of range. So we can use that
21591 // as our indicator that we should use "Big" instead of "Small".
21592 //
21593 // Use "Small" if "IsOverflown" has all bits cleared
21594 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21595
21596 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21597 // use the slightly slower blendv select instead.
21598 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21599 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21600 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21601 }
21602
21603 SDValue IsOverflown =
21604 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21605 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21606 return DAG.getNode(ISD::OR, dl, VT, Small,
21607 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21608}
21609
21610SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21611 bool IsStrict = Op->isStrictFPOpcode();
21612 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21613 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21614 bool HasVLX = Subtarget.hasVLX();
21615 MVT VT = Op->getSimpleValueType(0);
21616 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21617 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21618 MVT SrcVT = Src.getSimpleValueType();
21619 SDLoc dl(Op);
21620
21621 SDValue Res;
21622 if (isSoftF16(SrcVT, Subtarget)) {
21623 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21624 if (IsStrict)
21625 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21626 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21627 {NVT, MVT::Other}, {Chain, Src})});
21628 return DAG.getNode(Op.getOpcode(), dl, VT,
21629 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21630 } else if (isTypeLegal(SrcVT) &&
21631 isLegalConversion(VT, SrcVT, IsSigned, Subtarget)) {
21632 return Op;
21633 }
21634
21635 if (VT.isVector()) {
21636 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21637 MVT ResVT = MVT::v4i32;
21638 MVT TruncVT = MVT::v4i1;
21639 unsigned Opc;
21640 if (IsStrict)
21641 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21642 else
21643 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21644
21645 if (!IsSigned && !HasVLX) {
21646 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21647 // Widen to 512-bits.
21648 ResVT = MVT::v8i32;
21649 TruncVT = MVT::v8i1;
21650 Opc = Op.getOpcode();
21651 // Need to concat with zero vector for strict fp to avoid spurious
21652 // exceptions.
21653 // TODO: Should we just do this for non-strict as well?
21654 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21655 : DAG.getUNDEF(MVT::v8f64);
21656 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21657 DAG.getVectorIdxConstant(0, dl));
21658 }
21659 if (IsStrict) {
21660 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21661 Chain = Res.getValue(1);
21662 } else {
21663 Res = DAG.getNode(Opc, dl, ResVT, Src);
21664 }
21665
21666 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21667 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21668 DAG.getVectorIdxConstant(0, dl));
21669 if (IsStrict)
21670 return DAG.getMergeValues({Res, Chain}, dl);
21671 return Res;
21672 }
21673
21674 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21675 if ((HasVLX && (VT == MVT::v8i16 || VT == MVT::v16i16)) ||
21676 VT == MVT::v32i16)
21677 return Op;
21678
21679 MVT ResVT = VT;
21680 MVT EleVT = VT.getVectorElementType();
21681 if (EleVT != MVT::i64)
21682 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21683
21684 if (SrcVT == MVT::v2f16 || SrcVT == MVT::v4f16) {
21685 SDValue Tmp =
21686 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21687 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21688 Ops[0] = Src;
21689 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21690 }
21691
21692 if (!HasVLX) {
21693 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21694 // Widen to 512-bits.
21695 unsigned IntSize = EleVT.getSizeInBits();
21696 unsigned Num = IntSize > 16 ? 512 / IntSize : 32;
21697 ResVT = MVT::getVectorVT(EleVT, Num);
21698 Src = widenSubVector(MVT::getVectorVT(MVT::f16, Num), Src, IsStrict,
21699 Subtarget, DAG, dl);
21700 }
21701
21702 if (IsStrict) {
21703 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21704 : X86ISD::STRICT_CVTTP2UI,
21705 dl, {ResVT, MVT::Other}, {Chain, Src});
21706 Chain = Res.getValue(1);
21707 } else {
21708 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21709 ResVT, Src);
21710 }
21711
21712 // TODO: Need to add exception check code for strict FP.
21713 if (EleVT.getSizeInBits() < 16) {
21714 if (HasVLX)
21715 ResVT = MVT::getVectorVT(EleVT, 8);
21716 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21717 }
21718
21719 if (ResVT != VT)
21720 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21721 DAG.getVectorIdxConstant(0, dl));
21722
21723 if (IsStrict)
21724 return DAG.getMergeValues({Res, Chain}, dl);
21725 return Res;
21726 }
21727
21728 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21729 if (VT.getVectorElementType() == MVT::i16) {
21730 assert((SrcVT.getVectorElementType() == MVT::f32 ||
21731 SrcVT.getVectorElementType() == MVT::f64) &&
21732 "Expected f32/f64 vector!");
21733 MVT NVT = VT.changeVectorElementType(MVT::i32);
21734 if (IsStrict) {
21735 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21736 : ISD::STRICT_FP_TO_UINT,
21737 dl, {NVT, MVT::Other}, {Chain, Src});
21738 Chain = Res.getValue(1);
21739 } else {
21740 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21741 NVT, Src);
21742 }
21743
21744 // TODO: Need to add exception check code for strict FP.
21745 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21746
21747 if (IsStrict)
21748 return DAG.getMergeValues({Res, Chain}, dl);
21749 return Res;
21750 }
21751
21752 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21753 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21754 assert(!IsSigned && "Expected unsigned conversion!");
21755 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21756 return Op;
21757 }
21758
21759 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21760 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21761 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21762 Subtarget.useAVX512Regs()) {
21763 assert(!IsSigned && "Expected unsigned conversion!");
21764 assert(!Subtarget.hasVLX() && "Unexpected features!");
21765 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21766 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21767 // Need to concat with zero vector for strict fp to avoid spurious
21768 // exceptions.
21769 // TODO: Should we just do this for non-strict as well?
21770 SDValue Tmp =
21771 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21772 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21773 DAG.getVectorIdxConstant(0, dl));
21774
21775 if (IsStrict) {
21776 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21777 {Chain, Src});
21778 Chain = Res.getValue(1);
21779 } else {
21780 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21781 }
21782
21783 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21784 DAG.getVectorIdxConstant(0, dl));
21785
21786 if (IsStrict)
21787 return DAG.getMergeValues({Res, Chain}, dl);
21788 return Res;
21789 }
21790
21791 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21792 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21793 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21794 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21795 assert(!Subtarget.hasVLX() && "Unexpected features!");
21796 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21797 // Need to concat with zero vector for strict fp to avoid spurious
21798 // exceptions.
21799 // TODO: Should we just do this for non-strict as well?
21800 SDValue Tmp =
21801 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21802 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21803 DAG.getVectorIdxConstant(0, dl));
21804
21805 if (IsStrict) {
21806 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21807 {Chain, Src});
21808 Chain = Res.getValue(1);
21809 } else {
21810 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21811 }
21812
21813 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21814 DAG.getVectorIdxConstant(0, dl));
21815
21816 if (IsStrict)
21817 return DAG.getMergeValues({Res, Chain}, dl);
21818 return Res;
21819 }
21820
21821 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21822 if (!Subtarget.hasVLX()) {
21823 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21824 // legalizer and then widened again by vector op legalization.
21825 if (!IsStrict)
21826 return SDValue();
21827
21828 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21829 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21830 {Src, Zero, Zero, Zero});
21831 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21832 {Chain, Tmp});
21833 SDValue Chain = Tmp.getValue(1);
21834 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21835 DAG.getVectorIdxConstant(0, dl));
21836 return DAG.getMergeValues({Tmp, Chain}, dl);
21837 }
21838
21839 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21840 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21841 DAG.getUNDEF(MVT::v2f32));
21842 if (IsStrict) {
21843 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21844 : X86ISD::STRICT_CVTTP2UI;
21845 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21846 }
21847 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21848 return DAG.getNode(Opc, dl, VT, Tmp);
21849 }
21850
21851 // Generate optimized instructions for pre AVX512 unsigned conversions from
21852 // vXf32 to vXi32.
21853 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21854 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21855 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21856 assert(!IsSigned && "Expected unsigned conversion!");
21857 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21858 }
21859
21860 return SDValue();
21861 }
21862
21863 assert(!VT.isVector());
21864
21865 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21866
21867 if (!IsSigned && UseSSEReg) {
21868 // Conversions from f32/f64 with AVX512 should be legal.
21869 if (Subtarget.hasAVX512())
21870 return Op;
21871
21872 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21873 // behaves on out of range inputs to generate optimized conversions.
21874 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21875 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21876 unsigned DstBits = VT.getScalarSizeInBits();
21877 APInt UIntLimit = APInt::getSignMask(DstBits);
21878 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21879 DAG.getConstant(UIntLimit, dl, VT));
21880 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21881
21882 // Calculate the converted result for values in the range:
21883 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21884 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21885 SDValue Small =
21886 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21887 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21888 SDValue Big = DAG.getNode(
21889 X86ISD::CVTTS2SI, dl, VT,
21890 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21891 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21892
21893 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21894 // and only if the value was out of range. So we can use that
21895 // as our indicator that we should use "Big" instead of "Small".
21896 //
21897 // Use "Small" if "IsOverflown" has all bits cleared
21898 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21899 SDValue IsOverflown = DAG.getNode(
21900 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21901 return DAG.getNode(ISD::OR, dl, VT, Small,
21902 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21903 }
21904
21905 // Use default expansion for i64.
21906 if (VT == MVT::i64)
21907 return SDValue();
21908
21909 assert(VT == MVT::i32 && "Unexpected VT!");
21910
21911 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21912 // FIXME: This does not generate an invalid exception if the input does not
21913 // fit in i32. PR44019
21914 if (Subtarget.is64Bit()) {
21915 if (IsStrict) {
21916 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21917 {Chain, Src});
21918 Chain = Res.getValue(1);
21919 } else
21920 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21921
21922 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21923 if (IsStrict)
21924 return DAG.getMergeValues({Res, Chain}, dl);
21925 return Res;
21926 }
21927
21928 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21929 // use fisttp which will be handled later.
21930 if (!Subtarget.hasSSE3())
21931 return SDValue();
21932 }
21933
21934 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21935 // FIXME: This does not generate an invalid exception if the input does not
21936 // fit in i16. PR44019
21937 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21938 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21939 if (IsStrict) {
21940 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21941 {Chain, Src});
21942 Chain = Res.getValue(1);
21943 } else
21944 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21945
21946 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21947 if (IsStrict)
21948 return DAG.getMergeValues({Res, Chain}, dl);
21949 return Res;
21950 }
21951
21952 // If this is a FP_TO_SINT using SSEReg we're done.
21953 if (UseSSEReg && IsSigned)
21954 return Op;
21955
21956 // fp128 needs to use a libcall.
21957 if (SrcVT == MVT::f128) {
21958 RTLIB::Libcall LC;
21959 if (IsSigned)
21960 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21961 else
21962 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21963
21964 MakeLibCallOptions CallOptions;
21965 std::pair<SDValue, SDValue> Tmp =
21966 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21967
21968 if (IsStrict)
21969 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21970
21971 return Tmp.first;
21972 }
21973
21974 // Fall back to X87.
21975 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21976 if (IsStrict)
21977 return DAG.getMergeValues({V, Chain}, dl);
21978 return V;
21979 }
21980
21981 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21982}
21983
21984SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21985 SelectionDAG &DAG) const {
21986 SDValue Src = Op.getOperand(0);
21987 EVT DstVT = Op.getSimpleValueType();
21988 MVT SrcVT = Src.getSimpleValueType();
21989
21990 if (SrcVT.isVector())
21991 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21992
21993 if (SrcVT == MVT::f16)
21994 return SDValue();
21995
21996 // If the source is in an SSE register, the node is Legal.
21997 if (isScalarFPTypeInSSEReg(SrcVT))
21998 return Op;
21999
22000 return LRINT_LLRINTHelper(Op.getNode(), DAG);
22001}
22002
22003SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
22004 SelectionDAG &DAG) const {
22005 EVT DstVT = N->getValueType(0);
22006 SDValue Src = N->getOperand(0);
22007 EVT SrcVT = Src.getValueType();
22008
22009 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
22010 // f16 must be promoted before using the lowering in this routine.
22011 // fp128 does not use this lowering.
22012 return SDValue();
22013 }
22014
22015 SDLoc DL(N);
22016 SDValue Chain = DAG.getEntryNode();
22017
22018 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
22019
22020 // If we're converting from SSE, the stack slot needs to hold both types.
22021 // Otherwise it only needs to hold the DstVT.
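// In other words: an SSE source is spilled to the slot, reloaded onto the
// x87 stack with FLD, and FIST then stores the value rounded with the
// current rounding mode, which is the semantics lrint/llrint require.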
22022 EVT OtherVT = UseSSE ? SrcVT : DstVT;
22023 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
22024 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
22025 MachinePointerInfo MPI =
22026 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
22027
22028 if (UseSSE) {
22029 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
22030 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
22031 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22032 SDValue Ops[] = { Chain, StackPtr };
22033
22034 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
22035 /*Align*/ std::nullopt,
22036 MachineMemOperand::MOLoad);
22037 Chain = Src.getValue(1);
22038 }
22039
22040 SDValue StoreOps[] = { Chain, Src, StackPtr };
22041 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
22042 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
22043 MachineMemOperand::MOStore);
22044
22045 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
22046}
22047
22048SDValue
22049X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
22050 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
22051 // but making use of X86 specifics to produce better instruction sequences.
22052 SDNode *Node = Op.getNode();
22053 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
22054 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
22055 SDLoc dl(SDValue(Node, 0));
22056 SDValue Src = Node->getOperand(0);
22057
22058 // There are three types involved here: SrcVT is the source floating point
22059 // type, DstVT is the type of the result, and TmpVT is the result of the
22060 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
22061 // DstVT).
22062 EVT SrcVT = Src.getValueType();
22063 EVT DstVT = Node->getValueType(0);
22064 EVT TmpVT = DstVT;
22065
22066 // This code is only for floats and doubles. Fall back to generic code for
22067 // anything else.
22068 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
22069 return SDValue();
22070
22071 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
22072 unsigned SatWidth = SatVT.getScalarSizeInBits();
22073 unsigned DstWidth = DstVT.getScalarSizeInBits();
22074 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
22075 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
22076 "Expected saturation width smaller than result width");
22077
22078 // Promote result of FP_TO_*INT to at least 32 bits.
22079 if (TmpWidth < 32) {
22080 TmpVT = MVT::i32;
22081 TmpWidth = 32;
22082 }
22083
22084 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
22085 // us to use a native signed conversion instead.
22086 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
22087 TmpVT = MVT::i64;
22088 TmpWidth = 64;
22089 }
22090
22091 // If the saturation width is smaller than the size of the temporary result,
22092 // we can always use signed conversion, which is native.
22093 if (SatWidth < TmpWidth)
22094 FpToIntOpcode = ISD::FP_TO_SINT;
22095
22096 // Determine minimum and maximum integer values and their corresponding
22097 // floating-point values.
22098 APInt MinInt, MaxInt;
22099 if (IsSigned) {
22100 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
22101 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
22102 } else {
22103 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
22104 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
22105 }
22106
22107 const fltSemantics &Sem = SrcVT.getFltSemantics();
22108 APFloat MinFloat(Sem);
22109 APFloat MaxFloat(Sem);
22110
22111 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
22112 MinInt, IsSigned, APFloat::rmTowardZero);
22113 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
22114 MaxInt, IsSigned, APFloat::rmTowardZero);
22115 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
22116 && !(MaxStatus & APFloat::opStatus::opInexact);
22117
22118 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
22119 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
22120
22121 // If the integer bounds are exactly representable as floats, emit a
22122 // min+max+fptoi sequence. Otherwise use comparisons and selects.
22123 if (AreExactFloatBounds) {
22124 if (DstVT != TmpVT) {
22125 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
22126 SDValue MinClamped = DAG.getNode(
22127 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
22128 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
22129 SDValue BothClamped = DAG.getNode(
22130 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
22131 // Convert clamped value to integer.
22132 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
22133
22134 // NaN will become INDVAL, with the top bit set and the rest zero.
22135 // Truncation will discard the top bit, resulting in zero.
22136 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22137 }
22138
22139 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
22140 SDValue MinClamped = DAG.getNode(
22141 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
22142 // Clamp by MaxFloat from above. NaN cannot occur.
22143 SDValue BothClamped = DAG.getNode(
22144 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
22145 // Convert clamped value to integer.
22146 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
22147
22148 if (!IsSigned) {
22149 // In the unsigned case we're done, because we mapped NaN to MinFloat,
22150 // which is zero.
22151 return FpToInt;
22152 }
22153
22154 // Otherwise, select zero if Src is NaN.
22155 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22156 return DAG.getSelectCC(
22157 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
22158 }
22159
22160 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
22161 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
22162
22163 // Result of direct conversion, which may be selected away.
22164 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
22165
22166 if (DstVT != TmpVT) {
22167 // NaN will become INDVAL, with the top bit set and the rest zero.
22168 // Truncation will discard the top bit, resulting in zero.
22169 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22170 }
22171
22172 SDValue Select = FpToInt;
22173 // For signed conversions where we saturate to the same size as the
22174 // result type of the fptoi instructions, INDVAL coincides with integer
22175 // minimum, so we don't need to explicitly check it.
22176 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
22177 // If Src ULT MinFloat, select MinInt. In particular, this also selects
22178 // MinInt if Src is NaN.
22179 Select = DAG.getSelectCC(
22180 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
22181 }
22182
22183 // If Src OGT MaxFloat, select MaxInt.
22184 Select = DAG.getSelectCC(
22185 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
22186
22187 // In the unsigned case we are done, because we mapped NaN to MinInt, which
22188 // is already zero. The promoted case was already handled above.
22189 if (!IsSigned || DstVT != TmpVT) {
22190 return Select;
22191 }
22192
22193 // Otherwise, select 0 if Src is NaN.
22194 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22195 return DAG.getSelectCC(
22196 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
22197}
22198
22199SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
22200 bool IsStrict = Op->isStrictFPOpcode();
22201
22202 SDLoc DL(Op);
22203 MVT VT = Op.getSimpleValueType();
22204 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22205 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22206 MVT SVT = In.getSimpleValueType();
22207
22208 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
22209 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
22210 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
22211 !Subtarget.getTargetTriple().isOSDarwin()))
22212 return SDValue();
22213
22214 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
22215 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
22216 return Op;
22217
22218 if (SVT == MVT::f16) {
22219 if (Subtarget.hasFP16())
22220 return Op;
22221
22222 if (VT != MVT::f32) {
22223 if (IsStrict)
22224 return DAG.getNode(
22225 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
22226 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
22227 {MVT::f32, MVT::Other}, {Chain, In})});
22228
22229 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
22230 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
22231 }
22232
22233 if (!Subtarget.hasF16C()) {
22234 if (!Subtarget.getTargetTriple().isOSDarwin())
22235 return SDValue();
22236
22237 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
22238
22239 // Need a libcall, but ABI for f16 is soft-float on MacOS.
22240 TargetLowering::CallLoweringInfo CLI(DAG);
22241 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22242
22243 In = DAG.getBitcast(MVT::i16, In);
22244 TargetLowering::ArgListTy Args;
22245 TargetLowering::ArgListEntry Entry(
22246 In, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()));
22247 Entry.IsSExt = false;
22248 Entry.IsZExt = true;
22249 Args.push_back(Entry);
22250
22251 SDValue Callee = DAG.getExternalSymbol(
22252 getLibcallName(RTLIB::FPEXT_F16_F32),
22253 getPointerTy(DAG.getDataLayout()));
22254 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22255 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
22256 std::move(Args));
22257
22258 SDValue Res;
22259 std::tie(Res,Chain) = LowerCallTo(CLI);
22260 if (IsStrict)
22261 Res = DAG.getMergeValues({Res, Chain}, DL);
22262
22263 return Res;
22264 }
22265
22266 In = DAG.getBitcast(MVT::i16, In);
22267 SDValue Res;
22268 if (IsStrict) {
22269 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
22270 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
22271 DAG.getVectorIdxConstant(0, DL));
22272 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
22273 {Chain, In});
22274 Chain = Res.getValue(1);
22275 } else {
22276 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
22277 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
22278 DAG.getUNDEF(MVT::v4i32), In,
22279 DAG.getVectorIdxConstant(0, DL));
22280 In = DAG.getBitcast(MVT::v8i16, In);
22281 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
22282 DAG.getTargetConstant(4, DL, MVT::i32));
22283 }
22284 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
22285 DAG.getVectorIdxConstant(0, DL));
22286 if (IsStrict)
22287 return DAG.getMergeValues({Res, Chain}, DL);
22288 return Res;
22289 }
22290
22291 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
22292 return Op;
22293
22294 if (SVT.getVectorElementType() == MVT::f16) {
22295 if (Subtarget.hasFP16() && isTypeLegal(SVT))
22296 return Op;
22297 assert(Subtarget.hasF16C() && "Unexpected features!");
22298 if (SVT == MVT::v2f16)
22299 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
22300 DAG.getUNDEF(MVT::v2f16));
22301 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
22302 DAG.getUNDEF(MVT::v4f16));
22303 if (IsStrict)
22304 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22305 {Op->getOperand(0), Res});
22306 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22307 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
22308 return Op;
22309 }
22310
22311 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
22312
22313 SDValue Res =
22314 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
22315 if (IsStrict)
22316 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22317 {Op->getOperand(0), Res});
22318 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22319}
22320
22321SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
22322 bool IsStrict = Op->isStrictFPOpcode();
22323
22324 SDLoc DL(Op);
22325 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22326 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22327 MVT VT = Op.getSimpleValueType();
22328 MVT SVT = In.getSimpleValueType();
22329
22330 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
22331 return SDValue();
22332
22333 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
22334 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
22335 if (!Subtarget.getTargetTriple().isOSDarwin())
22336 return SDValue();
22337
22338 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
22339 TargetLowering::CallLoweringInfo CLI(DAG);
22340 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22341
22342 TargetLowering::ArgListTy Args;
22343 TargetLowering::ArgListEntry Entry(
22344 In, EVT(SVT).getTypeForEVT(*DAG.getContext()));
22345 Entry.IsSExt = false;
22346 Entry.IsZExt = true;
22347 Args.push_back(Entry);
22348
22349 SDValue Callee = DAG.getExternalSymbol(
22350 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
22351 : RTLIB::FPROUND_F32_F16),
22352 getPointerTy(DAG.getDataLayout()));
22353 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22354 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
22355 std::move(Args));
22356
22357 SDValue Res;
22358 std::tie(Res, Chain) = LowerCallTo(CLI);
22359
22360 Res = DAG.getBitcast(MVT::f16, Res);
22361
22362 if (IsStrict)
22363 Res = DAG.getMergeValues({Res, Chain}, DL);
22364
22365 return Res;
22366 }
22367
22368 if (VT.getScalarType() == MVT::bf16) {
22369 if (SVT.getScalarType() == MVT::f32 &&
22370 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22371 Subtarget.hasAVXNECONVERT()))
22372 return Op;
22373 return SDValue();
22374 }
22375
22376 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
22377 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
22378 return SDValue();
22379
22380 if (VT.isVector())
22381 return Op;
22382
22383 SDValue Res;
22384 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
22385 MVT::i32);
22386 if (IsStrict) {
22387 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
22388 DAG.getConstantFP(0, DL, MVT::v4f32), In,
22389 DAG.getVectorIdxConstant(0, DL));
22390 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
22391 {Chain, Res, Rnd});
22392 Chain = Res.getValue(1);
22393 } else {
22394 // FIXME: Should we use zeros for upper elements for non-strict?
22395 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
22396 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
22397 }
22398
22399 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22400 DAG.getVectorIdxConstant(0, DL));
22401 Res = DAG.getBitcast(MVT::f16, Res);
22402
22403 if (IsStrict)
22404 return DAG.getMergeValues({Res, Chain}, DL);
22405
22406 return Res;
22407 }
22408
22409 return Op;
22410}
22411
22412 static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22413 bool IsStrict = Op->isStrictFPOpcode();
22414 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22415 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22416 "Unexpected VT!");
22417
22418 SDLoc dl(Op);
22419 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22420 DAG.getConstant(0, dl, MVT::v8i16), Src,
22421 DAG.getVectorIdxConstant(0, dl));
22422
22423 SDValue Chain;
22424 if (IsStrict) {
22425 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22426 {Op.getOperand(0), Res});
22427 Chain = Res.getValue(1);
22428 } else {
22429 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22430 }
22431
22432 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22433 DAG.getVectorIdxConstant(0, dl));
22434
22435 if (IsStrict)
22436 return DAG.getMergeValues({Res, Chain}, dl);
22437
22438 return Res;
22439}
22440
22441 static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22442 bool IsStrict = Op->isStrictFPOpcode();
22443 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22444 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22445 "Unexpected VT!");
22446
22447 SDLoc dl(Op);
22448 SDValue Res, Chain;
22449 if (IsStrict) {
22450 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22451 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22452 DAG.getVectorIdxConstant(0, dl));
22453 Res = DAG.getNode(
22454 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22455 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22456 Chain = Res.getValue(1);
22457 } else {
22458 // FIXME: Should we use zeros for upper elements for non-strict?
22459 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22460 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22461 DAG.getTargetConstant(4, dl, MVT::i32));
22462 }
22463
22464 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22465 DAG.getVectorIdxConstant(0, dl));
22466
22467 if (IsStrict)
22468 return DAG.getMergeValues({Res, Chain}, dl);
22469
22470 return Res;
22471}
22472
22473SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
22474 SelectionDAG &DAG) const {
22475 SDLoc DL(Op);
22476
22477 MVT SVT = Op.getOperand(0).getSimpleValueType();
22478 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22479 Subtarget.hasAVXNECONVERT())) {
22480 SDValue Res;
22481 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
22482 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
22483 Res = DAG.getBitcast(MVT::v8i16, Res);
22484 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22485 DAG.getVectorIdxConstant(0, DL));
22486 }
22487
22488 MakeLibCallOptions CallOptions;
22489 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
22490 SDValue Res =
22491 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
22492 return DAG.getBitcast(MVT::i16, Res);
22493}
22494
22495/// Depending on uarch and/or optimizing for size, we might prefer to use a
22496/// vector operation in place of the typical scalar operation.
22497 static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
22498 SelectionDAG &DAG,
22499 const X86Subtarget &Subtarget) {
22500 // If both operands have other uses, this is probably not profitable.
22501 SDValue LHS = Op.getOperand(0);
22502 SDValue RHS = Op.getOperand(1);
22503 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22504 return Op;
22505
22506 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22507 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22508 if (IsFP && !Subtarget.hasSSE3())
22509 return Op;
22510 if (!IsFP && !Subtarget.hasSSSE3())
22511 return Op;
22512
22513 // Extract from a common vector.
22514 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22515 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22516 LHS.getOperand(0) != RHS.getOperand(0) ||
22517 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22518 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22519 !shouldUseHorizontalOp(true, DAG, Subtarget))
22520 return Op;
22521
22522 // Allow commuted 'hadd' ops.
22523 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22524 unsigned HOpcode;
22525 switch (Op.getOpcode()) {
22526 // clang-format off
22527 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22528 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22529 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22530 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22531 default:
22532 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22533 // clang-format on
22534 }
22535 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22536 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22537 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22538 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22539 std::swap(LExtIndex, RExtIndex);
22540
22541 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22542 return Op;
22543
22544 SDValue X = LHS.getOperand(0);
22545 EVT VecVT = X.getValueType();
22546 unsigned BitWidth = VecVT.getSizeInBits();
22547 unsigned NumLanes = BitWidth / 128;
22548 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22549 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22550 "Not expecting illegal vector widths here");
22551
22552 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22553 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22554 if (BitWidth == 256 || BitWidth == 512) {
22555 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22556 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22557 LExtIndex %= NumEltsPerLane;
22558 }
22559
22560 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22561 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22562 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22563 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22564 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22565 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22566 DAG.getVectorIdxConstant(LExtIndex / 2, DL));
22567}
22568
22569/// Depending on uarch and/or optimizing for size, we might prefer to use a
22570/// vector operation in place of the typical scalar operation.
22571SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22572 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22573 "Only expecting float/double");
22574 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
22575}
22576
22577/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22578/// This mode isn't supported in hardware on X86. But as long as we aren't
22579/// compiling with trapping math, we can emulate this with
22580/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
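/// For example, for X == 0.49999997f (the largest float below 0.5), adding a
/// plain 0.5 would, under the default round-to-nearest mode, round the sum up
/// to 1.0 and the final result would incorrectly be 1.0; adding
/// nextafter(0.5, 0.0) keeps the sum below 1.0, so the truncation correctly
/// yields 0.0.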
22581 static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22582 SDValue N0 = Op.getOperand(0);
22583 SDLoc dl(Op);
22584 MVT VT = Op.getSimpleValueType();
22585
22586 // N0 += copysign(nextafter(0.5, 0.0), N0)
22587 const fltSemantics &Sem = VT.getFltSemantics();
22588 bool Ignored;
22589 APFloat Point5Pred = APFloat(0.5f);
22590 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22591 Point5Pred.next(/*nextDown*/true);
22592
22593 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22594 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22595 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22596
22597 // Truncate the result to remove fraction.
22598 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22599}
22600
22601/// The only differences between FABS and FNEG are the mask and the logic op.
22602/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22603 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22604 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22605 "Wrong opcode for lowering FABS or FNEG.");
22606
22607 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22608
22609 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22610 // into an FNABS. We'll lower the FABS after that if it is still in use.
22611 if (IsFABS)
22612 for (SDNode *User : Op->users())
22613 if (User->getOpcode() == ISD::FNEG)
22614 return Op;
22615
22616 SDLoc dl(Op);
22617 MVT VT = Op.getSimpleValueType();
22618
22619 bool IsF128 = (VT == MVT::f128);
22620 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22622 "Unexpected type in LowerFABSorFNEG");
22623
22624 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
22625 // decide if we should generate a 16-byte constant mask when we only need 4 or
22626 // 8 bytes for the scalar case.
22627
22628 // There are no scalar bitwise logical SSE/AVX instructions, so we
22629 // generate a 16-byte vector constant and logic op even for the scalar case.
22630 // Using a 16-byte mask allows folding the load of the mask with
22631 // the logic op, so it can save (~4 bytes) on code size.
22632 bool IsFakeVector = !VT.isVector() && !IsF128;
22633 MVT LogicVT = VT;
22634 if (IsFakeVector)
22635 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22636 : (VT == MVT::f32) ? MVT::v4f32
22637 : MVT::v8f16;
22638
22639 unsigned EltBits = VT.getScalarSizeInBits();
22640 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
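// e.g. for f32: FABS becomes (X86ISD::FAND X, 0x7fffffff), FNEG becomes
// (X86ISD::FXOR X, 0x80000000), and FNEG(FABS(X)) folds to
// (X86ISD::FOR X, 0x80000000).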
22641 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22642 APInt::getSignMask(EltBits);
22643 const fltSemantics &Sem = VT.getFltSemantics();
22644 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22645
22646 SDValue Op0 = Op.getOperand(0);
22647 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22648 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22649 IsFNABS ? X86ISD::FOR :
22650 X86ISD::FXOR;
22651 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22652
22653 if (VT.isVector() || IsF128)
22654 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22655
22656 // For the scalar case extend to a 128-bit vector, perform the logic op,
22657 // and extract the scalar result back out.
22658 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22659 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22660 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22661 DAG.getVectorIdxConstant(0, dl));
22662}
22663
22664 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22665 SDValue Mag = Op.getOperand(0);
22666 SDValue Sign = Op.getOperand(1);
22667 SDLoc dl(Op);
22668
22669 // If the sign operand is smaller, extend it first.
22670 MVT VT = Op.getSimpleValueType();
22671 if (Sign.getSimpleValueType().bitsLT(VT))
22672 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22673
22674 // And if it is bigger, shrink it first.
22675 if (Sign.getSimpleValueType().bitsGT(VT))
22676 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
22677 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22678
22679 // At this point the operands and the result should have the same
22680 // type, and that won't be f80 since that is not custom lowered.
22681 bool IsF128 = (VT == MVT::f128);
22682 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22684 "Unexpected type in LowerFCOPYSIGN");
22685
22686 const fltSemantics &Sem = VT.getFltSemantics();
22687
22688 // Perform all scalar logic operations as 16-byte vectors because there are no
22689 // scalar FP logic instructions in SSE.
22690 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22691 // unnecessary splats, but we might miss load folding opportunities. Should
22692 // this decision be based on OptimizeForSize?
22693 bool IsFakeVector = !VT.isVector() && !IsF128;
22694 MVT LogicVT = VT;
22695 if (IsFakeVector)
22696 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22697 : (VT == MVT::f32) ? MVT::v4f32
22698 : MVT::v8f16;
22699
22700 // The mask constants are automatically splatted for vector types.
22701 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22702 SDValue SignMask = DAG.getConstantFP(
22703 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22704 SDValue MagMask = DAG.getConstantFP(
22705 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
22706
22707 // First, clear all bits but the sign bit from the second operand (sign).
22708 if (IsFakeVector)
22709 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22710 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22711
22712 // Next, clear the sign bit from the first operand (magnitude).
22713 // TODO: If we had general constant folding for FP logic ops, this check
22714 // wouldn't be necessary.
22715 SDValue MagBits;
22716 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22717 APFloat APF = Op0CN->getValueAPF();
22718 APF.clearSign();
22719 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22720 } else {
22721 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22722 if (IsFakeVector)
22723 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22724 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22725 }
22726
22727 // OR the magnitude value with the sign bit.
22728 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22729 return !IsFakeVector ? Or
22730 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22731 DAG.getVectorIdxConstant(0, dl));
22732}
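// Worked example (illustrative): copysign(-1.5f, +2.0f) under the masks above:
//   SignBit = 0x40000000 & 0x80000000 = 0x00000000   (sign of +2.0f)
//   MagBits = 0xBFC00000 & 0x7FFFFFFF = 0x3FC00000   (|-1.5f| = 1.5f)
//   Result  = MagBits | SignBit       = 0x3FC00000   (+1.5f)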
22733
22734static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22735 SDValue N0 = Op.getOperand(0);
22736 SDLoc dl(Op);
22737 MVT VT = Op.getSimpleValueType();
22738
22739 MVT OpVT = N0.getSimpleValueType();
22740 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22741 "Unexpected type for FGETSIGN");
22742
22743 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22744 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22745 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22746 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22747 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22748 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22749 return Res;
22750}
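// Illustrative note: MOVMSK packs the per-element sign bits into the low bits
// of an i32, so for the scalar in lane 0 the sign is just (movmsk & 1); e.g.
// MOVMSKPD on <-3.0, +1.0> yields 0b01, and 0b01 & 1 == 1 (negative).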
22751
22752/// Helper for attempting to create an X86ISD::BT node.
22753static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22754 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22755 // instruction. Since the shift amount is in-range-or-undefined, we know
22756 // that doing a bittest on the i32 value is ok. We extend to i32 because
22757 // the encoding for the i16 version is larger than the i32 version.
22758 // Also promote i16 to i32 for performance / code size reason.
22759 if (Src.getValueType().getScalarSizeInBits() < 32)
22760 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22761
22762 // No legal type found, give up.
22763 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22764 return SDValue();
22765
22766 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22767 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22768 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22769 // known to be zero.
22770 if (Src.getValueType() == MVT::i64 &&
22771 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22772 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22773
22774 // If the operand types disagree, extend the shift amount to match. Since
22775 // BT ignores high bits (like shifts) we can use anyextend.
22776 if (Src.getValueType() != BitNo.getValueType()) {
22777 // Peek through a mask/modulo operation.
22778 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
22779 // we probably need a better IsDesirableToPromoteOp to handle this as well.
22780 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22781 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
22782 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22783 BitNo.getOperand(0)),
22784 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22785 BitNo.getOperand(1)));
22786 else
22787 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22788 }
22789
22790 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22791}
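// Illustrative note: BT with a register source tests bit (BitNo mod width) of
// Src and copies it into CF, so a pattern like ((X >> N) & 1) can be selected
// roughly as:
//   bt   eax, ecx      ; CF = (eax >> (ecx & 31)) & 1
//   setc al
// That modulo behaviour is why the 64->32 truncation above requires bit 5 of
// BitNo to be known zero.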
22792
22793/// Helper for creating an X86ISD::SETCC node.
22794static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22795 SelectionDAG &DAG) {
22796 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22797 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22798}
22799
22800/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22801/// recognizable memcmp expansion.
22802static bool isOrXorXorTree(SDValue X, bool Root = true) {
22803 if (X.getOpcode() == ISD::OR)
22804 return isOrXorXorTree(X.getOperand(0), false) &&
22805 isOrXorXorTree(X.getOperand(1), false);
22806 if (Root)
22807 return false;
22808 return X.getOpcode() == ISD::XOR;
22809}
22810
22811/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22812/// expansion.
22813template <typename F>
22814static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22815 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22816 SDValue Op0 = X.getOperand(0);
22817 SDValue Op1 = X.getOperand(1);
22818 if (X.getOpcode() == ISD::OR) {
22819 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22820 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22821 if (VecVT != CmpVT)
22822 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22823 if (HasPT)
22824 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22825 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22826 }
22827 if (X.getOpcode() == ISD::XOR) {
22828 SDValue A = SToV(Op0);
22829 SDValue B = SToV(Op1);
22830 if (VecVT != CmpVT)
22831 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22832 if (HasPT)
22833 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22834 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22835 }
22836 llvm_unreachable("Impossible");
22837}
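// Illustrative note: the memcmp expansion this matches for a 32-byte equality
// with zero typically has the shape
//   (or (xor A, B), (xor C, D)) == 0
// and the two helpers above rebuild it as two vector compares whose results
// are OR'd (PTEST path) or AND'd (PCMPEQ/MOVMSK path) before a single test.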
22838
22839/// Try to map a 128-bit or larger integer comparison to vector instructions
22840/// before type legalization splits it up into chunks.
22841static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22842 ISD::CondCode CC,
22843 const SDLoc &DL,
22844 SelectionDAG &DAG,
22845 const X86Subtarget &Subtarget) {
22846 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22847
22848 // We're looking for an oversized integer equality comparison.
22849 EVT OpVT = X.getValueType();
22850 unsigned OpSize = OpVT.getSizeInBits();
22851 if (!OpVT.isScalarInteger() || OpSize < 128)
22852 return SDValue();
22853
22854 // Ignore a comparison with zero because that gets special treatment in
22855 // EmitTest(). But make an exception for the special case of a pair of
22856 // logically-combined vector-sized operands compared to zero. This pattern may
22857 // be generated by the memcmp expansion pass with oversized integer compares
22858 // (see PR33325).
22859 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22860 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22861 return SDValue();
22862
22863 // Don't perform this combine if constructing the vector will be expensive.
22864 auto IsVectorBitCastCheap = [](SDValue X) {
22865 X = peekThroughBitcasts(X);
22866 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22867 X.getOpcode() == ISD::LOAD;
22868 };
22869 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22870 !IsOrXorXorTreeCCZero)
22871 return SDValue();
22872
22873 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22874 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22875 // Otherwise use PCMPEQ (plus AND) and mask testing.
22876 bool NoImplicitFloatOps =
22877 DAG.getMachineFunction().getFunction().hasFnAttribute(
22878 Attribute::NoImplicitFloat);
22879 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22880 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22881 (OpSize == 256 && Subtarget.hasAVX()) ||
22882 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22883 bool HasPT = Subtarget.hasSSE41();
22884
22885 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22886 // vector registers are essentially free. (Technically, widening registers
22887 // prevents load folding, but the tradeoff is worth it.)
22888 bool PreferKOT = Subtarget.preferMaskRegisters();
22889 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22890
22891 EVT VecVT = MVT::v16i8;
22892 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22893 if (OpSize == 256) {
22894 VecVT = MVT::v32i8;
22895 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22896 }
22897 EVT CastVT = VecVT;
22898 bool NeedsAVX512FCast = false;
22899 if (OpSize == 512 || NeedZExt) {
22900 if (Subtarget.hasBWI()) {
22901 VecVT = MVT::v64i8;
22902 CmpVT = MVT::v64i1;
22903 if (OpSize == 512)
22904 CastVT = VecVT;
22905 } else {
22906 VecVT = MVT::v16i32;
22907 CmpVT = MVT::v16i1;
22908 CastVT = OpSize == 512 ? VecVT
22909 : OpSize == 256 ? MVT::v8i32
22910 : MVT::v4i32;
22911 NeedsAVX512FCast = true;
22912 }
22913 }
22914
22915 auto ScalarToVector = [&](SDValue X) -> SDValue {
22916 bool TmpZext = false;
22917 EVT TmpCastVT = CastVT;
22918 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22919 SDValue OrigX = X.getOperand(0);
22920 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22921 if (OrigSize < OpSize) {
22922 if (OrigSize == 128) {
22923 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22924 X = OrigX;
22925 TmpZext = true;
22926 } else if (OrigSize == 256) {
22927 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22928 X = OrigX;
22929 TmpZext = true;
22930 }
22931 }
22932 }
22933 X = DAG.getBitcast(TmpCastVT, X);
22934 if (!NeedZExt && !TmpZext)
22935 return X;
22936 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22937 DAG.getConstant(0, DL, VecVT), X,
22938 DAG.getVectorIdxConstant(0, DL));
22939 };
22940
22941 SDValue Cmp;
22942 if (IsOrXorXorTreeCCZero) {
22943 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22944 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22945 // Use 2 vector equality compares and 'and' the results before doing a
22946 // MOVMSK.
22947 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22948 } else {
22949 SDValue VecX = ScalarToVector(X);
22950 SDValue VecY = ScalarToVector(Y);
22951 if (VecVT != CmpVT) {
22952 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22953 } else if (HasPT) {
22954 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22955 } else {
22956 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22957 }
22958 }
22959 // AVX512 should emit a setcc that will lower to kortest.
22960 if (VecVT != CmpVT) {
22961 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22962 : CmpVT == MVT::v32i1 ? MVT::i32
22963 : MVT::i16;
22964 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22965 DAG.getConstant(0, DL, KRegVT), CC);
22966 }
22967 if (HasPT) {
22968 SDValue BCCmp =
22969 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22970 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22971 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22972 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22973 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22974 }
22975 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22976 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22977 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22978 assert(Cmp.getValueType() == MVT::v16i8 &&
22979 "Non 128-bit vector on pre-SSE41 target");
22980 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22981 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22982 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22983 }
22984
22985 return SDValue();
22986}
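// Illustrative note (SSE2 path, no PTEST): an oversized equality like
//   setcc i128 X, Y, eq
// ends up roughly as
//   pcmpeqb  xmm0, xmm1     ; per-byte equality mask
//   pmovmskb eax, xmm0      ; gather the 16 byte masks into bits 0..15
//   cmp      eax, 0xFFFF    ; all bytes equal <=> all mask bits set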
22987
22988/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22989/// style scalarized (associative) reduction patterns. Partial reductions
22990/// are supported when the pointer SrcMask is non-null.
22991/// TODO - move this to SelectionDAG?
22992static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22993 SmallVectorImpl<SDValue> &SrcOps,
22994 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22995 SmallVector<SDValue, 8> Opnds;
22996 DenseMap<SDValue, APInt> SrcOpMap;
22997 EVT VT = MVT::Other;
22998
22999 // Recognize a special case where a vector is casted into wide integer to
23000 // test all 0s.
23001 assert(Op.getOpcode() == unsigned(BinOp) &&
23002 "Unexpected bit reduction opcode");
23003 Opnds.push_back(Op.getOperand(0));
23004 Opnds.push_back(Op.getOperand(1));
23005
23006 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
23007 SDValue I = Opnds[Slot];
23008 // BFS traverse all BinOp operands.
23009 if (I->getOpcode() == unsigned(BinOp)) {
23010 Opnds.push_back(I->getOperand(0));
23011 Opnds.push_back(I->getOperand(1));
23012 // Re-evaluate the number of nodes to be traversed.
23013 e += 2; // 2 more nodes (LHS and RHS) are pushed.
23014 continue;
23015 }
23016
23017 // Quit if a non-EXTRACT_VECTOR_ELT
23018 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23019 return false;
23020
23021 // Quit if without a constant index.
23022 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
23023 if (!Idx)
23024 return false;
23025
23026 SDValue Src = I->getOperand(0);
23027 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
23028 if (M == SrcOpMap.end()) {
23029 VT = Src.getValueType();
23030 // Quit if not the same type.
23031 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
23032 return false;
23033 unsigned NumElts = VT.getVectorNumElements();
23034 APInt EltCount = APInt::getZero(NumElts);
23035 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
23036 SrcOps.push_back(Src);
23037 }
23038
23039 // Quit if element already used.
23040 unsigned CIdx = Idx->getZExtValue();
23041 if (M->second[CIdx])
23042 return false;
23043 M->second.setBit(CIdx);
23044 }
23045
23046 if (SrcMask) {
23047 // Collect the source partial masks.
23048 for (SDValue &SrcOp : SrcOps)
23049 SrcMask->push_back(SrcOpMap[SrcOp]);
23050 } else {
23051 // Quit if not all elements are used.
23052 for (const auto &I : SrcOpMap)
23053 if (!I.second.isAllOnes())
23054 return false;
23055 }
23056
23057 return true;
23058}
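// Illustrative note: a fully scalarized v4i32 OR-reduction recognized by the
// matcher above looks like
//   or (or (extractelt %v, 0), (extractelt %v, 1)),
//      (or (extractelt %v, 2), (extractelt %v, 3))
// SrcOps then contains %v, and (for partial reductions) SrcMask records which
// lanes were actually extracted.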
23059
23060// Helper function for comparing all bits of two vectors.
23061static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
23062 ISD::CondCode CC, const APInt &OriginalMask,
23063 const X86Subtarget &Subtarget,
23064 SelectionDAG &DAG, X86::CondCode &X86CC) {
23065 EVT VT = LHS.getValueType();
23066 unsigned ScalarSize = VT.getScalarSizeInBits();
23067 if (OriginalMask.getBitWidth() != ScalarSize) {
23068 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
23069 return SDValue();
23070 }
23071
23072 // Quit if not convertible to legal scalar or 128/256-bit vector.
23073 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23074 return SDValue();
23075
23076 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
23077 if (VT.isFloatingPoint())
23078 return SDValue();
23079
23080 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23081 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
23082
23083 APInt Mask = OriginalMask;
23084
23085 auto MaskBits = [&](SDValue Src) {
23086 if (Mask.isAllOnes())
23087 return Src;
23088 EVT SrcVT = Src.getValueType();
23089 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
23090 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
23091 };
23092
23093 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
23094 if (VT.getSizeInBits() < 128) {
23095 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
23096 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
23097 if (IntVT != MVT::i64)
23098 return SDValue();
23099 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
23100 MVT::i32, MVT::i32);
23101 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
23102 MVT::i32, MVT::i32);
23103 SDValue Lo =
23104 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
23105 SDValue Hi =
23106 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
23107 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23108 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
23109 DAG.getConstant(0, DL, MVT::i32));
23110 }
23111 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23112 DAG.getBitcast(IntVT, MaskBits(LHS)),
23113 DAG.getBitcast(IntVT, MaskBits(RHS)));
23114 }
23115
23116 // Without PTEST, a masked v2i64 or-reduction is not faster than
23117 // scalarization.
23118 bool UseKORTEST = Subtarget.useAVX512Regs();
23119 bool UsePTEST = Subtarget.hasSSE41();
23120 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
23121 return SDValue();
23122
23123 // Split down to 128/256/512-bit vector.
23124 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
23125
23126 // If the input vector has vector elements wider than the target test size,
23127 // then cast to <X x i64> so it will safely split.
23128 if (ScalarSize > TestSize) {
23129 if (!Mask.isAllOnes())
23130 return SDValue();
23131 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
23132 LHS = DAG.getBitcast(VT, LHS);
23133 RHS = DAG.getBitcast(VT, RHS);
23134 Mask = APInt::getAllOnes(64);
23135 }
23136
23137 if (VT.getSizeInBits() > TestSize) {
23138 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
23139 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
23140 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
23141 while (VT.getSizeInBits() > TestSize) {
23142 auto Split = DAG.SplitVector(LHS, DL);
23143 VT = Split.first.getValueType();
23144 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23145 }
23146 RHS = DAG.getAllOnesConstant(DL, VT);
23147 } else if (!UsePTEST && !KnownRHS.isZero()) {
23148 // MOVMSK Special Case:
23149 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
23150 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
23151 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
23152 LHS = DAG.getBitcast(VT, MaskBits(LHS));
23153 RHS = DAG.getBitcast(VT, MaskBits(RHS));
23154 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
23155 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
23156 V = DAG.getSExtOrTrunc(V, DL, VT);
23157 while (VT.getSizeInBits() > TestSize) {
23158 auto Split = DAG.SplitVector(V, DL);
23159 VT = Split.first.getValueType();
23160 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23161 }
23162 V = DAG.getNOT(DL, V, VT);
23163 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23164 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23165 DAG.getConstant(0, DL, MVT::i32));
23166 } else {
23167 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
23168 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
23169 while (VT.getSizeInBits() > TestSize) {
23170 auto Split = DAG.SplitVector(V, DL);
23171 VT = Split.first.getValueType();
23172 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
23173 }
23174 LHS = V;
23175 RHS = DAG.getConstant(0, DL, VT);
23176 }
23177 }
23178
23179 if (UseKORTEST && VT.is512BitVector()) {
23180 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
23181 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
23182 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23183 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23184 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
23185 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
23186 }
23187
23188 if (UsePTEST) {
23189 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
23190 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23191 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23192 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
23193 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
23194 }
23195
23196 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
23197 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
23198 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
23199 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
23200 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
23201 V = DAG.getNOT(DL, V, MaskVT);
23202 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23203 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23204 DAG.getConstant(0, DL, MVT::i32));
23205}
23206
23207// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
23208// to CMP(MOVMSK(PCMPEQB(X,Y))).
23209static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS,
23210 ISD::CondCode CC, const SDLoc &DL,
23211 const X86Subtarget &Subtarget,
23212 SelectionDAG &DAG,
23213 X86::CondCode &X86CC) {
23214 SDValue Op = OrigLHS;
23215
23216 bool CmpNull;
23217 APInt Mask;
23218 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23219 CmpNull = isNullConstant(OrigRHS);
23220 if (!CmpNull && !isAllOnesConstant(OrigRHS))
23221 return SDValue();
23222
23223 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
23224 return SDValue();
23225
23226 // Check whether we're masking/truncating an OR-reduction result, in which
23227 // case track the masked bits.
23228 // TODO: Add CmpAllOnes support.
23229 Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
23230 if (CmpNull) {
23231 switch (Op.getOpcode()) {
23232 case ISD::TRUNCATE: {
23233 SDValue Src = Op.getOperand(0);
23234 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
23235 Op.getScalarValueSizeInBits());
23236 Op = Src;
23237 break;
23238 }
23239 case ISD::AND: {
23240 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
23241 Mask = Cst->getAPIntValue();
23242 Op = Op.getOperand(0);
23243 }
23244 break;
23245 }
23246 }
23247 }
23248 } else if (CC == ISD::SETGT && isAllOnesConstant(OrigRHS)) {
23249 CC = ISD::SETEQ;
23250 CmpNull = true;
23251 Mask = APInt::getSignMask(Op.getScalarValueSizeInBits());
23252 } else {
23253 return SDValue();
23254 }
23255
23256 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
23257
23258 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
23259 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
23260 SmallVector<SDValue, 8> VecIns;
23261 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
23262 EVT VT = VecIns[0].getValueType();
23263 assert(llvm::all_of(VecIns,
23264 [VT](SDValue V) { return VT == V.getValueType(); }) &&
23265 "Reduction source vector mismatch");
23266
23267 // Quit if not splittable to scalar/128/256/512-bit vector.
23268 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23269 return SDValue();
23270
23271 // If more than one full vector is evaluated, AND/OR them first before
23272 // PTEST.
23273 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
23274 Slot += 2, e += 1) {
23275 // Each iteration will AND/OR 2 nodes and append the result until there is
23276 // only 1 node left, i.e. the final value of all vectors.
23277 SDValue LHS = VecIns[Slot];
23278 SDValue RHS = VecIns[Slot + 1];
23279 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
23280 }
23281
23282 return LowerVectorAllEqual(DL, VecIns.back(),
23283 CmpNull ? DAG.getConstant(0, DL, VT)
23284 : DAG.getAllOnesConstant(DL, VT),
23285 CC, Mask, Subtarget, DAG, X86CC);
23286 }
23287
23288 // Match icmp(reduce_or(X),0) anyof reduction patterns.
23289 // Match icmp(reduce_and(X),-1) allof reduction patterns.
23290 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23291 ISD::NodeType BinOp;
23292 if (SDValue Match =
23293 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
23294 EVT MatchVT = Match.getValueType();
23295 return LowerVectorAllEqual(DL, Match,
23296 CmpNull ? DAG.getConstant(0, DL, MatchVT)
23297 : DAG.getAllOnesConstant(DL, MatchVT),
23298 CC, Mask, Subtarget, DAG, X86CC);
23299 }
23300 }
23301
23302 if (Mask.isAllOnes()) {
23303 assert(!Op.getValueType().isVector() &&
23304 "Illegal vector type for reduction pattern");
23305 SDValue Src = peekThroughBitcasts(Op);
23306 if (Src.getValueType().isFixedLengthVector() &&
23307 Src.getValueType().getScalarType() == MVT::i1) {
23308 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
23309 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
23310 if (Src.getOpcode() == ISD::SETCC) {
23311 SDValue LHS = Src.getOperand(0);
23312 SDValue RHS = Src.getOperand(1);
23313 EVT LHSVT = LHS.getValueType();
23314 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
23315 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
23316 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
23317 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
23318 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
23319 X86CC);
23320 }
23321 }
23322 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
23323 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
23324 // Peek through truncation, mask the LSB and compare against zero/LSB.
23325 if (Src.getOpcode() == ISD::TRUNCATE) {
23326 SDValue Inner = Src.getOperand(0);
23327 EVT InnerVT = Inner.getValueType();
23328 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
23329 unsigned BW = InnerVT.getScalarSizeInBits();
23330 APInt SrcMask = APInt(BW, 1);
23331 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
23332 return LowerVectorAllEqual(DL, Inner,
23333 DAG.getConstant(Cmp, DL, InnerVT), CC,
23334 SrcMask, Subtarget, DAG, X86CC);
23335 }
23336 }
23337 }
23338 }
23339
23340 return SDValue();
23341}
23342
23343/// return true if \c Op has a use that doesn't just read flags.
23344static bool hasNonFlagsUse(SDValue Op) {
23345 for (SDUse &Use : Op->uses()) {
23346 SDNode *User = Use.getUser();
23347 unsigned UOpNo = Use.getOperandNo();
23348 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
23349 // Look past truncate.
23350 UOpNo = User->use_begin()->getOperandNo();
23351 User = User->use_begin()->getUser();
23352 }
23353
23354 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
23355 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
23356 return true;
23357 }
23358 return false;
23359}
23360
23361// Transform to an x86-specific ALU node with flags if there is a chance of
23362// using an RMW op or only the flags are used. Otherwise, leave
23363// the node alone and emit a 'cmp' or 'test' instruction.
23364static bool isProfitableToUseFlagOp(SDValue Op) {
23365 for (SDNode *U : Op->users())
23366 if (U->getOpcode() != ISD::CopyToReg &&
23367 U->getOpcode() != ISD::SETCC &&
23368 U->getOpcode() != ISD::STORE)
23369 return false;
23370
23371 return true;
23372}
23373
23374/// Emit nodes that will be selected as "test Op0,Op0", or something
23375/// equivalent.
23376static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
23377 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
23378 // CF and OF aren't always set the way we want. Determine which
23379 // of these we need.
23380 bool NeedCF = false;
23381 bool NeedOF = false;
23382 switch (X86CC) {
23383 default: break;
23384 case X86::COND_A: case X86::COND_AE:
23385 case X86::COND_B: case X86::COND_BE:
23386 NeedCF = true;
23387 break;
23388 case X86::COND_G: case X86::COND_GE:
23389 case X86::COND_L: case X86::COND_LE:
23390 case X86::COND_O: case X86::COND_NO: {
23391 // Check if we really need to set the
23392 // Overflow flag. If NoSignedWrap is present
23393 // that is not actually needed.
23394 switch (Op->getOpcode()) {
23395 case ISD::ADD:
23396 case ISD::SUB:
23397 case ISD::MUL:
23398 case ISD::SHL:
23399 if (Op.getNode()->getFlags().hasNoSignedWrap())
23400 break;
23401 [[fallthrough]];
23402 default:
23403 NeedOF = true;
23404 break;
23405 }
23406 break;
23407 }
23408 }
23409 // See if we can use the EFLAGS value from the operand instead of
23410 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23411 // we prove that the arithmetic won't overflow, we can't use OF or CF.
23412 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23413 // Emit a CMP with 0, which is the TEST pattern.
23414 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23415 DAG.getConstant(0, dl, Op.getValueType()));
23416 }
23417 unsigned Opcode = 0;
23418 unsigned NumOperands = 0;
23419
23420 SDValue ArithOp = Op;
23421
23422 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
23423 // which may be the result of a CAST. We use the variable 'Op', which is the
23424 // non-casted variable when we check for possible users.
23425 switch (ArithOp.getOpcode()) {
23426 case ISD::AND:
23427 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23428 // because a TEST instruction will be better.
23429 if (!hasNonFlagsUse(Op))
23430 break;
23431
23432 [[fallthrough]];
23433 case ISD::ADD:
23434 case ISD::SUB:
23435 case ISD::OR:
23436 case ISD::XOR:
23437 if (!isProfitableToUseFlagOp(Op))
23438 break;
23439
23440 // Otherwise use a regular EFLAGS-setting instruction.
23441 switch (ArithOp.getOpcode()) {
23442 // clang-format off
23443 default: llvm_unreachable("unexpected operator!");
23444 case ISD::ADD: Opcode = X86ISD::ADD; break;
23445 case ISD::SUB: Opcode = X86ISD::SUB; break;
23446 case ISD::XOR: Opcode = X86ISD::XOR; break;
23447 case ISD::AND: Opcode = X86ISD::AND; break;
23448 case ISD::OR: Opcode = X86ISD::OR; break;
23449 // clang-format on
23450 }
23451
23452 NumOperands = 2;
23453 break;
23454 case X86ISD::ADD:
23455 case X86ISD::SUB:
23456 case X86ISD::OR:
23457 case X86ISD::XOR:
23458 case X86ISD::AND:
23459 return SDValue(Op.getNode(), 1);
23460 case ISD::SSUBO:
23461 case ISD::USUBO: {
23462 // /USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag.
23463 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23464 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23465 Op->getOperand(1)).getValue(1);
23466 }
23467 default:
23468 break;
23469 }
23470
23471 if (Opcode == 0) {
23472 // Emit a CMP with 0, which is the TEST pattern.
23473 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23474 DAG.getConstant(0, dl, Op.getValueType()));
23475 }
23476 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23477 SmallVector<SDValue, 4> Ops(Op->ops().take_front(NumOperands));
23478
23479 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23480 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23481 return SDValue(New.getNode(), 1);
23482}
23483
23484/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23485/// equivalent.
23486static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23487 const SDLoc &dl, SelectionDAG &DAG,
23488 const X86Subtarget &Subtarget) {
23489 if (isNullConstant(Op1))
23490 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23491
23492 EVT CmpVT = Op0.getValueType();
23493
23494 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23495 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23496
23497 // Only promote the compare up to I32 if it is a 16 bit operation
23498 // with an immediate. 16 bit immediates are to be avoided unless the target
23499 // isn't slowed down by length changing prefixes, we're optimizing for
23500 // codesize or the comparison is with a folded load.
23501 if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
23502 !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
23503 !DAG.getMachineFunction().getFunction().hasMinSize()) {
23504 auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
23505 auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
23506 // Don't do this if the immediate can fit in 8-bits.
23507 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23508 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23509 unsigned ExtendOp =
23510 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23511 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23512 // For equality comparisons try to use SIGN_EXTEND if the input was
23513 // truncate from something with enough sign bits.
23514 if (Op0.getOpcode() == ISD::TRUNCATE) {
23515 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23516 ExtendOp = ISD::SIGN_EXTEND;
23517 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23518 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23519 ExtendOp = ISD::SIGN_EXTEND;
23520 }
23521 }
23522
23523 CmpVT = MVT::i32;
23524 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23525 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23526 }
23527 }
23528
23529 // Try to shrink i64 compares if the input has enough zero bits.
23530 if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
23531 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23532 DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
23533 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23534 CmpVT = MVT::i32;
23535 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23536 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23537 }
23538
23539 // Try to shrink all i64 compares if the inputs are representable as signed
23540 // i32.
23541 if (CmpVT == MVT::i64 &&
23542 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23543 DAG.ComputeNumSignBits(Op1) > 32 && DAG.ComputeNumSignBits(Op0) > 32) {
23544 CmpVT = MVT::i32;
23545 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23546 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23547 }
23548
23549 // 0-x == y --> x+y == 0
23550 // 0-x != y --> x+y != 0
23551 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23552 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23553 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23554 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23555 return Add.getValue(1);
23556 }
23557
23558 // x == 0-y --> x+y == 0
23559 // x != 0-y --> x+y != 0
23560 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23561 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23562 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23563 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23564 return Add.getValue(1);
23565 }
23566
23567 // If we already have an XOR of the ops, use that to check for equality.
23568 // Else use SUB instead of CMP to enable CSE between SUB and CMP.
23569 unsigned X86Opc = X86ISD::SUB;
23570 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
23571 (DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op0, Op1}) ||
23572 DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op1, Op0})))
23573 X86Opc = X86ISD::XOR;
23574
23575 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23576 SDValue CmpOp = DAG.getNode(X86Opc, dl, VTs, Op0, Op1);
23577 return CmpOp.getValue(1);
23578}
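// Illustrative note: the "0-x == y --> x+y == 0" rewrites above hold modulo
// 2^N, so a NEG+CMP pair can become a single flag-setting ADD, e.g.
//   neg eax ; cmp eax, ecx ; sete dl   ==>   add eax, ecx ; sete dl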
23579
23584
23585bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
23586 SDNode *N, SDValue, SDValue IntPow2) const {
23587 if (N->getOpcode() == ISD::FDIV)
23588 return true;
23589
23590 EVT FPVT = N->getValueType(0);
23591 EVT IntVT = IntPow2.getValueType();
23592
23593 // This indicates a non-free bitcast.
23594 // TODO: This is probably overly conservative as we will need to scale the
23595 // integer vector anyways for the int->fp cast.
23596 if (FPVT.isVector() &&
23597 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
23598 return false;
23599
23600 return true;
23601}
23602
23603/// Check if replacement of SQRT with RSQRT should be disabled.
23604bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23605 EVT VT = Op.getValueType();
23606
23607 // We don't need to replace SQRT with RSQRT for half type.
23608 if (VT.getScalarType() == MVT::f16)
23609 return true;
23610
23611 // We never want to use both SQRT and RSQRT instructions for the same input.
23612 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23613 return false;
23614
23615 if (VT.isVector())
23616 return Subtarget.hasFastVectorFSQRT();
23617 return Subtarget.hasFastScalarFSQRT();
23618}
23619
23620/// The minimum architected relative accuracy is 2^-12. We need one
23621/// Newton-Raphson step to have a good float result (24 bits of precision).
23622SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23623 SelectionDAG &DAG, int Enabled,
23624 int &RefinementSteps,
23625 bool &UseOneConstNR,
23626 bool Reciprocal) const {
23627 SDLoc DL(Op);
23628 EVT VT = Op.getValueType();
23629
23630 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23631 // It is likely not profitable to do this for f64 because a double-precision
23632 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23633 // instructions: convert to single, rsqrtss, convert back to double, refine
23634 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23635 // along with FMA, this could be a throughput win.
23636 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23637 // after legalize types.
23638 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23639 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23640 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23641 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23642 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23643 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23644 RefinementSteps = 1;
23645
23646 UseOneConstNR = false;
23647 // There is no FSQRT for 512-bits, but there is RSQRT14.
23648 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23649 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
23650 if (RefinementSteps == 0 && !Reciprocal)
23651 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23652 return Estimate;
23653 }
23654
23655 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23656 Subtarget.hasFP16()) {
23657 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23658 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23659 RefinementSteps = 0;
23660
23661 if (VT == MVT::f16) {
23662 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23663 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23664 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23665 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23666 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23667 }
23668
23669 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23670 }
23671 return SDValue();
23672}
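// Illustrative note: the single refinement step the generic expansion applies
// to the RSQRT estimate E of 1/sqrt(a) is the Newton-Raphson iteration
//   E' = E * (1.5 - 0.5 * a * E * E)
// which roughly doubles the ~12 architected bits of accuracy to the ~24 bits
// a float needs.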
23673
23674/// The minimum architected relative accuracy is 2^-12. We need one
23675/// Newton-Raphson step to have a good float result (24 bits of precision).
23676SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23677 int Enabled,
23678 int &RefinementSteps) const {
23679 SDLoc DL(Op);
23680 EVT VT = Op.getValueType();
23681
23682 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23683 // It is likely not profitable to do this for f64 because a double-precision
23684 // reciprocal estimate with refinement on x86 prior to FMA requires
23685 // 15 instructions: convert to single, rcpss, convert back to double, refine
23686 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23687 // along with FMA, this could be a throughput win.
23688
23689 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23690 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23691 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23692 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23693 // Enable estimate codegen with 1 refinement step for vector division.
23694 // Scalar division estimates are disabled because they break too much
23695 // real-world code. These defaults are intended to match GCC behavior.
23696 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23697 return SDValue();
23698
23699 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23700 RefinementSteps = 1;
23701
23702 // There is no FSQRT for 512-bits, but there is RCP14.
23703 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23704 return DAG.getNode(Opcode, DL, VT, Op);
23705 }
23706
23707 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23708 Subtarget.hasFP16()) {
23709 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23710 RefinementSteps = 0;
23711
23712 if (VT == MVT::f16) {
23713 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23714 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23715 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23716 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23717 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23718 }
23719
23720 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23721 }
23722 return SDValue();
23723}
23724
23725/// If we have at least two divisions that use the same divisor, convert to
23726/// multiplication by a reciprocal. This may need to be adjusted for a given
23727/// CPU if a division's cost is not at least twice the cost of a multiplication.
23728/// This is because we still need one division to calculate the reciprocal and
23729/// then we need two multiplies by that reciprocal as replacements for the
23730/// original divisions.
23731unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23732 return 2;
23733}
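// Illustrative note: with the threshold of 2 above, the generic combine that
// queries this hook may rewrite (conceptually, given the 'arcp' flag)
//   q1 = a / d ; q2 = b / d
// into
//   r = 1.0 / d ; q1 = a * r ; q2 = b * r
// trading two divisions for one division plus two multiplies.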
23734
23735SDValue
23736X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23737 SelectionDAG &DAG,
23738 SmallVectorImpl<SDNode *> &Created) const {
23739 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23740 if (isIntDivCheap(N->getValueType(0), Attr))
23741 return SDValue(N,0); // Lower SDIV as SDIV
23742
23743 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23744 "Unexpected divisor!");
23745
23746 // Only perform this transform if CMOV is supported otherwise the select
23747 // below will become a branch.
23748 if (!Subtarget.canUseCMOV())
23749 return SDValue();
23750
23751 // fold (sdiv X, pow2)
23752 EVT VT = N->getValueType(0);
23753 // FIXME: Support i8.
23754 if (VT != MVT::i16 && VT != MVT::i32 &&
23755 !(Subtarget.is64Bit() && VT == MVT::i64))
23756 return SDValue();
23757
23758 // If the divisor is 2 or -2, the default expansion is better.
23759 if (Divisor == 2 ||
23760 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
23761 return SDValue();
23762
23763 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
23764}
23765
23766/// Result of 'and' is compared against zero. Change to a BT node if possible.
23767/// Returns the BT node and the condition code needed to use it.
23768static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
23769 SelectionDAG &DAG, X86::CondCode &X86CC) {
23770 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23771 SDValue Op0 = And.getOperand(0);
23772 SDValue Op1 = And.getOperand(1);
23773 if (Op0.getOpcode() == ISD::TRUNCATE)
23774 Op0 = Op0.getOperand(0);
23775 if (Op1.getOpcode() == ISD::TRUNCATE)
23776 Op1 = Op1.getOperand(0);
23777
23778 SDValue Src, BitNo;
23779 if (Op1.getOpcode() == ISD::SHL)
23780 std::swap(Op0, Op1);
23781 if (Op0.getOpcode() == ISD::SHL) {
23782 if (isOneConstant(Op0.getOperand(0))) {
23783 // If we looked past a truncate, check that it's only truncating away
23784 // known zeros.
23785 unsigned BitWidth = Op0.getValueSizeInBits();
23786 unsigned AndBitWidth = And.getValueSizeInBits();
23787 if (BitWidth > AndBitWidth) {
23788 KnownBits Known = DAG.computeKnownBits(Op0);
23789 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23790 return SDValue();
23791 }
23792 Src = Op1;
23793 BitNo = Op0.getOperand(1);
23794 }
23795 } else if (Op1.getOpcode() == ISD::Constant) {
23796 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23797 uint64_t AndRHSVal = AndRHS->getZExtValue();
23798 SDValue AndLHS = Op0;
23799
23800 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23801 Src = AndLHS.getOperand(0);
23802 BitNo = AndLHS.getOperand(1);
23803 } else {
23804 // Use BT if the immediate can't be encoded in a TEST instruction or we
23805 // are optimizing for size and the immediate won't fit in a byte.
23806 bool OptForSize = DAG.shouldOptForSize();
23807 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23808 isPowerOf2_64(AndRHSVal)) {
23809 Src = AndLHS;
23810 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23811 Src.getValueType());
23812 }
23813 }
23814 }
23815
23816 // No patterns found, give up.
23817 if (!Src.getNode())
23818 return SDValue();
23819
23820 // Remove any bit flip.
23821 if (isBitwiseNot(Src)) {
23822 Src = Src.getOperand(0);
23823 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23824 }
23825
23826 // Attempt to create the X86ISD::BT node.
23827 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23828 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23829 return BT;
23830 }
23831
23832 return SDValue();
23833}
23834
23835// Check if pre-AVX condcode can be performed by a single FCMP op.
23836static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23837 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23838}
23839
23840/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23841/// CMPs.
23842static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23843 SDValue &Op1, bool &IsAlwaysSignaling) {
23844 unsigned SSECC;
23845 bool Swap = false;
23846
23847 // SSE Condition code mapping:
23848 // 0 - EQ
23849 // 1 - LT
23850 // 2 - LE
23851 // 3 - UNORD
23852 // 4 - NEQ
23853 // 5 - NLT
23854 // 6 - NLE
23855 // 7 - ORD
23856 switch (SetCCOpcode) {
23857 // clang-format off
23858 default: llvm_unreachable("Unexpected SETCC condition");
23859 case ISD::SETOEQ:
23860 case ISD::SETEQ: SSECC = 0; break;
23861 case ISD::SETOGT:
23862 case ISD::SETGT: Swap = true; [[fallthrough]];
23863 case ISD::SETLT:
23864 case ISD::SETOLT: SSECC = 1; break;
23865 case ISD::SETOGE:
23866 case ISD::SETGE: Swap = true; [[fallthrough]];
23867 case ISD::SETLE:
23868 case ISD::SETOLE: SSECC = 2; break;
23869 case ISD::SETUO: SSECC = 3; break;
23870 case ISD::SETUNE:
23871 case ISD::SETNE: SSECC = 4; break;
23872 case ISD::SETULE: Swap = true; [[fallthrough]];
23873 case ISD::SETUGE: SSECC = 5; break;
23874 case ISD::SETULT: Swap = true; [[fallthrough]];
23875 case ISD::SETUGT: SSECC = 6; break;
23876 case ISD::SETO: SSECC = 7; break;
23877 case ISD::SETUEQ: SSECC = 8; break;
23878 case ISD::SETONE: SSECC = 12; break;
23879 // clang-format on
23880 }
23881 if (Swap)
23882 std::swap(Op0, Op1);
23883
23884 switch (SetCCOpcode) {
23885 default:
23886 IsAlwaysSignaling = true;
23887 break;
23888 case ISD::SETEQ:
23889 case ISD::SETOEQ:
23890 case ISD::SETUEQ:
23891 case ISD::SETNE:
23892 case ISD::SETONE:
23893 case ISD::SETUNE:
23894 case ISD::SETO:
23895 case ISD::SETUO:
23896 IsAlwaysSignaling = false;
23897 break;
23898 }
23899
23900 return SSECC;
23901}
23902
23903/// Break a VSETCC 256/512-bit vector into two new 128/256 ones and then
23904/// concatenate the result back.
23905static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond,
23906 SelectionDAG &DAG, const SDLoc &dl) {
23907 assert(VT.isInteger() && LHS.getValueType() == RHS.getValueType() &&
23908 "Unsupported VTs!");
23909 SDValue CC = DAG.getCondCode(Cond);
23910
23911 // Extract the LHS Lo/Hi vectors
23912 SDValue LHS1, LHS2;
23913 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23914
23915 // Extract the RHS Lo/Hi vectors
23916 SDValue RHS1, RHS2;
23917 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23918
23919 // Issue the operation on the smaller types and concatenate the result back
23920 EVT LoVT, HiVT;
23921 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23922 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23923 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23924 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23925}
23926
23927static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl,
23928 SelectionDAG &DAG) {
23929 SDValue Op0 = Op.getOperand(0);
23930 SDValue Op1 = Op.getOperand(1);
23931 SDValue CC = Op.getOperand(2);
23932 MVT VT = Op.getSimpleValueType();
23933 assert(VT.getVectorElementType() == MVT::i1 &&
23934 "Cannot set masked compare for this operation");
23935
23936 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23937
23938 // Prefer SETGT over SETLT.
23939 if (SetCCOpcode == ISD::SETLT) {
23940 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23941 std::swap(Op0, Op1);
23942 }
23943
23944 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23945}
23946
23947/// Given a buildvector constant, return a new vector constant with each element
23948/// incremented or decremented. If incrementing or decrementing would result in
23949/// unsigned overflow or underflow or this is not a simple vector constant,
23950/// return an empty value.
23951static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23952 bool NSW) {
23953 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23954 if (!BV || !V.getValueType().isSimple())
23955 return SDValue();
23956
23957 MVT VT = V.getSimpleValueType();
23958 MVT EltVT = VT.getVectorElementType();
23959 unsigned NumElts = VT.getVectorNumElements();
23960 SmallVector<SDValue, 8> NewVecC;
23961 SDLoc DL(V);
23962 for (unsigned i = 0; i < NumElts; ++i) {
23963 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23964 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23965 return SDValue();
23966
23967 // Avoid overflow/underflow.
23968 const APInt &EltC = Elt->getAPIntValue();
23969 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23970 return SDValue();
23971 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23972 (!IsInc && EltC.isMinSignedValue())))
23973 return SDValue();
23974
23975 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23976 }
23977
23978 return DAG.getBuildVector(VT, DL, NewVecC);
23979}
23980
23981/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23982/// Op0 u<= Op1:
23983/// t = psubus Op0, Op1
23984/// pcmpeq t, <0..0>
23985static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23986 ISD::CondCode Cond, const SDLoc &dl,
23987 const X86Subtarget &Subtarget,
23988 SelectionDAG &DAG) {
23989 if (!Subtarget.hasSSE2())
23990 return SDValue();
23991
23992 MVT VET = VT.getVectorElementType();
23993 if (VET != MVT::i8 && VET != MVT::i16)
23994 return SDValue();
23995
23996 switch (Cond) {
23997 default:
23998 return SDValue();
23999 case ISD::SETULT: {
24000 // If the comparison is against a constant we can turn this into a
24001 // setule. With psubus, setule does not require a swap. This is
24002 // beneficial because the constant in the register is no longer
24003 // destructed as the destination so it can be hoisted out of a loop.
24004 // Only do this pre-AVX since vpcmp* is no longer destructive.
24005 if (Subtarget.hasAVX())
24006 return SDValue();
24007 SDValue ULEOp1 =
24008 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
24009 if (!ULEOp1)
24010 return SDValue();
24011 Op1 = ULEOp1;
24012 break;
24013 }
24014 case ISD::SETUGT: {
24015 // If the comparison is against a constant, we can turn this into a setuge.
24016 // This is beneficial because materializing a constant 0 for the PCMPEQ is
24017 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
24018 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
24019 SDValue UGEOp1 =
24020 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
24021 if (!UGEOp1)
24022 return SDValue();
24023 Op1 = Op0;
24024 Op0 = UGEOp1;
24025 break;
24026 }
24027 // Psubus is better than flip-sign because it requires no inversion.
24028 case ISD::SETUGE:
24029 std::swap(Op0, Op1);
24030 break;
24031 case ISD::SETULE:
24032 break;
24033 }
24034
24035 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
24036 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
24037 DAG.getConstant(0, dl, VT));
24038}
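// Illustrative note: for v16i8 "x u<= y" the sequence built above is
//   psubusb xmm0, xmm1      ; usubsat(x, y), i.e. max(x - y, 0)
//   pcmpeqb xmm0, <0..0>    ; x u<= y  <=>  usubsat(x, y) == 0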
24039
24040static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
24041 SelectionDAG &DAG) {
24042 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24043 Op.getOpcode() == ISD::STRICT_FSETCCS;
24044 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24045 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24046 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
24047 MVT VT = Op->getSimpleValueType(0);
24048 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
24049 MVT OpVT = Op0.getSimpleValueType();
24050 SDLoc dl(Op);
24051
24052 if (OpVT.isFloatingPoint()) {
24053 MVT EltVT = OpVT.getVectorElementType();
24054 assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
24055 EltVT == MVT::f64);
24056
24057 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24058 if (isSoftF16(EltVT, Subtarget)) {
24059 if (Subtarget.hasAVX512() && !Subtarget.hasVLX())
24060 return SDValue();
24061
24062 // Break 256-bit FP vector compare into smaller ones.
24063 if (OpVT.is256BitVector() && !Subtarget.useAVX512Regs())
24064 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24065
24066 // Break 512-bit FP vector compare into smaller ones.
24067 if (OpVT.is512BitVector())
24068 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24069
24070 MVT NVT = OpVT.changeVectorElementType(MVT::f32);
24071 if (IsStrict) {
24072 Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24073 {Chain, Op0});
24074 Op1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24075 {Chain, Op1});
24076 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
24077 {Chain, Op0, Op1, CC});
24078 }
24079 MVT DVT = VT.getVectorElementType() == MVT::i16
24080 ? VT.changeVectorElementType(MVT::i32)
24081 : VT;
24082 SDValue Cmp = DAG.getNode(Op.getOpcode(), dl, DVT,
24083 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op0),
24084 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op1), CC);
24085 return DVT == VT ? Cmp : DAG.getNode(ISD::TRUNCATE, dl, VT, Cmp);
24086 }
24087
24088 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24089
24090 // If we have a strict compare with a vXi1 result and the input is 128/256
24091 // bits we can't use a masked compare unless we have VLX. If we use a wider
24092 // compare like we do for non-strict, we might trigger spurious exceptions
24093 // from the upper elements. Instead emit a AVX compare and convert to mask.
24094 unsigned Opc;
24095 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
24096 (!IsStrict || Subtarget.hasVLX() ||
24097 Op0.getSimpleValueType().is512BitVector())) {
24098#ifndef NDEBUG
24099 unsigned Num = VT.getVectorNumElements();
24100 assert(Num <= 16 ||
24101 (Num == 32 && (EltVT == MVT::f16 || EltVT == MVT::bf16)));
24102#endif
24103 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
24104 } else {
24105 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
24106 // The SSE/AVX packed FP comparison nodes are defined with a
24107 // floating-point vector result that matches the operand type. This allows
24108 // them to work with an SSE1 target (integer vector types are not legal).
24109 VT = Op0.getSimpleValueType();
24110 }
24111
24112 SDValue Cmp;
24113 bool IsAlwaysSignaling;
24114 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
24115 if (!Subtarget.hasAVX()) {
24116 // TODO: We could use following steps to handle a quiet compare with
24117 // signaling encodings.
24118 // 1. Get ordered masks from a quiet ISD::SETO
24119 // 2. Use the masks to mask potential unordered elements in operand A, B
24120 // 3. Get the compare results of masked A, B
24121 // 4. Calculating final result using the mask and result from 3
24122 // But currently, we just fall back to scalar operations.
24123 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
24124 return SDValue();
24125
24126 // Insert an extra signaling instruction to raise exception.
24127 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
24128 SDValue SignalCmp = DAG.getNode(
24129 Opc, dl, {VT, MVT::Other},
24130 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
24131 // FIXME: It seems we need to update the flags of all new strict nodes.
24132 // Otherwise, mayRaiseFPException in MI will return false due to
24133 // NoFPExcept = false by default. However, I didn't find it in other
24134 // patches.
24135 SignalCmp->setFlags(Op->getFlags());
24136 Chain = SignalCmp.getValue(1);
24137 }
24138
24139 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
24140 // emit two comparisons and a logic op to tie them together.
24141 if (!cheapX86FSETCC_SSE(Cond)) {
24142 // LLVM predicate is SETUEQ or SETONE.
24143 unsigned CC0, CC1;
24144 unsigned CombineOpc;
24145 if (Cond == ISD::SETUEQ) {
24146 CC0 = 3; // UNORD
24147 CC1 = 0; // EQ
24148 CombineOpc = X86ISD::FOR;
24149 } else {
24150 assert(Cond == ISD::SETONE);
24151 CC0 = 7; // ORD
24152 CC1 = 4; // NEQ
24153 CombineOpc = X86ISD::FAND;
24154 }
24155
24156 SDValue Cmp0, Cmp1;
24157 if (IsStrict) {
24158 Cmp0 = DAG.getNode(
24159 Opc, dl, {VT, MVT::Other},
24160 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
24161 Cmp1 = DAG.getNode(
24162 Opc, dl, {VT, MVT::Other},
24163 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
24164 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
24165 Cmp1.getValue(1));
24166 } else {
24167 Cmp0 = DAG.getNode(
24168 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
24169 Cmp1 = DAG.getNode(
24170 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
24171 }
24172 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
24173 } else {
24174 if (IsStrict) {
24175 Cmp = DAG.getNode(
24176 Opc, dl, {VT, MVT::Other},
24177 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24178 Chain = Cmp.getValue(1);
24179 } else
24180 Cmp = DAG.getNode(
24181 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24182 }
24183 } else {
24184 // Handle all other FP comparisons here.
24185 if (IsStrict) {
24186 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
24187 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
24188 Cmp = DAG.getNode(
24189 Opc, dl, {VT, MVT::Other},
24190 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24191 Chain = Cmp.getValue(1);
24192 } else
24193 Cmp = DAG.getNode(
24194 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24195 }
24196
24197 if (VT.getFixedSizeInBits() >
24198 Op.getSimpleValueType().getFixedSizeInBits()) {
24199 // We emitted a compare with an XMM/YMM result. Finish converting to a
24200 // mask register using a vptestm.
24201 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
24202 Cmp = DAG.getBitcast(CastVT, Cmp);
24203 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24204 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24205 } else {
24206 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24207 // the result type of SETCC. The bitcast is expected to be optimized
24208 // away during combining/isel.
24209 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24210 }
24211
24212 if (IsStrict)
24213 return DAG.getMergeValues({Cmp, Chain}, dl);
24214
24215 return Cmp;
24216 }
24217
24218 assert(!IsStrict && "Strict SETCC only handles FP operands.");
24219
24220 [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
24221 assert(VTOp0 == Op1.getSimpleValueType() &&
24222 "Expected operands with same type!");
24224 "Invalid number of packed elements for source and destination!");
24225
24226 // The non-AVX512 code below works under the assumption that source and
24227 // destination types are the same.
24228 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24229 "Value types for source and destination must be the same!");
24230
24231 // The result is boolean, but operands are int/float
24232 if (VT.getVectorElementType() == MVT::i1) {
24233 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
24234 // but there is no compare instruction for i8 and i16 elements in KNL.
24235 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24236 "Unexpected operand type");
24237 return LowerIntVSETCC_AVX512(Op, dl, DAG);
24238 }
24239
24240 // Lower using XOP integer comparisons.
24241 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24242 // Translate compare code to XOP PCOM compare mode.
24243 unsigned CmpMode = 0;
24244 switch (Cond) {
24245 // clang-format off
24246 default: llvm_unreachable("Unexpected SETCC condition");
24247 case ISD::SETULT:
24248 case ISD::SETLT: CmpMode = 0x00; break;
24249 case ISD::SETULE:
24250 case ISD::SETLE: CmpMode = 0x01; break;
24251 case ISD::SETUGT:
24252 case ISD::SETGT: CmpMode = 0x02; break;
24253 case ISD::SETUGE:
24254 case ISD::SETGE: CmpMode = 0x03; break;
24255 case ISD::SETEQ: CmpMode = 0x04; break;
24256 case ISD::SETNE: CmpMode = 0x05; break;
24257 // clang-format on
24258 }
24259
24260 // Are we comparing unsigned or signed integers?
24261 unsigned Opc =
24262 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
24263
24264 return DAG.getNode(Opc, dl, VT, Op0, Op1,
24265 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
24266 }
24267
24268 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
24269 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
24270 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
24271 SDValue BC0 = peekThroughBitcasts(Op0);
24272 if (BC0.getOpcode() == ISD::AND &&
24273 isConstantPowerOf2(BC0.getOperand(1), VT.getScalarSizeInBits(),
24274 /*AllowUndefs=*/false)) {
24275 Cond = ISD::SETEQ;
24276 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
24277 }
24278 }
24279
24280 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
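// e.g. for C = 1<<3 and BW = 32: SHL by 28 moves bit 3 into the sign bit,
// then SRA by 31 splats it, giving an all-ones lane exactly where the bit
// was set (i.e. where (X & C) == C).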
24281 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
24282 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
24283 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
24284 if (C1 && C1->getAPIntValue().isPowerOf2()) {
24285 unsigned BitWidth = VT.getScalarSizeInBits();
24286 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
24287
24288 SDValue Result = Op0.getOperand(0);
24289 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
24290 DAG.getConstant(ShiftAmt, dl, VT));
24291 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
24292 DAG.getConstant(BitWidth - 1, dl, VT));
24293 return Result;
24294 }
24295 }
24296
24297 // Break 256-bit integer vector compare into smaller ones.
24298 if (VT.is256BitVector() && !Subtarget.hasInt256())
24299 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24300
24301 // Break 512-bit integer vector compare into smaller ones.
24302 // TODO: Try harder to use VPCMPx + VPMOV2x?
24303 if (VT.is512BitVector())
24304 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24305
24306 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24307 // not-of-PCMPEQ:
24308 // X != INT_MIN --> X >s INT_MIN
24309 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24310 // +X != 0 --> +X >s 0
24311 APInt ConstValue;
24312 if (Cond == ISD::SETNE &&
24313 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24314 if (ConstValue.isMinSignedValue())
24315 Cond = ISD::SETGT;
24316 else if (ConstValue.isMaxSignedValue())
24317 Cond = ISD::SETLT;
24318 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24319 Cond = ISD::SETGT;
24320 }
24321
24322 // If both operands are known non-negative, then an unsigned compare is the
24323 // same as a signed compare and there's no need to flip signbits.
24324 // TODO: We could check for more general simplifications here since we're
24325 // computing known bits.
24326 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24327 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24328
24329 // Special case: Use min/max operations for unsigned compares.
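// x <=u y iff umin(x,y) == x, and x >=u y iff umax(x,y) == x, so an unsigned
// compare becomes UMIN/UMAX + PCMPEQ (plus a NOT for the strict forms).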
24330 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24331 if (ISD::isUnsignedIntSetCC(Cond) &&
24332 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24333 TLI.isOperationLegal(ISD::UMIN, VT)) {
24334 // If we have a constant operand, increment/decrement it and change the
24335 // condition to avoid an invert.
24336 if (Cond == ISD::SETUGT) {
24337 // X > C --> X >= (C+1) --> X == umax(X, C+1)
24338 if (SDValue UGTOp1 =
24339 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
24340 Op1 = UGTOp1;
24341 Cond = ISD::SETUGE;
24342 }
24343 }
24344 if (Cond == ISD::SETULT) {
24345 // X < C --> X <= (C-1) --> X == umin(X, C-1)
24346 if (SDValue ULTOp1 =
24347 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
24348 Op1 = ULTOp1;
24349 Cond = ISD::SETULE;
24350 }
24351 }
24352 bool Invert = false;
24353 unsigned Opc;
24354 switch (Cond) {
24355 // clang-format off
24356 default: llvm_unreachable("Unexpected condition code");
24357 case ISD::SETUGT: Invert = true; [[fallthrough]];
24358 case ISD::SETULE: Opc = ISD::UMIN; break;
24359 case ISD::SETULT: Invert = true; [[fallthrough]];
24360 case ISD::SETUGE: Opc = ISD::UMAX; break;
24361 // clang-format on
24362 }
24363
24364 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24365 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
24366
24367 // If the logical-not of the result is required, perform that now.
24368 if (Invert)
24369 Result = DAG.getNOT(dl, Result, VT);
24370
24371 return Result;
24372 }
24373
24374 // Try to use SUBUS and PCMPEQ.
24375 if (FlipSigns)
24376 if (SDValue V =
24377 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
24378 return V;
24379
24380 // We are handling one of the integer comparisons here. Since SSE only has
24381 // GT and EQ comparisons for integer, swapping operands and multiple
24382 // operations may be required for some comparisons.
24383 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
24384 : X86ISD::PCMPGT;
24385 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
24386 Cond == ISD::SETGE || Cond == ISD::SETUGE;
24387 bool Invert = Cond == ISD::SETNE ||
24388 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
24389
24390 if (Swap)
24391 std::swap(Op0, Op1);
24392
24393 // Check that the operation in question is available (most are plain SSE2,
24394 // but PCMPGTQ and PCMPEQQ have different requirements).
24395 if (VT == MVT::v2i64) {
24396 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
24397 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
24398
24399 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
24400 // the odd elements over the even elements.
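// 0 >s X for an i64 element depends only on its sign bit, which lives in the
// high (odd) i32 half, so a v4i32 PCMPGT against zero followed by copying
// each odd lane over its even neighbour rebuilds the 64-bit mask.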
24401 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
24402 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
24403 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24404
24405 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24406 static const int MaskHi[] = { 1, 1, 3, 3 };
24407 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24408
24409 return DAG.getBitcast(VT, Result);
24410 }
24411
24412 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
24413 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24414 Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
24415
24416 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24417 static const int MaskHi[] = { 1, 1, 3, 3 };
24418 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24419
24420 return DAG.getBitcast(VT, Result);
24421 }
24422
24423 // If the i64 elements are sign-extended enough to be representable as i32
24424 // then we can compare the lower i32 bits and splat.
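// With more than 32 sign bits, each i64 operand equals the sign-extension of
// its low i32 half, so a 32-bit signed compare of the low (even) lanes is
// exact; the {0,0,2,2} shuffle splats that result across the full element.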
24425 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
24426 DAG.ComputeNumSignBits(Op1) > 32) {
24427 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24428 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24429
24430 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24431 static const int MaskLo[] = {0, 0, 2, 2};
24432 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24433
24434 return DAG.getBitcast(VT, Result);
24435 }
24436
24437 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24438 // bits of the inputs before performing those operations. The lower
24439 // compare is always unsigned.
24440 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
24441 : 0x0000000080000000ULL,
24442 dl, MVT::v2i64);
24443
24444 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24445 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24446
24447 // Cast everything to the right type.
24448 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24449 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24450
24451 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
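// The high halves decide the 64-bit compare unless they are equal, in which
// case the low halves (made unsigned above by XORing their sign bits) decide.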
24452 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24453 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24454
24455 // Create masks for only the low parts/high parts of the 64 bit integers.
24456 static const int MaskHi[] = { 1, 1, 3, 3 };
24457 static const int MaskLo[] = { 0, 0, 2, 2 };
24458 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24459 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24460 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24461
24462 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24463 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24464
24465 if (Invert)
24466 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24467
24468 return DAG.getBitcast(VT, Result);
24469 }
24470
24471 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24472 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
24473 // pcmpeqd + pshufd + pand.
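// Two i64 elements are equal iff both of their i32 halves are equal, so AND
// the dword compare with a copy whose dwords are swapped within each qword.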
24474 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24475
24476 // First cast everything to the right type.
24477 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24478 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24479
24480 // Do the compare.
24481 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24482
24483 // Make sure the lower and upper halves are both all-ones.
24484 static const int Mask[] = { 1, 0, 3, 2 };
24485 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24486 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24487
24488 if (Invert)
24489 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24490
24491 return DAG.getBitcast(VT, Result);
24492 }
24493 }
24494
24495 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24496 // bits of the inputs before performing those operations.
24497 if (FlipSigns) {
24498 MVT EltVT = VT.getVectorElementType();
24499 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
24500 VT);
24501 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24502 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24503 }
24504
24505 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24506
24507 // If the logical-not of the result is required, perform that now.
24508 if (Invert)
24509 Result = DAG.getNOT(dl, Result, VT);
24510
24511 return Result;
24512}
24513
24514// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
24515 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
24516 const SDLoc &dl, SelectionDAG &DAG,
24517 const X86Subtarget &Subtarget,
24518 SDValue &X86CC) {
24519 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24520
24521 // Must be a bitcast from vXi1.
24522 if (Op0.getOpcode() != ISD::BITCAST)
24523 return SDValue();
24524
24525 Op0 = Op0.getOperand(0);
24526 MVT VT = Op0.getSimpleValueType();
24527 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24528 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24529 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24530 return SDValue();
24531
24532 X86::CondCode X86Cond;
24533 if (isNullConstant(Op1)) {
24534 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24535 } else if (isAllOnesConstant(Op1)) {
24536 // C flag is set for all ones.
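// KORTEST sets ZF when the OR of its operands is zero and CF when it is all
// ones, so a compare against 0 uses E/NE and a compare against all-ones uses
// B/AE.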
24537 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24538 } else
24539 return SDValue();
24540
24541 // If the input is an AND, we can combine its operands into the KTEST.
24542 bool KTestable = false;
24543 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24544 KTestable = true;
24545 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24546 KTestable = true;
24547 if (!isNullConstant(Op1))
24548 KTestable = false;
24549 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24550 SDValue LHS = Op0.getOperand(0);
24551 SDValue RHS = Op0.getOperand(1);
24552 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24553 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24554 }
24555
24556 // If the input is an OR, we can combine its operands into the KORTEST.
24557 SDValue LHS = Op0;
24558 SDValue RHS = Op0;
24559 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24560 LHS = Op0.getOperand(0);
24561 RHS = Op0.getOperand(1);
24562 }
24563
24564 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24565 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24566}
24567
24568/// Emit flags for the given setcc condition and operands. Also returns the
24569/// corresponding X86 condition code constant in X86CC.
24570SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24571 ISD::CondCode CC, const SDLoc &dl,
24572 SelectionDAG &DAG,
24573 SDValue &X86CC) const {
24574 // Equality Combines.
24575 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
24576 X86::CondCode X86CondCode;
24577
24578 // Optimize to BT if possible.
24579 // Lower (X & (1 << N)) == 0 to BT(X, N).
24580 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24581 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
24582 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
24583 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24584 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24585 return BT;
24586 }
24587 }
24588
24589 // Try to use PTEST/PMOVMSKB for a tree of ANDs/ORs equality-compared with -1/0.
24590 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
24591 X86CondCode)) {
24592 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24593 return CmpZ;
24594 }
24595
24596 // Try to lower using KORTEST or KTEST.
24597 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24598 return Test;
24599
24600 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
24601 // of these.
24602 if (isOneConstant(Op1) || isNullConstant(Op1)) {
24603 // If the input is a setcc, then reuse the input setcc or use a new one
24604 // with the inverted condition.
24605 if (Op0.getOpcode() == X86ISD::SETCC) {
24606 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24607
24608 X86CC = Op0.getOperand(0);
24609 if (Invert) {
24610 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24611 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
24612 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24613 }
24614
24615 return Op0.getOperand(1);
24616 }
24617 }
24618
24619 // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
24620 // overflow.
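// NEG (0 - X) sets OF only when X == INT_MIN, so the compare reduces to
// testing the overflow flag of the subtraction.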
24621 if (isMinSignedConstant(Op1)) {
24622 EVT VT = Op0.getValueType();
24623 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
24624 SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
24625 X86::CondCode CondCode = CC == ISD::SETEQ ? X86::COND_O : X86::COND_NO;
24626 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24627 SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
24628 DAG.getConstant(0, dl, VT), Op0);
24629 return SDValue(Neg.getNode(), 1);
24630 }
24631 }
24632
24633 // Try to use the carry flag from the add in place of a separate CMP for:
24634 // (seteq (add X, -1), -1). Similar for setne.
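// X + (-1) produces a carry exactly when X != 0, and (X + -1) == -1 iff
// X == 0, so the equality maps to the no-carry condition (AE) and the
// inequality to carry (B).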
24635 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24636 Op0.getOperand(1) == Op1) {
24637 if (isProfitableToUseFlagOp(Op0)) {
24638 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24639
24640 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24641 Op0.getOperand(1));
24642 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24643 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24644 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24645 return SDValue(New.getNode(), 1);
24646 }
24647 }
24648 }
24649
24650 X86::CondCode CondCode =
24651 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24652 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24653
24654 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24655 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24656 return EFLAGS;
24657}
24658
24659SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24660
24661 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24662 Op.getOpcode() == ISD::STRICT_FSETCCS;
24663 MVT VT = Op->getSimpleValueType(0);
24664
24665 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24666
24667 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24668 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24669 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24670 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24671 SDLoc dl(Op);
24672 ISD::CondCode CC =
24673 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24674
24675 if (isSoftF16(Op0.getValueType(), Subtarget))
24676 return SDValue();
24677
24678 // Handle f128 first, since one possible outcome is a normal integer
24679 // comparison which gets handled by emitFlagsForSetcc.
24680 if (Op0.getValueType() == MVT::f128) {
24681 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24682 Op.getOpcode() == ISD::STRICT_FSETCCS);
24683
24684 // If softenSetCCOperands returned a scalar, use it.
24685 if (!Op1.getNode()) {
24686 assert(Op0.getValueType() == Op.getValueType() &&
24687 "Unexpected setcc expansion!");
24688 if (IsStrict)
24689 return DAG.getMergeValues({Op0, Chain}, dl);
24690 return Op0;
24691 }
24692 }
24693
24694 if (Op0.getSimpleValueType().isInteger()) {
24695 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
24696 // reduces the number of EFLAGs bit reads (the GE conditions don't read ZF),
24697 // this may translate to fewer uops depending on uarch implementation. The
24698 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24699 // canonicalize to that CondCode.
24700 // NOTE: Only do this if incrementing the constant doesn't increase the bit
24701 // encoding size - so it must either already be an i8 or i32 immediate, or it
24702 // shrinks down to that. We don't do this for any i64's to avoid additional
24703 // constant materializations.
24704 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
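// e.g. "x >s 127" is left alone because rewriting it as "x >=s 128" would
// grow an imm8 into an imm32 encoding.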
24705 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24706 const APInt &Op1Val = Op1C->getAPIntValue();
24707 if (!Op1Val.isZero()) {
24708 // Ensure the constant+1 doesn't overflow.
24709 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24710 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24711 APInt Op1ValPlusOne = Op1Val + 1;
24712 if (Op1ValPlusOne.isSignedIntN(32) &&
24713 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24714 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24715 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
24716 : ISD::CondCode::SETUGE;
24717 }
24718 }
24719 }
24720 }
24721
24722 SDValue X86CC;
24723 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24724 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24725 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24726 }
24727
24728 if (Subtarget.hasAVX10_2()) {
24729 if (CC == ISD::SETOEQ || CC == ISD::SETUNE) {
24730 auto NewCC = (CC == ISD::SETOEQ) ? X86::COND_E : (X86::COND_NE);
24731 assert(Op0.getSimpleValueType() != MVT::bf16 && "Unsupported Type");
24732 if (Op0.getSimpleValueType() != MVT::f80) {
24733 SDValue Res = getSETCC(
24734 NewCC, DAG.getNode(X86ISD::UCOMX, dl, MVT::i32, Op0, Op1), dl, DAG);
24735 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24736 }
24737 }
24738 }
24739 // Handle floating point.
24740 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24741 if (CondCode == X86::COND_INVALID)
24742 return SDValue();
24743
24744 SDValue EFLAGS;
24745 if (IsStrict) {
24746 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24747 EFLAGS =
24748 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24749 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24750 Chain = EFLAGS.getValue(1);
24751 } else {
24752 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24753 }
24754
24755 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24756 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24757 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24758}
24759
24760SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24761 SDValue LHS = Op.getOperand(0);
24762 SDValue RHS = Op.getOperand(1);
24763 SDValue Carry = Op.getOperand(2);
24764 SDValue Cond = Op.getOperand(3);
24765 SDLoc DL(Op);
24766
24767 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24768 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24769
24770 // Recreate the carry if needed.
24771 EVT CarryVT = Carry.getValueType();
24772 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24773 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24774
24775 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24776 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24777 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24778}
24779
24780// This function returns three things: the arithmetic computation itself
24781// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24782// flag and the condition code define the case in which the arithmetic
24783// computation overflows.
24784static std::pair<SDValue, SDValue>
24785 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
24786 assert(Op.getResNo() == 0 && "Unexpected result number!");
24787 SDValue Value, Overflow;
24788 SDValue LHS = Op.getOperand(0);
24789 SDValue RHS = Op.getOperand(1);
24790 unsigned BaseOp = 0;
24791 SDLoc DL(Op);
24792 switch (Op.getOpcode()) {
24793 default: llvm_unreachable("Unknown ovf instruction!");
24794 case ISD::SADDO:
24795 BaseOp = X86ISD::ADD;
24796 Cond = X86::COND_O;
24797 break;
24798 case ISD::UADDO:
24799 BaseOp = X86ISD::ADD;
24800 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
24801 break;
24802 case ISD::SSUBO:
24803 BaseOp = X86ISD::SUB;
24804 Cond = X86::COND_O;
24805 break;
24806 case ISD::USUBO:
24807 BaseOp = X86ISD::SUB;
24808 Cond = X86::COND_B;
24809 break;
24810 case ISD::SMULO:
24811 BaseOp = X86ISD::SMUL;
24812 Cond = X86::COND_O;
24813 break;
24814 case ISD::UMULO:
24815 BaseOp = X86ISD::UMUL;
24816 Cond = X86::COND_O;
24817 break;
24818 }
24819
24820 if (BaseOp) {
24821 // Also sets EFLAGS.
24822 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24823 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24824 Overflow = Value.getValue(1);
24825 }
24826
24827 return std::make_pair(Value, Overflow);
24828}
24829
24830 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24831 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24832 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24833 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24834 // has only one use.
24835 SDLoc DL(Op);
24836 X86::CondCode Cond;
24837 SDValue Value, Overflow;
24838 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24839
24840 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24841 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24842 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24843}
24844
24845/// Return true if opcode is a X86 logical comparison.
24846 static bool isX86LogicalCmp(SDValue Op) {
24847 unsigned Opc = Op.getOpcode();
24848 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24849 Opc == X86ISD::FCMP)
24850 return true;
24851 if (Op.getResNo() == 1 &&
24852 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24853 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24854 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24855 return true;
24856
24857 return false;
24858}
24859
24860 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24861 if (V.getOpcode() != ISD::TRUNCATE)
24862 return false;
24863
24864 SDValue VOp0 = V.getOperand(0);
24865 unsigned InBits = VOp0.getValueSizeInBits();
24866 unsigned Bits = V.getValueSizeInBits();
24867 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24868}
24869
24870// Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
24871 static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS,
24872 unsigned X86CC, const SDLoc &DL,
24873 SelectionDAG &DAG,
24874 const X86Subtarget &Subtarget) {
24875 EVT CmpVT = CmpVal.getValueType();
24876 EVT VT = LHS.getValueType();
24877 if (!CmpVT.isScalarInteger() || !VT.isScalarInteger())
24878 return SDValue();
24879
24880 if (X86CC == X86::COND_E && CmpVal.getOpcode() == ISD::AND &&
24881 isOneConstant(CmpVal.getOperand(1))) {
24882 auto SplatLSB = [&](EVT SplatVT) {
24883 // we need mask of all zeros or ones with same size of the other
24884 // operands.
24885 SDValue Neg = CmpVal;
24886 if (CmpVT.bitsGT(SplatVT))
24887 Neg = DAG.getNode(ISD::TRUNCATE, DL, SplatVT, CmpVal);
24888 else if (CmpVT.bitsLT(SplatVT))
24889 Neg = DAG.getNode(
24890 ISD::AND, DL, SplatVT,
24891 DAG.getNode(ISD::ANY_EXTEND, DL, SplatVT, CmpVal.getOperand(0)),
24892 DAG.getConstant(1, DL, SplatVT));
24893 return DAG.getNegative(Neg, DL, SplatVT); // -(and (x, 0x1))
24894 };
24895
24896 // SELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
24897 if (isNullConstant(LHS) && isAllOnesConstant(RHS))
24898 return SplatLSB(VT);
24899
24900 // SELECT (AND(X,1) == 0), C1, C2 -> XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2))
24901 if (!Subtarget.canUseCMOV() && isa<ConstantSDNode>(LHS) &&
24902 isa<ConstantSDNode>(RHS)) {
24903 SDValue Mask = SplatLSB(VT);
24904 SDValue Diff = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24905 SDValue Flip = DAG.getNode(ISD::AND, DL, VT, Mask, Diff);
24906 return DAG.getNode(ISD::XOR, DL, VT, LHS, Flip);
24907 }
24908
24909 SDValue Src1, Src2;
24910 auto isIdentityPatternZero = [&]() {
24911 switch (RHS.getOpcode()) {
24912 default:
24913 break;
24914 case ISD::OR:
24915 case ISD::XOR:
24916 case ISD::ADD:
24917 if (RHS.getOperand(0) == LHS || RHS.getOperand(1) == LHS) {
24918 Src1 = RHS.getOperand(RHS.getOperand(0) == LHS ? 1 : 0);
24919 Src2 = LHS;
24920 return true;
24921 }
24922 break;
24923 case ISD::SHL:
24924 case ISD::SRA:
24925 case ISD::SRL:
24926 case ISD::SUB:
24927 if (RHS.getOperand(0) == LHS) {
24928 Src1 = RHS.getOperand(1);
24929 Src2 = LHS;
24930 return true;
24931 }
24932 break;
24933 }
24934 return false;
24935 };
24936
24937 auto isIdentityPatternOnes = [&]() {
24938 switch (LHS.getOpcode()) {
24939 default:
24940 break;
24941 case ISD::AND:
24942 if (LHS.getOperand(0) == RHS || LHS.getOperand(1) == RHS) {
24943 Src1 = LHS.getOperand(LHS.getOperand(0) == RHS ? 1 : 0);
24944 Src2 = RHS;
24945 return true;
24946 }
24947 break;
24948 }
24949 return false;
24950 };
24951
24952 // Convert 'identity' patterns (iff X is 0 or 1):
24953 // SELECT (AND(X,1) == 0), Y, (OR Y, Z) -> (OR Y, (AND NEG(AND(X,1)), Z))
24954 // SELECT (AND(X,1) == 0), Y, (XOR Y, Z) -> (XOR Y, (AND NEG(AND(X,1)), Z))
24955 // SELECT (AND(X,1) == 0), Y, (ADD Y, Z) -> (ADD Y, (AND NEG(AND(X,1)), Z))
24956 // SELECT (AND(X,1) == 0), Y, (SUB Y, Z) -> (SUB Y, (AND NEG(AND(X,1)), Z))
24957 // SELECT (AND(X,1) == 0), Y, (SHL Y, Z) -> (SHL Y, (AND NEG(AND(X,1)), Z))
24958 // SELECT (AND(X,1) == 0), Y, (SRA Y, Z) -> (SRA Y, (AND NEG(AND(X,1)), Z))
24959 // SELECT (AND(X,1) == 0), Y, (SRL Y, Z) -> (SRL Y, (AND NEG(AND(X,1)), Z))
24960 if (!Subtarget.canUseCMOV() && isIdentityPatternZero()) {
24961 SDValue Mask = SplatLSB(Src1.getValueType());
24962 SDValue And = DAG.getNode(ISD::AND, DL, Src1.getValueType(), Mask,
24963 Src1); // Mask & z
24964 return DAG.getNode(RHS.getOpcode(), DL, VT, Src2, And); // y Op And
24965 }
24966 // SELECT (AND(X,1) == 0), (AND Y, Z), Y -> (AND Y, (OR NEG(AND(X, 1)), Z))
24967 if (!Subtarget.canUseCMOV() && isIdentityPatternOnes()) {
24968 SDValue Mask = SplatLSB(VT);
24969 SDValue Or = DAG.getNode(ISD::OR, DL, VT, Mask, Src1); // Mask | z
24970 return DAG.getNode(LHS.getOpcode(), DL, VT, Src2, Or); // y Op Or
24971 }
24972 }
24973
24974 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
24975 (isAllOnesConstant(LHS) || isAllOnesConstant(RHS))) {
24976 SDValue Y = isAllOnesConstant(RHS) ? LHS : RHS;
24977 SDVTList CmpVTs = DAG.getVTList(CmpVT, MVT::i32);
24978
24979 // 'X - 1' sets the carry flag if X == 0.
24980 // '0 - X' sets the carry flag if X != 0.
24981 // Convert the carry flag to a -1/0 mask with sbb:
24982 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24983 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24984 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24985 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
24986 SDValue Sub;
24987 if (isAllOnesConstant(LHS) == (X86CC == X86::COND_NE)) {
24988 SDValue Zero = DAG.getConstant(0, DL, CmpVT);
24989 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpVal);
24990 } else {
24991 SDValue One = DAG.getConstant(1, DL, CmpVT);
24992 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpVal, One);
24993 }
24994 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24995 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24996 Sub.getValue(1));
24997 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24998 }
24999
25000 return SDValue();
25001}
25002
25003SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
25004 bool AddTest = true;
25005 SDValue Cond = Op.getOperand(0);
25006 SDValue Op1 = Op.getOperand(1);
25007 SDValue Op2 = Op.getOperand(2);
25008 SDLoc DL(Op);
25009 MVT VT = Op1.getSimpleValueType();
25010 SDValue CC;
25011
25012 if (isSoftF16(VT, Subtarget)) {
25013 MVT NVT = VT.changeTypeToInteger();
25014 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
25015 DAG.getBitcast(NVT, Op1),
25016 DAG.getBitcast(NVT, Op2)));
25017 }
25018
25019 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
25020 // are available or VBLENDV if AVX is available.
25021 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
25022 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
25023 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
25024 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
25025 bool IsAlwaysSignaling;
25026 unsigned SSECC =
25027 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
25028 CondOp0, CondOp1, IsAlwaysSignaling);
25029
25030 if (Subtarget.hasAVX512()) {
25031 SDValue Cmp =
25032 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
25033 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25034 assert(!VT.isVector() && "Not a scalar type?");
25035 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25036 }
25037
25038 if (SSECC < 8 || Subtarget.hasAVX()) {
25039 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
25040 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25041
25042 // If we have SSE41/AVX, we can use a variable vector select (VBLENDV)
25043 // instead of 3 logic instructions for size savings and potentially speed.
25044 // Unfortunately, there is no scalar form of VBLENDV.
25045 //
25046 // If either operand is a +0.0 constant, don't try this. We can expect to
25047 // optimize away at least one of the logic instructions later in that
25048 // case, so that sequence would be faster than a variable blend.
25049 if (Subtarget.hasSSE41() && !isNullFPConstant(Op1) &&
25050 !isNullFPConstant(Op2)) {
25051 // Convert to vectors, do a VSELECT, and convert back to scalar.
25052 // All of the conversions should be optimized away.
25053 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
25054 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
25055 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
25056 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
25057
25058 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
25059 VCmp = DAG.getBitcast(VCmpVT, VCmp);
25060
25061 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
25062
25063 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
25064 DAG.getVectorIdxConstant(0, DL));
25065 }
25066 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
25067 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
25068 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
25069 }
25070 }
25071
25072 // AVX512 fallback is to lower selects of scalar floats to masked moves.
25073 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
25074 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
25075 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25076 }
25077
25078 if (Cond.getOpcode() == ISD::SETCC &&
25079 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
25080 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
25081 Cond = NewCond;
25082 // If the condition was updated, it's possible that the operands of the
25083 // select were also updated (for example, EmitTest has a RAUW). Refresh
25084 // the local references to the select operands in case they got stale.
25085 Op1 = Op.getOperand(1);
25086 Op2 = Op.getOperand(2);
25087 }
25088 }
25089
25090 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
25091 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
25092 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
25093 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
25094 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
25095 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
25096 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25097 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25098 if (Cond.getOpcode() == X86ISD::SETCC &&
25099 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
25100 isNullConstant(Cond.getOperand(1).getOperand(1))) {
25101 SDValue Cmp = Cond.getOperand(1);
25102 SDValue CmpOp0 = Cmp.getOperand(0);
25103 unsigned CondCode = Cond.getConstantOperandVal(0);
25104
25105 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
25106 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
25107 // handling to keep the CMP with 0. This should be removed by
25108 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
25109 // cttz_zero_undef.
25110 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
25111 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
25112 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
25113 };
25114 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
25115 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
25116 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
25117 // Keep Cmp.
25118 } else if (SDValue R = LowerSELECTWithCmpZero(CmpOp0, Op1, Op2, CondCode,
25119 DL, DAG, Subtarget)) {
25120 return R;
25121 } else if (VT.isScalarInteger() && isNullConstant(Op2) &&
25122 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
25123 ((CondCode == X86::COND_S) || // smin(x, 0)
25124 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
25125 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25126 //
25127 // If the comparison is testing for a positive value, we have to invert
25128 // the sign bit mask, so only do that transform if the target has a
25129 // bitwise 'and not' instruction (the invert is free).
25130 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25131 unsigned ShCt = VT.getSizeInBits() - 1;
25132 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
25133 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
25134 if (CondCode == X86::COND_G)
25135 Shift = DAG.getNOT(DL, Shift, VT);
25136 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
25137 }
25138 }
25139
25140 // Look past (and (setcc_carry (cmp ...)), 1).
25141 if (Cond.getOpcode() == ISD::AND &&
25142 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
25143 isOneConstant(Cond.getOperand(1)))
25144 Cond = Cond.getOperand(0);
25145
25146 // Attempt to fold "raw cond" cases by treating them as:
25147 // (select (and X, 1), Op1, Op2 --> (select (icmpeq (and X, 1), 0), Op2, Op1)
25148 if (Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))
25149 if (SDValue R = LowerSELECTWithCmpZero(Cond, Op2, Op1, X86::COND_E, DL, DAG,
25150 Subtarget))
25151 return R;
25152
25153 // If condition flag is set by a X86ISD::CMP, then use it as the condition
25154 // setting operand in place of the X86ISD::SETCC.
25155 unsigned CondOpcode = Cond.getOpcode();
25156 if (CondOpcode == X86ISD::SETCC ||
25157 CondOpcode == X86ISD::SETCC_CARRY) {
25158 CC = Cond.getOperand(0);
25159
25160 SDValue Cmp = Cond.getOperand(1);
25161 bool IllegalFPCMov = false;
25162 if (VT.isFloatingPoint() && !VT.isVector() &&
25163 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
25164 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
25165
25166 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
25167 Cmp.getOpcode() == X86ISD::BT) { // FIXME
25168 Cond = Cmp;
25169 AddTest = false;
25170 }
25171 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
25172 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
25173 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
25174 SDValue Value;
25175 X86::CondCode X86Cond;
25176 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25177
25178 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
25179 AddTest = false;
25180 }
25181
25182 if (AddTest) {
25183 // Look past the truncate if the high bits are known zero.
25184 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25185 Cond = Cond.getOperand(0);
25186
25187 // We know the result of AND is compared against zero. Try to match
25188 // it to BT.
25189 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
25190 X86::CondCode X86CondCode;
25191 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
25192 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
25193 Cond = BT;
25194 AddTest = false;
25195 }
25196 }
25197 }
25198
25199 if (AddTest) {
25200 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
25201 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
25202 }
25203
25204 // a < b ? -1 : 0 -> RES = ~setcc_carry
25205 // a < b ? 0 : -1 -> RES = setcc_carry
25206 // a >= b ? -1 : 0 -> RES = setcc_carry
25207 // a >= b ? 0 : -1 -> RES = ~setcc_carry
25208 if (Cond.getOpcode() == X86ISD::SUB) {
25209 unsigned CondCode = CC->getAsZExtVal();
25210
25211 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
25212 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25213 (isNullConstant(Op1) || isNullConstant(Op2))) {
25214 SDValue Res =
25215 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
25216 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
25217 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
25218 return DAG.getNOT(DL, Res, Res.getValueType());
25219 return Res;
25220 }
25221 }
25222
25223 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
25224 // widen the cmov and push the truncate through. This avoids introducing a new
25225 // branch during isel and doesn't add any extensions.
25226 if (Op.getValueType() == MVT::i8 &&
25227 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
25228 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
25229 if (T1.getValueType() == T2.getValueType() &&
25230 // Exclude CopyFromReg to avoid partial register stalls.
25231 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
25232 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
25233 CC, Cond);
25234 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25235 }
25236 }
25237
25238 // Or finally, promote i8 cmovs if we have CMOV,
25239 // or i16 cmovs if it won't prevent folding a load.
25240 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
25241 // legal, but EmitLoweredSelect() can not deal with these extensions
25242 // being inserted between two CMOV's. (in i16 case too TBN)
25243 // https://bugs.llvm.org/show_bug.cgi?id=40974
25244 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
25245 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
25246 !X86::mayFoldLoad(Op2, Subtarget))) {
25247 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
25248 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
25249 SDValue Ops[] = { Op2, Op1, CC, Cond };
25250 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
25251 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25252 }
25253
25254 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
25255 // condition is true.
25256 SDValue Ops[] = { Op2, Op1, CC, Cond };
25257 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
25258}
25259
25260 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
25261 const X86Subtarget &Subtarget,
25262 SelectionDAG &DAG) {
25263 MVT VT = Op->getSimpleValueType(0);
25264 SDValue In = Op->getOperand(0);
25265 MVT InVT = In.getSimpleValueType();
25266 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
25267 MVT VTElt = VT.getVectorElementType();
25268 unsigned NumElts = VT.getVectorNumElements();
25269
25270 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
25271 MVT ExtVT = VT;
25272 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
25273 // If v16i32 is to be avoided, we'll need to split and concatenate.
25274 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
25275 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
25276
25277 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
25278 }
25279
25280 // Widen to 512-bits if VLX is not supported.
25281 MVT WideVT = ExtVT;
25282 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
25283 NumElts *= 512 / ExtVT.getSizeInBits();
25284 InVT = MVT::getVectorVT(MVT::i1, NumElts);
25285 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In,
25286 DAG.getVectorIdxConstant(0, dl));
25287 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
25288 }
25289
25290 SDValue V;
25291 MVT WideEltVT = WideVT.getVectorElementType();
25292 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
25293 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
25294 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
25295 } else {
25296 SDValue NegOne = DAG.getAllOnesConstant(dl, WideVT);
25297 SDValue Zero = DAG.getConstant(0, dl, WideVT);
25298 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
25299 }
25300
25301 // Truncate if we had to extend i16/i8 above.
25302 if (VT != ExtVT) {
25303 WideVT = MVT::getVectorVT(VTElt, NumElts);
25304 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
25305 }
25306
25307 // Extract back to 128/256-bit if we widened.
25308 if (WideVT != VT)
25309 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
25310 DAG.getVectorIdxConstant(0, dl));
25311
25312 return V;
25313}
25314
25315 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25316 SelectionDAG &DAG) {
25317 SDValue In = Op->getOperand(0);
25318 MVT InVT = In.getSimpleValueType();
25319 SDLoc DL(Op);
25320
25321 if (InVT.getVectorElementType() == MVT::i1)
25322 return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
25323
25324 assert(Subtarget.hasAVX() && "Expected AVX support");
25325 return LowerAVXExtend(Op, DL, DAG, Subtarget);
25326}
25327
25328// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25329// For sign extend this needs to handle all vector sizes and SSE4.1 and
25330// non-SSE4.1 targets. For zero extend this should only handle inputs of
25331// MVT::v64i8 when BWI is not supported, but AVX512 is.
25332 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
25333 const X86Subtarget &Subtarget,
25334 SelectionDAG &DAG) {
25335 SDValue In = Op->getOperand(0);
25336 MVT VT = Op->getSimpleValueType(0);
25337 MVT InVT = In.getSimpleValueType();
25338
25339 MVT SVT = VT.getVectorElementType();
25340 MVT InSVT = InVT.getVectorElementType();
25342
25343 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25344 return SDValue();
25345 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25346 return SDValue();
25347 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25348 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25349 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25350 return SDValue();
25351
25352 SDLoc dl(Op);
25353 unsigned Opc = Op.getOpcode();
25354 unsigned NumElts = VT.getVectorNumElements();
25355
25356 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25357 // For 512-bit vectors, we need 128-bits or 256-bits.
25358 if (InVT.getSizeInBits() > 128) {
25359 // Input needs to be at least the same number of elements as output, and
25360 // at least 128-bits.
25361 int InSize = InSVT.getSizeInBits() * NumElts;
25362 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25363 InVT = In.getSimpleValueType();
25364 }
25365
25366 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
25367 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
25368 // need to be handled here for 256/512-bit results.
25369 if (Subtarget.hasInt256()) {
25370 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25371
25372 if (InVT.getVectorNumElements() != NumElts)
25373 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25374
25375 // FIXME: Apparently we create inreg operations that could be regular
25376 // extends.
25377 unsigned ExtOpc =
25378 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
25379 : ISD::ZERO_EXTEND;
25380 return DAG.getNode(ExtOpc, dl, VT, In);
25381 }
25382
25383 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25384 if (Subtarget.hasAVX()) {
25385 assert(VT.is256BitVector() && "256-bit vector expected");
25386 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25387 int HalfNumElts = HalfVT.getVectorNumElements();
25388
25389 unsigned NumSrcElts = InVT.getVectorNumElements();
25390 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25391 for (int i = 0; i != HalfNumElts; ++i)
25392 HiMask[i] = HalfNumElts + i;
25393
25394 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25395 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25396 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25397 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25398 }
25399
25400 // We should only get here for sign extend.
25401 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25402 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25403 unsigned InNumElts = InVT.getVectorNumElements();
25404
25405 // If the source elements are already all-signbits, we don't need to extend,
25406 // just splat the elements.
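// Each input element is already 0 or -1, so repeating it Scale times yields
// exactly the sign-extended value without any shifts.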
25407 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
25408 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
25409 unsigned Scale = InNumElts / NumElts;
25410 SmallVector<int, 16> ShuffleMask;
25411 for (unsigned I = 0; I != NumElts; ++I)
25412 ShuffleMask.append(Scale, I);
25413 return DAG.getBitcast(VT,
25414 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
25415 }
25416
25417 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25418 SDValue Curr = In;
25419 SDValue SignExt = Curr;
25420
25421 // As SRAI is only available on i16/i32 types, we expand only up to i32
25422 // and handle i64 separately.
25423 if (InVT != MVT::v4i32) {
25424 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25425
25426 unsigned DestWidth = DestVT.getScalarSizeInBits();
25427 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25428 unsigned DestElts = DestVT.getVectorNumElements();
25429
25430 // Build a shuffle mask that takes each input element and places it in the
25431 // MSBs of the new element size.
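// e.g. v16i8 -> v4i32 uses Scale = 4 and places input bytes 0..3 at byte
// positions 3, 7, 11 and 15, so the arithmetic shift below fills in the
// sign bits.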
25432 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25433 for (unsigned i = 0; i != DestElts; ++i)
25434 Mask[i * Scale + (Scale - 1)] = i;
25435
25436 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25437 Curr = DAG.getBitcast(DestVT, Curr);
25438
25439 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25440 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25441 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25442 }
25443
25444 if (VT == MVT::v2i64) {
25445 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25446 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25447 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25448 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25449 SignExt = DAG.getBitcast(VT, SignExt);
25450 }
25451
25452 return SignExt;
25453}
25454
25456 SelectionDAG &DAG) {
25457 MVT VT = Op->getSimpleValueType(0);
25458 SDValue In = Op->getOperand(0);
25459 MVT InVT = In.getSimpleValueType();
25460 SDLoc dl(Op);
25461
25462 if (InVT.getVectorElementType() == MVT::i1)
25463 return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
25464
25465 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25467 "Expected same number of elements");
25468 assert((VT.getVectorElementType() == MVT::i16 ||
25469 VT.getVectorElementType() == MVT::i32 ||
25470 VT.getVectorElementType() == MVT::i64) &&
25471 "Unexpected element type");
25472 assert((InVT.getVectorElementType() == MVT::i8 ||
25473 InVT.getVectorElementType() == MVT::i16 ||
25474 InVT.getVectorElementType() == MVT::i32) &&
25475 "Unexpected element type");
25476
25477 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25478 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25479 return splitVectorIntUnary(Op, DAG, dl);
25480 }
25481
25482 if (Subtarget.hasInt256())
25483 return Op;
25484
25485 // Optimize vectors in AVX mode
25486 // Sign extend v8i16 to v8i32 and
25487 // v4i32 to v4i64
25488 //
25489 // Divide input vector into two parts
25490 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
25491 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
25492 // concat the vectors to original VT
25493 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25494 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25495
25496 unsigned NumElems = InVT.getVectorNumElements();
25497 SmallVector<int,8> ShufMask(NumElems, -1);
25498 for (unsigned i = 0; i != NumElems/2; ++i)
25499 ShufMask[i] = i + NumElems/2;
25500
25501 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25502 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25503
25504 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25505}
25506
25507/// Change a vector store into a pair of half-size vector stores.
25508 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
25509 SDValue StoredVal = Store->getValue();
25510 assert((StoredVal.getValueType().is256BitVector() ||
25511 StoredVal.getValueType().is512BitVector()) &&
25512 "Expecting 256/512-bit op");
25513
25514 // Splitting volatile memory ops is not allowed unless the operation was not
25515 // legal to begin with. Assume the input store is legal (this transform is
25516 // only used for targets with AVX). Note: It is possible that we have an
25517 // illegal type like v2i128, and so we could allow splitting a volatile store
25518 // in that case if that is important.
25519 if (!Store->isSimple())
25520 return SDValue();
25521
25522 SDLoc DL(Store);
25523 SDValue Value0, Value1;
25524 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25525 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25526 SDValue Ptr0 = Store->getBasePtr();
25527 SDValue Ptr1 =
25528 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
25529 SDValue Ch0 =
25530 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25531 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25532 SDValue Ch1 =
25533 DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25534 Store->getPointerInfo().getWithOffset(HalfOffset),
25535 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25536 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25537}
25538
25539/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
25540/// type.
25541 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
25542 SelectionDAG &DAG) {
25543 SDValue StoredVal = Store->getValue();
25544 assert(StoreVT.is128BitVector() &&
25545 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25546 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25547
25548 // Splitting volatile memory ops is not allowed unless the operation was not
25549 // legal to begin with. We are assuming the input op is legal (this transform
25550 // is only used for targets with AVX).
25551 if (!Store->isSimple())
25552 return SDValue();
25553
25554 MVT StoreSVT = StoreVT.getScalarType();
25555 unsigned NumElems = StoreVT.getVectorNumElements();
25556 unsigned ScalarSize = StoreSVT.getStoreSize();
25557
25558 SDLoc DL(Store);
25559 SmallVector<SDValue, 4> Stores;
25560 for (unsigned i = 0; i != NumElems; ++i) {
25561 unsigned Offset = i * ScalarSize;
25562 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25563 TypeSize::getFixed(Offset), DL);
25564 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25565 DAG.getVectorIdxConstant(i, DL));
25566 SDValue Ch =
25567 DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25568 Store->getPointerInfo().getWithOffset(Offset),
25569 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25570 Stores.push_back(Ch);
25571 }
25572 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25573}
25574
25575static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25576 SelectionDAG &DAG) {
25577 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25578 SDLoc dl(St);
25579 SDValue StoredVal = St->getValue();
25580
25581 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25582 if (StoredVal.getValueType().isVector() &&
25583 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25584 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25585 assert(NumElts <= 8 && "Unexpected VT");
25586 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25587 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25588 "Expected AVX512F without AVX512DQI");
25589
25590 // We must pad with zeros to ensure we store zeroes to any unused bits.
25591 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25592 DAG.getUNDEF(MVT::v16i1), StoredVal,
25593 DAG.getVectorIdxConstant(0, dl));
25594 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25595 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25596 // Make sure we store zeros in the extra bits.
25597 if (NumElts < 8)
25598 StoredVal = DAG.getZeroExtendInReg(
25599 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25600
25601 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25602 St->getPointerInfo(), St->getBaseAlign(),
25603 St->getMemOperand()->getFlags());
25604 }
25605
25606 if (St->isTruncatingStore())
25607 return SDValue();
25608
25609 // If this is a 256/512-bit store of concatenated ops, we are better off
25610 // splitting that store into two half-size stores. This avoids spurious use of
25611 // concatenated ops and each half can execute independently. Some cores would
25612 // split the op into halves anyway, so the concat is purely an extra op.
25613 MVT StoreVT = StoredVal.getSimpleValueType();
25614 if (StoreVT.is256BitVector() || StoreVT.is512BitVector()) {
25615 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal, DAG))
25616 return splitVectorStore(St, DAG);
25617 return SDValue();
25618 }
25619
25620 if (StoreVT.is32BitVector())
25621 return SDValue();
25622
25623 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25624 assert(StoreVT.is64BitVector() && "Unexpected VT");
25625 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25627 "Unexpected type action!");
25628
25629 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25630 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25631 DAG.getUNDEF(StoreVT));
25632
25633 if (Subtarget.hasSSE2()) {
25634 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25635 // and store it.
25636 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25637 MVT CastVT = MVT::getVectorVT(StVT, 2);
25638 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25639 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25640 DAG.getVectorIdxConstant(0, dl));
25641
25642 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25643 St->getPointerInfo(), St->getBaseAlign(),
25644 St->getMemOperand()->getFlags());
25645 }
25646 assert(Subtarget.hasSSE1() && "Expected SSE");
25647 SDVTList Tys = DAG.getVTList(MVT::Other);
25648 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25649 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25650 St->getMemOperand());
25651}
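// Illustrative sketch, not part of the source: on an AVX512F-only target a
// plain `store <4 x i1> %k, ptr %p` takes the v*i1 path above and becomes
// roughly
//   t0: v16i1 = insert_subvector undef, %k, 0
//   t1: i16   = bitcast t0
//   t2: i8    = truncate t1
//   t3: i8    = and t2, 0xf          // zero-extend-in-reg of the low 4 bits
//   store i8 t3, ptr %p
// i.e. the mask is padded to 16 bits, narrowed to a byte, and the unused high
// bits are cleared before the scalar store.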
25652
25653// Lower vector extended loads using a shuffle. If SSSE3 is not available we
25654// may emit an illegal shuffle but the expansion is still better than scalar
25655// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
25656 // we'll emit a shuffle and an arithmetic shift.
25657// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25658// TODO: It is possible to support ZExt by zeroing the undef values during
25659// the shuffle phase or after the shuffle.
25660static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25661 SelectionDAG &DAG) {
25662 MVT RegVT = Op.getSimpleValueType();
25663 assert(RegVT.isVector() && "We only custom lower vector loads.");
25664 assert(RegVT.isInteger() &&
25665 "We only custom lower integer vector loads.");
25666
25667 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25668 SDLoc dl(Ld);
25669
25670 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25671 if (RegVT.getVectorElementType() == MVT::i1) {
25672 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25673 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25674 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25675 "Expected AVX512F without AVX512DQI");
25676
25677 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25678 Ld->getPointerInfo(), Ld->getBaseAlign(),
25679 Ld->getMemOperand()->getFlags());
25680
25681 // Replace chain users with the new chain.
25682 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25683
25684 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25685 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25686 DAG.getBitcast(MVT::v16i1, Val),
25687 DAG.getVectorIdxConstant(0, dl));
25688 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25689 }
25690
25691 return SDValue();
25692}
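// Illustrative sketch, not part of the source: the matching v*i1 load, e.g.
// `load <8 x i1>, ptr %p` on AVX512F without DQI, is rebuilt from a scalar
// byte load by the code above:
//   t0: i8,ch = load %p
//   t1: i16   = any_extend t0
//   t2: v16i1 = bitcast t1
//   t3: v8i1  = extract_subvector t2, 0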
25693
25694/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25695/// each of which has no other use apart from the AND / OR.
25696static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25697 Opc = Op.getOpcode();
25698 if (Opc != ISD::OR && Opc != ISD::AND)
25699 return false;
25700 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25701 Op.getOperand(0).hasOneUse() &&
25702 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25703 Op.getOperand(1).hasOneUse());
25704}
25705
25706SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25707 SDValue Chain = Op.getOperand(0);
25708 SDValue Cond = Op.getOperand(1);
25709 SDValue Dest = Op.getOperand(2);
25710 SDLoc dl(Op);
25711
25712 // Bail out when we don't have native compare instructions.
25713 if (Cond.getOpcode() == ISD::SETCC &&
25714 Cond.getOperand(0).getValueType() != MVT::f128 &&
25715 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
25716 SDValue LHS = Cond.getOperand(0);
25717 SDValue RHS = Cond.getOperand(1);
25718 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25719
25720 // Special case for
25721 // setcc([su]{add,sub,mul}o == 0)
25722 // setcc([su]{add,sub,mul}o != 1)
25723 if (ISD::isOverflowIntrOpRes(LHS) &&
25724 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25725 (isNullConstant(RHS) || isOneConstant(RHS))) {
25726 SDValue Value, Overflow;
25727 X86::CondCode X86Cond;
25728 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25729
25730 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25731 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25732
25733 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25734 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25735 Overflow, Op->getFlags());
25736 }
25737
25738 if (LHS.getSimpleValueType().isInteger()) {
25739 SDValue CCVal;
25740 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25741 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25742 EFLAGS, Op->getFlags());
25743 }
25744
25745 if (CC == ISD::SETOEQ) {
25746 // For FCMP_OEQ, we can emit
25747 // two branches instead of an explicit AND instruction with a
25748 // separate test. However, we only do this if this block doesn't
25749 // have a fall-through edge, because this requires an explicit
25750 // jmp when the condition is false.
25751 if (Op.getNode()->hasOneUse()) {
25752 SDNode *User = *Op.getNode()->user_begin();
25753 // Look for an unconditional branch following this conditional branch.
25754 // We need this because we need to reverse the successors in order
25755 // to implement FCMP_OEQ.
25756 if (User->getOpcode() == ISD::BR) {
25757 SDValue FalseBB = User->getOperand(1);
25758 SDNode *NewBR =
25759 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25760 assert(NewBR == User);
25761 (void)NewBR;
25762 Dest = FalseBB;
25763
25764 SDValue Cmp =
25765 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25766 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25767 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25768 CCVal, Cmp, Op->getFlags());
25769 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25770 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25771 Cmp, Op->getFlags());
25772 }
25773 }
25774 } else if (CC == ISD::SETUNE) {
25775 // For FCMP_UNE, we can emit
25776 // two branches instead of an explicit OR instruction with a
25777 // separate test.
25778 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25779 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25780 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25781 Cmp, Op->getFlags());
25782 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25783 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25784 Cmp, Op->getFlags());
25785 } else {
25786 X86::CondCode X86Cond =
25787 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25788 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25789 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25790 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25791 Cmp, Op->getFlags());
25792 }
25793 }
25794
25795 if (ISD::isOverflowIntrOpRes(Cond)) {
25796 SDValue Value, Overflow;
25797 X86::CondCode X86Cond;
25798 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25799
25800 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25801 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25802 Overflow, Op->getFlags());
25803 }
25804
25805 // Look past the truncate if the high bits are known zero.
25806 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25807 Cond = Cond.getOperand(0);
25808
25809 EVT CondVT = Cond.getValueType();
25810
25811 // Add an AND with 1 if we don't already have one.
25812 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25813 Cond =
25814 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25815
25816 SDValue LHS = Cond;
25817 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25818
25819 SDValue CCVal;
25820 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25821 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, EFLAGS,
25822 Op->getFlags());
25823}
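// Illustrative sketch, not part of the source: for `br (fcmp oeq %a, %b), %T, %F`
// followed by an unconditional branch to %F, the SETOEQ path above swaps the
// destination to %F and emits two branches off a single FCMP:
//   FCMP %a, %b
//   BRCOND COND_NE -> %F    // not equal
//   BRCOND COND_P  -> %F    // unordered (NaN operand)
// so control reaches %T only when the operands compare ordered and equal,
// avoiding an explicit AND of the two flag tests.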
25824
25825// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25826// Calls to _alloca are needed to probe the stack when allocating more than 4k
25827// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25828// that the guard pages used by the OS virtual memory manager are allocated in
25829// correct sequence.
25830SDValue
25831X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25832 SelectionDAG &DAG) const {
25833 MachineFunction &MF = DAG.getMachineFunction();
25834 bool SplitStack = MF.shouldSplitStack();
25835 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25836 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25837 SplitStack || EmitStackProbeCall;
25838 SDLoc dl(Op);
25839
25840 // Get the inputs.
25841 SDNode *Node = Op.getNode();
25842 SDValue Chain = Op.getOperand(0);
25843 SDValue Size = Op.getOperand(1);
25844 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25845 EVT VT = Node->getValueType(0);
25846
25847 // Chain the dynamic stack allocation so that it doesn't modify the stack
25848 // pointer when other instructions are using the stack.
25849 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25850
25851 bool Is64Bit = Subtarget.is64Bit();
25852 MVT SPTy = Op.getValueType().getSimpleVT();
25853
25854 SDValue Result;
25855 if (!Lower) {
25856 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25857 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25858 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25859 " not tell us which reg is the stack pointer!");
25860
25861 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25862 const Align StackAlign = TFI.getStackAlign();
25863 if (hasInlineStackProbe(MF)) {
25864 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, {SPTy, MVT::Other},
25865 {Chain, Size});
25866 Chain = Result.getValue(1);
25867 } else {
25868 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25869 Chain = SP.getValue(1);
25870 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25871 }
25872 if (Alignment && *Alignment > StackAlign)
25873 Result = DAG.getNode(
25874 ISD::AND, dl, VT, Result,
25875 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25876 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25877 } else if (SplitStack) {
25878 if (Is64Bit) {
25879 // The 64 bit implementation of segmented stacks needs to clobber both r10
25880 // and r11. This makes it impossible to use it along with nested parameters.
25881 const Function &F = MF.getFunction();
25882 for (const auto &A : F.args()) {
25883 if (A.hasNestAttr())
25884 report_fatal_error("Cannot use segmented stacks with functions that "
25885 "have nested arguments.");
25886 }
25887 }
25888
25889 Result =
25890 DAG.getNode(X86ISD::SEG_ALLOCA, dl, {SPTy, MVT::Other}, {Chain, Size});
25891 Chain = Result.getValue(1);
25892 } else {
25893 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25894 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25895 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25896
25897 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25898 Register SPReg = RegInfo->getStackRegister();
25899 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25900 Chain = SP.getValue(1);
25901
25902 if (Alignment) {
25903 SP = DAG.getNode(
25904 ISD::AND, dl, VT, SP.getValue(0),
25905 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25906 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25907 }
25908
25909 Result = SP;
25910 }
25911
25912 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
25913
25914 SDValue Ops[2] = {Result, Chain};
25915 return DAG.getMergeValues(Ops, dl);
25916}
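// Illustrative sketch, not part of the source: on a target that needs neither
// stack probing nor segmented stacks, an alloca of %n bytes with a requested
// 32-byte alignment follows the !Lower path above and becomes approximately
//   SP     = CopyFromReg %rsp
//   Result = sub SP, %n
//   Result = and Result, -32     // only because 32 exceeds the default stack align
//   CopyToReg %rsp, Result
// wrapped in CALLSEQ_START/CALLSEQ_END so nothing else touches the stack
// pointer in between.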
25917
25918SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25919 MachineFunction &MF = DAG.getMachineFunction();
25920 SDValue Ptr = Op.getOperand(1);
25921 EVT PtrVT = Ptr.getValueType();
25922
25923 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25924
25925 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25926 SDLoc DL(Op);
25927
25928 if (!Subtarget.is64Bit() ||
25929 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25930 // vastart just stores the address of the VarArgsFrameIndex slot into the
25931 // memory location argument.
25932 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25933 return DAG.getStore(Op.getOperand(0), DL, FR, Ptr, MachinePointerInfo(SV));
25934 }
25935
25936 // __va_list_tag:
25937 // gp_offset (0 - 6 * 8)
25938 // fp_offset (48 - 48 + 8 * 16)
25939 // overflow_arg_area (point to parameters coming in memory).
25940 // reg_save_area
25941 SmallVector<SDValue, 8> MemOps;
25942 SDValue FIN = Op.getOperand(1);
25943 // Store gp_offset
25944 SDValue Store = DAG.getStore(
25945 Op.getOperand(0), DL,
25946 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25947 MachinePointerInfo(SV));
25948 MemOps.push_back(Store);
25949
25950 // Store fp_offset
25951 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25952 Store = DAG.getStore(
25953 Op.getOperand(0), DL,
25954 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25955 MachinePointerInfo(SV, 4));
25956 MemOps.push_back(Store);
25957
25958 // Store ptr to overflow_arg_area
25959 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25960 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25961 Store =
25962 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25963 MemOps.push_back(Store);
25964
25965 // Store ptr to reg_save_area.
25966 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25967 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25968 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25969 Store = DAG.getStore(
25970 Op.getOperand(0), DL, RSFIN, FIN,
25971 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25972 MemOps.push_back(Store);
25973 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25974}
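// For reference (not part of the source), the SysV x86-64 va_list that the
// four stores above initialise is laid out as:
//   struct __va_list_tag {
//     unsigned int gp_offset;          // byte 0
//     unsigned int fp_offset;          // byte 4
//     void *overflow_arg_area;         // byte 8
//     void *reg_save_area;             // byte 16 (LP64) / 12 (X32)
//   };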
25975
25976SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25977 assert(Subtarget.is64Bit() &&
25978 "LowerVAARG only handles 64-bit va_arg!");
25979 assert(Op.getNumOperands() == 4);
25980
25981 MachineFunction &MF = DAG.getMachineFunction();
25982 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25983 // The Win64 ABI uses char* instead of a structure.
25984 return DAG.expandVAArg(Op.getNode());
25985
25986 SDValue Chain = Op.getOperand(0);
25987 SDValue SrcPtr = Op.getOperand(1);
25988 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25989 unsigned Align = Op.getConstantOperandVal(3);
25990 SDLoc dl(Op);
25991
25992 EVT ArgVT = Op.getNode()->getValueType(0);
25993 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25994 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25995 uint8_t ArgMode;
25996
25997 // Decide which area this value should be read from.
25998 // TODO: Implement the AMD64 ABI in its entirety. This simple
25999 // selection mechanism works only for the basic types.
26000 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
26001 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
26002 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
26003 } else {
26004 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
26005 "Unhandled argument type in LowerVAARG");
26006 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
26007 }
26008
26009 if (ArgMode == 2) {
26010 // Make sure using fp_offset makes sense.
26011 assert(!Subtarget.useSoftFloat() &&
26012 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
26013 Subtarget.hasSSE1());
26014 }
26015
26016 // Insert VAARG node into the DAG
26017 // VAARG returns two values: Variable Argument Address, Chain
26018 SDValue InstOps[] = {Chain, SrcPtr,
26019 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
26020 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
26021 DAG.getTargetConstant(Align, dl, MVT::i32)};
26022 SDVTList VTs = DAG.getVTList(SrcPtr.getValueType(), MVT::Other);
26023 SDValue VAARG = DAG.getMemIntrinsicNode(
26024 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
26025 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
26026 /*Alignment=*/std::nullopt,
26027 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
26028 Chain = VAARG.getValue(1);
26029
26030 // Load the next argument and return it
26031 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
26032}
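// Illustrative sketch, not part of the source: with the classification above,
// `va_arg(ap, double)` selects ArgMode = 2 (read via fp_offset from the XMM
// save area) and `va_arg(ap, long)` selects ArgMode = 1 (read via gp_offset),
// both funnelled through the X86ISD::VAARG_64 / VAARG_X32 pseudo and a final
// load of the returned argument address.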
26033
26034static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
26035 SelectionDAG &DAG) {
26036 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
26037 // where a va_list is still an i8*.
26038 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
26039 if (Subtarget.isCallingConvWin64(
26040 DAG.getMachineFunction().getFunction().getCallingConv()))
26041 // Probably a Win64 va_copy.
26042 return DAG.expandVACopy(Op.getNode());
26043
26044 SDValue Chain = Op.getOperand(0);
26045 SDValue DstPtr = Op.getOperand(1);
26046 SDValue SrcPtr = Op.getOperand(2);
26047 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
26048 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26049 SDLoc DL(Op);
26050
26051 return DAG.getMemcpy(
26052 Chain, DL, DstPtr, SrcPtr,
26053 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
26054 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
26055 /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV),
26056 MachinePointerInfo(SrcSV));
26057}
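// Illustrative sketch, not part of the source: on an LP64 target the lowering
// above turns `va_copy(dst, src)` into a single 24-byte memcpy of the
// { i32, i32, i8*, i8* } va_list struct with 8-byte alignment; on X32 it is a
// 16-byte copy with 4-byte alignment.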
26058
26059// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
26060static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
26061 switch (Opc) {
26062 case ISD::SHL:
26063 case X86ISD::VSHL:
26064 case X86ISD::VSHLI:
26065 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
26066 case ISD::SRL:
26067 case X86ISD::VSRL:
26068 case X86ISD::VSRLI:
26069 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
26070 case ISD::SRA:
26071 case X86ISD::VSRA:
26072 case X86ISD::VSRAI:
26073 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
26074 }
26075 llvm_unreachable("Unknown target vector shift node");
26076}
26077
26078/// Handle vector element shifts where the shift amount is a constant.
26079/// Takes immediate version of shift as input.
26080static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
26081 SDValue SrcOp, uint64_t ShiftAmt,
26082 SelectionDAG &DAG) {
26083 MVT ElementType = VT.getVectorElementType();
26084
26085 // Bitcast the source vector to the output type, this is mainly necessary for
26086 // vXi8/vXi64 shifts.
26087 if (VT != SrcOp.getSimpleValueType())
26088 SrcOp = DAG.getBitcast(VT, SrcOp);
26089
26090 // Fold this packed shift into its first operand if ShiftAmt is 0.
26091 if (ShiftAmt == 0)
26092 return SrcOp;
26093
26094 // Check for ShiftAmt >= element width
26095 if (ShiftAmt >= ElementType.getSizeInBits()) {
26096 if (Opc == X86ISD::VSRAI)
26097 ShiftAmt = ElementType.getSizeInBits() - 1;
26098 else
26099 return DAG.getConstant(0, dl, VT);
26100 }
26101
26103 && "Unknown target vector shift-by-constant node");
26104
26105 // Fold this packed vector shift into a build vector if SrcOp is a
26106 // vector of Constants or UNDEFs.
26107 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
26108 unsigned ShiftOpc;
26109 switch (Opc) {
26110 default: llvm_unreachable("Unknown opcode!");
26111 case X86ISD::VSHLI:
26112 ShiftOpc = ISD::SHL;
26113 break;
26114 case X86ISD::VSRLI:
26115 ShiftOpc = ISD::SRL;
26116 break;
26117 case X86ISD::VSRAI:
26118 ShiftOpc = ISD::SRA;
26119 break;
26120 }
26121
26122 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
26123 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
26124 return C;
26125 }
26126
26127 return DAG.getNode(Opc, dl, VT, SrcOp,
26128 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
26129}
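// Illustrative sketch, not part of the source: the out-of-range handling above
// means, for a v4i32 source X,
//   getTargetVShiftByConstNode(X86ISD::VSRLI, dl, MVT::v4i32, X, 35, DAG)
//     -> constant zero vector (logical shift past the element width), while
//   getTargetVShiftByConstNode(X86ISD::VSRAI, dl, MVT::v4i32, X, 35, DAG)
//     -> VSRAI X, 31 (the arithmetic shift clamps to replicate the sign bit).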
26130
26131/// Handle vector element shifts by a splat shift amount
26132static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
26133 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
26134 const X86Subtarget &Subtarget,
26135 SelectionDAG &DAG) {
26136 MVT AmtVT = ShAmt.getSimpleValueType();
26137 assert(AmtVT.isVector() && "Vector shift type mismatch");
26138 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
26139 "Illegal vector splat index");
26140
26141 // Move the splat element to the bottom element.
26142 if (ShAmtIdx != 0) {
26143 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
26144 Mask[0] = ShAmtIdx;
26145 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
26146 }
26147
26148 // Peek through any zext node if we can get back to a 128-bit source.
26149 if (AmtVT.getScalarSizeInBits() == 64 &&
26150 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
26151 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
26152 ShAmt.getOperand(0).getValueType().isSimple() &&
26153 ShAmt.getOperand(0).getValueType().is128BitVector()) {
26154 ShAmt = ShAmt.getOperand(0);
26155 AmtVT = ShAmt.getSimpleValueType();
26156 }
26157
26158 // See if we can mask off the upper elements using the existing source node.
26159 // The shift uses the entire lower 64-bits of the amount vector, so no need to
26160 // do this for vXi64 types.
26161 bool IsMasked = false;
26162 if (AmtVT.getScalarSizeInBits() < 64) {
26163 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
26164 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
26165 // If the shift amount has come from a scalar, then zero-extend the scalar
26166 // before moving to the vector.
26167 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
26168 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26169 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
26170 AmtVT = MVT::v4i32;
26171 IsMasked = true;
26172 } else if (ShAmt.getOpcode() == ISD::AND) {
26173 // See if the shift amount is already masked (e.g. for rotation modulo),
26174 // then we can zero-extend it by setting all the other mask elements to
26175 // zero.
26176 SmallVector<SDValue> MaskElts(
26177 AmtVT.getVectorNumElements(),
26178 DAG.getConstant(0, dl, AmtVT.getScalarType()));
26179 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
26180 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
26181 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
26182 {ShAmt.getOperand(1), Mask}))) {
26183 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
26184 IsMasked = true;
26185 }
26186 }
26187 }
26188
26189 // Extract if the shift amount vector is larger than 128-bits.
26190 if (AmtVT.getSizeInBits() > 128) {
26191 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
26192 AmtVT = ShAmt.getSimpleValueType();
26193 }
26194
26195 // Zero-extend bottom element to v2i64 vector type, either by extension or
26196 // shuffle masking.
26197 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
26198 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
26199 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
26200 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
26201 } else if (Subtarget.hasSSE41()) {
26202 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
26203 MVT::v2i64, ShAmt);
26204 } else {
26205 SDValue ByteShift = DAG.getTargetConstant(
26206 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
26207 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
26208 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26209 ByteShift);
26210 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26211 ByteShift);
26212 }
26213 }
26214
26215 // Change opcode to non-immediate version.
26216 Opc = getTargetVShiftUniformOpcode(Opc, true);
26217
26218 // The return type has to be a 128-bit type with the same element
26219 // type as the input type.
26220 MVT EltVT = VT.getVectorElementType();
26221 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
26222
26223 ShAmt = DAG.getBitcast(ShVT, ShAmt);
26224 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
26225}
26226
26227/// Return Mask with the necessary casting or extending
26228/// for \p Mask according to \p MaskVT when lowering masking intrinsics
26229static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
26230 const X86Subtarget &Subtarget, SelectionDAG &DAG,
26231 const SDLoc &dl) {
26232
26233 if (isAllOnesConstant(Mask))
26234 return DAG.getConstant(1, dl, MaskVT);
26235 if (X86::isZeroNode(Mask))
26236 return DAG.getConstant(0, dl, MaskVT);
26237
26238 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
26239
26240 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
26241 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
26242 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
26243 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
26244 SDValue Lo, Hi;
26245 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
26246 Lo = DAG.getBitcast(MVT::v32i1, Lo);
26247 Hi = DAG.getBitcast(MVT::v32i1, Hi);
26248 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
26249 } else {
26250 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
26251 Mask.getSimpleValueType().getSizeInBits());
26252 // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements
26253 // are extracted by EXTRACT_SUBVECTOR.
26254 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
26255 DAG.getBitcast(BitcastVT, Mask),
26256 DAG.getVectorIdxConstant(0, dl));
26257 }
26258}
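// Illustrative sketch, not part of the source: a non-constant i8 mask feeding a
// v2i1-masked operation goes down the general path above as
//   t0: v8i1 = bitcast Mask
//   t1: v2i1 = extract_subvector t0, 0
// whereas an i64 mask on a 32-bit AVX512BW target is split into two i32 halves,
// bitcast to v32i1 each, and concatenated back into the v64i1 result.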
26259
26260/// Return (and \p Op, \p Mask) for compare instructions or
26261/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
26262/// necessary casting or extending for \p Mask when lowering masking intrinsics
26263static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
26264 SDValue PreservedSrc,
26265 const X86Subtarget &Subtarget,
26266 SelectionDAG &DAG) {
26267 MVT VT = Op.getSimpleValueType();
26268 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
26269 unsigned OpcodeSelect = ISD::VSELECT;
26270 SDLoc dl(Op);
26271
26272 if (isAllOnesConstant(Mask))
26273 return Op;
26274
26275 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26276
26277 if (PreservedSrc.isUndef())
26278 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26279 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
26280}
26281
26282/// Creates an SDNode for a predicated scalar operation.
26283/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
26284/// The mask is coming as MVT::i8 and it should be transformed
26285/// to MVT::v1i1 while lowering masking intrinsics.
26286/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
26287/// "X86select" instead of "vselect". We just can't create the "vselect" node
26288/// for a scalar instruction.
26289static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
26290 SDValue PreservedSrc,
26291 const X86Subtarget &Subtarget,
26292 SelectionDAG &DAG) {
26293 auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
26294 if (MaskConst && (MaskConst->getZExtValue() & 0x1))
26295 return Op;
26296
26297 MVT VT = Op.getSimpleValueType();
26298 SDLoc dl(Op);
26299
26300 assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
26301 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
26302 DAG.getBitcast(MVT::v8i1, Mask),
26303 DAG.getVectorIdxConstant(0, dl));
26304 if (Op.getOpcode() == X86ISD::FSETCCM ||
26305 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
26306 Op.getOpcode() == X86ISD::VFPCLASSS)
26307 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
26308
26309 if (PreservedSrc.isUndef())
26310 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26311
26312 if (MaskConst) {
26313 assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask");
26314 // Discard op and blend passthrough with scalar op src/dst.
26315 SmallVector<int> ShuffleMask(VT.getVectorNumElements());
26316 std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
26317 ShuffleMask[0] = VT.getVectorNumElements();
26318 return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc,
26319 ShuffleMask);
26320 }
26321
26322 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
26323}
26324
26325static int getSEHRegistrationNodeSize(const Function *Fn) {
26326 if (!Fn->hasPersonalityFn())
26327 report_fatal_error(
26328 "querying registration node size for function without personality");
26329 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
26330 // WinEHStatePass for the full struct definition.
26331 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
26332 case EHPersonality::MSVC_X86SEH: return 24;
26333 case EHPersonality::MSVC_CXX: return 16;
26334 default: break;
26335 }
26337 "can only recover FP for 32-bit MSVC EH personality functions");
26338}
26339
26340/// When the MSVC runtime transfers control to us, either to an outlined
26341/// function or when returning to a parent frame after catching an exception, we
26342/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
26343/// Here's the math:
26344/// RegNodeBase = EntryEBP - RegNodeSize
26345/// ParentFP = RegNodeBase - ParentFrameOffset
26346/// Subtracting RegNodeSize takes us to the offset of the registration node, and
26347/// subtracting the offset (negative on x86) takes us back to the parent FP.
26348static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
26349 SDValue EntryEBP) {
26350 MachineFunction &MF = DAG.getMachineFunction();
26351 SDLoc dl;
26352
26353 // It's possible that the parent function no longer has a personality function
26354 // if the exceptional code was optimized away, in which case we just return
26355 // the incoming EBP.
26356 if (!Fn->hasPersonalityFn())
26357 return EntryEBP;
26358
26359 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
26360 // registration, or the .set_setframe offset.
26361 MCSymbol *OffsetSym = MF.getContext().getOrCreateParentFrameOffsetSymbol(
26362 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26363 MVT PtrVT = EntryEBP.getValueType().getSimpleVT();
26364 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
26365 SDValue ParentFrameOffset =
26366 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
26367
26368 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
26369 // prologue to RBP in the parent function.
26370 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
26371 if (Subtarget.is64Bit())
26372 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
26373
26374 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
26375 // RegNodeBase = EntryEBP - RegNodeSize
26376 // ParentFP = RegNodeBase - ParentFrameOffset
26377 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
26378 DAG.getConstant(RegNodeSize, dl, PtrVT));
26379 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
26380}
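// Worked example, not part of the source: for a 32-bit SEH personality
// (RegNodeSize = 24) and a hypothetical ParentFrameOffset of -64 recorded by
// WinEHStatePass, the arithmetic above yields
//   RegNodeBase = EntryEBP - 24
//   ParentFP    = RegNodeBase - (-64) = EntryEBP + 40
// so the parent frame pointer is recovered from the incoming EBP alone.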
26381
26382SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26383 SelectionDAG &DAG) const {
26384 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26385 auto isRoundModeCurDirection = [](SDValue Rnd) {
26386 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26387 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26388
26389 return false;
26390 };
26391 auto isRoundModeSAE = [](SDValue Rnd) {
26392 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26393 unsigned RC = C->getZExtValue();
26394 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26395 // Clear the NO_EXC bit and check remaining bits.
26396 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26397 // As a convenience we allow no other bits or explicitly
26398 // current direction.
26399 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26400 }
26401 }
26402
26403 return false;
26404 };
26405 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26406 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26407 RC = C->getZExtValue();
26408 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26409 // Clear the NO_EXC bit and check remaining bits.
26410 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26411 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
26412 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
26413 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
26414 RC == X86::STATIC_ROUNDING::TO_ZERO;
26415 }
26416 }
26417
26418 return false;
26419 };
26420
26421 SDLoc dl(Op);
26422 unsigned IntNo = Op.getConstantOperandVal(0);
26423 MVT VT = Op.getSimpleValueType();
26424 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26425
26426 // Propagate flags from original node to transformed node(s).
26427 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26428
26429 if (IntrData) {
26430 switch(IntrData->Type) {
26431 case INTR_TYPE_1OP: {
26432 // We specify 2 possible opcodes for intrinsics with rounding modes.
26433 // First, we check if the intrinsic may have non-default rounding mode,
26434 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26435 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26436 if (IntrWithRoundingModeOpcode != 0) {
26437 SDValue Rnd = Op.getOperand(2);
26438 unsigned RC = 0;
26439 if (isRoundModeSAEToX(Rnd, RC))
26440 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26441 Op.getOperand(1),
26442 DAG.getTargetConstant(RC, dl, MVT::i32));
26443 if (!isRoundModeCurDirection(Rnd))
26444 return SDValue();
26445 }
26446 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26447 Op.getOperand(1));
26448 }
26449 case INTR_TYPE_1OP_SAE: {
26450 SDValue Sae = Op.getOperand(2);
26451
26452 unsigned Opc;
26453 if (isRoundModeCurDirection(Sae))
26454 Opc = IntrData->Opc0;
26455 else if (isRoundModeSAE(Sae))
26456 Opc = IntrData->Opc1;
26457 else
26458 return SDValue();
26459
26460 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26461 }
26462 case INTR_TYPE_2OP: {
26463 SDValue Src2 = Op.getOperand(2);
26464
26465 // We specify 2 possible opcodes for intrinsics with rounding modes.
26466 // First, we check if the intrinsic may have non-default rounding mode,
26467 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26468 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26469 if (IntrWithRoundingModeOpcode != 0) {
26470 SDValue Rnd = Op.getOperand(3);
26471 unsigned RC = 0;
26472 if (isRoundModeSAEToX(Rnd, RC))
26473 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26474 Op.getOperand(1), Src2,
26475 DAG.getTargetConstant(RC, dl, MVT::i32));
26476 if (!isRoundModeCurDirection(Rnd))
26477 return SDValue();
26478 }
26479
26480 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26481 Op.getOperand(1), Src2);
26482 }
26483 case INTR_TYPE_2OP_SAE: {
26484 SDValue Sae = Op.getOperand(3);
26485
26486 unsigned Opc;
26487 if (isRoundModeCurDirection(Sae))
26488 Opc = IntrData->Opc0;
26489 else if (isRoundModeSAE(Sae))
26490 Opc = IntrData->Opc1;
26491 else
26492 return SDValue();
26493
26494 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26495 Op.getOperand(2));
26496 }
26497 case INTR_TYPE_3OP:
26498 case INTR_TYPE_3OP_IMM8: {
26499 SDValue Src1 = Op.getOperand(1);
26500 SDValue Src2 = Op.getOperand(2);
26501 SDValue Src3 = Op.getOperand(3);
26502
26503 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26504 Src3.getValueType() != MVT::i8) {
26505 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
26506 }
26507
26508 // We specify 2 possible opcodes for intrinsics with rounding modes.
26509 // First, we check if the intrinsic may have non-default rounding mode,
26510 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26511 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26512 if (IntrWithRoundingModeOpcode != 0) {
26513 SDValue Rnd = Op.getOperand(4);
26514 unsigned RC = 0;
26515 if (isRoundModeSAEToX(Rnd, RC))
26516 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26517 Src1, Src2, Src3,
26518 DAG.getTargetConstant(RC, dl, MVT::i32));
26519 if (!isRoundModeCurDirection(Rnd))
26520 return SDValue();
26521 }
26522
26523 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26524 {Src1, Src2, Src3});
26525 }
26526 case INTR_TYPE_4OP_IMM8: {
26527 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26528 SDValue Src4 = Op.getOperand(4);
26529 if (Src4.getValueType() != MVT::i8) {
26530 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
26531 }
26532
26533 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26534 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26535 Src4);
26536 }
26537 case INTR_TYPE_1OP_MASK: {
26538 SDValue Src = Op.getOperand(1);
26539 SDValue PassThru = Op.getOperand(2);
26540 SDValue Mask = Op.getOperand(3);
26541 // We add rounding mode to the Node when
26542 // - RC Opcode is specified and
26543 // - RC is not "current direction".
26544 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26545 if (IntrWithRoundingModeOpcode != 0) {
26546 SDValue Rnd = Op.getOperand(4);
26547 unsigned RC = 0;
26548 if (isRoundModeSAEToX(Rnd, RC))
26549 return getVectorMaskingNode(
26550 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26551 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26552 Mask, PassThru, Subtarget, DAG);
26553 if (!isRoundModeCurDirection(Rnd))
26554 return SDValue();
26555 }
26556 return getVectorMaskingNode(
26557 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26558 Subtarget, DAG);
26559 }
26560 case INTR_TYPE_1OP_MASK_SAE: {
26561 SDValue Src = Op.getOperand(1);
26562 SDValue PassThru = Op.getOperand(2);
26563 SDValue Mask = Op.getOperand(3);
26564 SDValue Rnd = Op.getOperand(4);
26565
26566 unsigned Opc;
26567 if (isRoundModeCurDirection(Rnd))
26568 Opc = IntrData->Opc0;
26569 else if (isRoundModeSAE(Rnd))
26570 Opc = IntrData->Opc1;
26571 else
26572 return SDValue();
26573
26574 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26575 Subtarget, DAG);
26576 }
26577 case INTR_TYPE_SCALAR_MASK: {
26578 SDValue Src1 = Op.getOperand(1);
26579 SDValue Src2 = Op.getOperand(2);
26580 SDValue passThru = Op.getOperand(3);
26581 SDValue Mask = Op.getOperand(4);
26582 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26583 // There are 2 kinds of intrinsics in this group:
26584 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
26585 // (2) With rounding mode and sae - 7 operands.
26586 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26587 if (Op.getNumOperands() == (5U + HasRounding)) {
26588 if (HasRounding) {
26589 SDValue Rnd = Op.getOperand(5);
26590 unsigned RC = 0;
26591 if (isRoundModeSAEToX(Rnd, RC))
26592 return getScalarMaskingNode(
26593 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26594 DAG.getTargetConstant(RC, dl, MVT::i32)),
26595 Mask, passThru, Subtarget, DAG);
26596 if (!isRoundModeCurDirection(Rnd))
26597 return SDValue();
26598 }
26599 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26600 Src2),
26601 Mask, passThru, Subtarget, DAG);
26602 }
26603
26604 assert(Op.getNumOperands() == (6U + HasRounding) &&
26605 "Unexpected intrinsic form");
26606 SDValue RoundingMode = Op.getOperand(5);
26607 unsigned Opc = IntrData->Opc0;
26608 if (HasRounding) {
26609 SDValue Sae = Op.getOperand(6);
26610 if (isRoundModeSAE(Sae))
26611 Opc = IntrWithRoundingModeOpcode;
26612 else if (!isRoundModeCurDirection(Sae))
26613 return SDValue();
26614 }
26615 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26616 Src2, RoundingMode),
26617 Mask, passThru, Subtarget, DAG);
26618 }
26619 case INTR_TYPE_SCALAR_MASK_RND: {
26620 SDValue Src1 = Op.getOperand(1);
26621 SDValue Src2 = Op.getOperand(2);
26622 SDValue passThru = Op.getOperand(3);
26623 SDValue Mask = Op.getOperand(4);
26624 SDValue Rnd = Op.getOperand(5);
26625
26626 SDValue NewOp;
26627 unsigned RC = 0;
26628 if (isRoundModeCurDirection(Rnd))
26629 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26630 else if (isRoundModeSAEToX(Rnd, RC))
26631 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26632 DAG.getTargetConstant(RC, dl, MVT::i32));
26633 else
26634 return SDValue();
26635
26636 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26637 }
26638 case INTR_TYPE_SCALAR_MASK_SAE: {
26639 SDValue Src1 = Op.getOperand(1);
26640 SDValue Src2 = Op.getOperand(2);
26641 SDValue passThru = Op.getOperand(3);
26642 SDValue Mask = Op.getOperand(4);
26643 SDValue Sae = Op.getOperand(5);
26644 unsigned Opc;
26645 if (isRoundModeCurDirection(Sae))
26646 Opc = IntrData->Opc0;
26647 else if (isRoundModeSAE(Sae))
26648 Opc = IntrData->Opc1;
26649 else
26650 return SDValue();
26651
26652 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26653 Mask, passThru, Subtarget, DAG);
26654 }
26655 case INTR_TYPE_2OP_MASK: {
26656 SDValue Src1 = Op.getOperand(1);
26657 SDValue Src2 = Op.getOperand(2);
26658 SDValue PassThru = Op.getOperand(3);
26659 SDValue Mask = Op.getOperand(4);
26660 SDValue NewOp;
26661 if (IntrData->Opc1 != 0) {
26662 SDValue Rnd = Op.getOperand(5);
26663 unsigned RC = 0;
26664 if (isRoundModeSAEToX(Rnd, RC))
26665 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26666 DAG.getTargetConstant(RC, dl, MVT::i32));
26667 else if (!isRoundModeCurDirection(Rnd))
26668 return SDValue();
26669 }
26670 if (!NewOp)
26671 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26672 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26673 }
26674 case INTR_TYPE_2OP_MASK_SAE: {
26675 SDValue Src1 = Op.getOperand(1);
26676 SDValue Src2 = Op.getOperand(2);
26677 SDValue PassThru = Op.getOperand(3);
26678 SDValue Mask = Op.getOperand(4);
26679
26680 unsigned Opc = IntrData->Opc0;
26681 if (IntrData->Opc1 != 0) {
26682 SDValue Sae = Op.getOperand(5);
26683 if (isRoundModeSAE(Sae))
26684 Opc = IntrData->Opc1;
26685 else if (!isRoundModeCurDirection(Sae))
26686 return SDValue();
26687 }
26688
26689 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26690 Mask, PassThru, Subtarget, DAG);
26691 }
26692 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26693 SDValue Src1 = Op.getOperand(1);
26694 SDValue Src2 = Op.getOperand(2);
26695 SDValue Src3 = Op.getOperand(3);
26696 SDValue PassThru = Op.getOperand(4);
26697 SDValue Mask = Op.getOperand(5);
26698 SDValue Sae = Op.getOperand(6);
26699 unsigned Opc;
26700 if (isRoundModeCurDirection(Sae))
26701 Opc = IntrData->Opc0;
26702 else if (isRoundModeSAE(Sae))
26703 Opc = IntrData->Opc1;
26704 else
26705 return SDValue();
26706
26707 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26708 Mask, PassThru, Subtarget, DAG);
26709 }
26710 case INTR_TYPE_3OP_MASK_SAE: {
26711 SDValue Src1 = Op.getOperand(1);
26712 SDValue Src2 = Op.getOperand(2);
26713 SDValue Src3 = Op.getOperand(3);
26714 SDValue PassThru = Op.getOperand(4);
26715 SDValue Mask = Op.getOperand(5);
26716
26717 unsigned Opc = IntrData->Opc0;
26718 if (IntrData->Opc1 != 0) {
26719 SDValue Sae = Op.getOperand(6);
26720 if (isRoundModeSAE(Sae))
26721 Opc = IntrData->Opc1;
26722 else if (!isRoundModeCurDirection(Sae))
26723 return SDValue();
26724 }
26725 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26726 Mask, PassThru, Subtarget, DAG);
26727 }
26728 case BLENDV: {
26729 SDValue Src1 = Op.getOperand(1);
26730 SDValue Src2 = Op.getOperand(2);
26731 SDValue Src3 = Op.getOperand(3);
26732
26733 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26734 Src3 = DAG.getBitcast(MaskVT, Src3);
26735
26736 // Reverse the operands to match VSELECT order.
26737 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26738 }
26739 case VPERM_2OP : {
26740 SDValue Src1 = Op.getOperand(1);
26741 SDValue Src2 = Op.getOperand(2);
26742
26743 // Swap Src1 and Src2 in the node creation
26744 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26745 }
26746 case CFMA_OP_MASKZ:
26747 case CFMA_OP_MASK: {
26748 SDValue Src1 = Op.getOperand(1);
26749 SDValue Src2 = Op.getOperand(2);
26750 SDValue Src3 = Op.getOperand(3);
26751 SDValue Mask = Op.getOperand(4);
26752 MVT VT = Op.getSimpleValueType();
26753
26754 SDValue PassThru = Src3;
26755 if (IntrData->Type == CFMA_OP_MASKZ)
26756 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26757
26758 // We add rounding mode to the Node when
26759 // - RC Opcode is specified and
26760 // - RC is not "current direction".
26761 SDValue NewOp;
26762 if (IntrData->Opc1 != 0) {
26763 SDValue Rnd = Op.getOperand(5);
26764 unsigned RC = 0;
26765 if (isRoundModeSAEToX(Rnd, RC))
26766 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26767 DAG.getTargetConstant(RC, dl, MVT::i32));
26768 else if (!isRoundModeCurDirection(Rnd))
26769 return SDValue();
26770 }
26771 if (!NewOp)
26772 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26773 if (IntrData->Opc0 == X86ISD::VFMADDCSH ||
26774 IntrData->Opc0 == X86ISD::VFCMADDCSH)
26775 return getScalarMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26776 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26777 }
26778 case IFMA_OP:
26779 // NOTE: We need to swizzle the operands to pass the multiply operands
26780 // first.
26781 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26782 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26783 case FPCLASSS: {
26784 SDValue Src1 = Op.getOperand(1);
26785 SDValue Imm = Op.getOperand(2);
26786 SDValue Mask = Op.getOperand(3);
26787 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26788 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26789 Subtarget, DAG);
26790 // Need to fill with zeros to ensure the bitcast will produce zeroes
26791 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26792 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26793 DAG.getConstant(0, dl, MVT::v8i1), FPclassMask,
26794 DAG.getVectorIdxConstant(0, dl));
26795 return DAG.getBitcast(MVT::i8, Ins);
26796 }
26797
26798 case CMP_MASK_CC: {
26799 MVT MaskVT = Op.getSimpleValueType();
26800 SDValue CC = Op.getOperand(3);
26801 SDValue Mask = Op.getOperand(4);
26802 // We specify 2 possible opcodes for intrinsics with rounding modes.
26803 // First, we check if the intrinsic may have non-default rounding mode,
26804 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26805 if (IntrData->Opc1 != 0) {
26806 SDValue Sae = Op.getOperand(5);
26807 if (isRoundModeSAE(Sae))
26808 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26809 Op.getOperand(2), CC, Mask, Sae);
26810 if (!isRoundModeCurDirection(Sae))
26811 return SDValue();
26812 }
26813 //default rounding mode
26814 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26815 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26816 }
26817 case CMP_MASK_SCALAR_CC: {
26818 SDValue Src1 = Op.getOperand(1);
26819 SDValue Src2 = Op.getOperand(2);
26820 SDValue CC = Op.getOperand(3);
26821 SDValue Mask = Op.getOperand(4);
26822
26823 SDValue Cmp;
26824 if (IntrData->Opc1 != 0) {
26825 SDValue Sae = Op.getOperand(5);
26826 if (isRoundModeSAE(Sae))
26827 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26828 else if (!isRoundModeCurDirection(Sae))
26829 return SDValue();
26830 }
26831 //default rounding mode
26832 if (!Cmp.getNode())
26833 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26834
26835 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26836 Subtarget, DAG);
26837 // Need to fill with zeros to ensure the bitcast will produce zeroes
26838 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26839 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26840 DAG.getConstant(0, dl, MVT::v8i1), CmpMask,
26841 DAG.getVectorIdxConstant(0, dl));
26842 return DAG.getBitcast(MVT::i8, Ins);
26843 }
26844 case COMI: { // Comparison intrinsics
26845 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26846 SDValue LHS = Op.getOperand(1);
26847 SDValue RHS = Op.getOperand(2);
26848 // Some conditions require the operands to be swapped.
26849 if (CC == ISD::SETLT || CC == ISD::SETLE)
26850 std::swap(LHS, RHS);
26851
26852 // For AVX10.2, support EQ and NE.
26853 bool HasAVX10_2_COMX =
26854 Subtarget.hasAVX10_2() && (CC == ISD::SETEQ || CC == ISD::SETNE);
26855
26856 // AVX10.2 COMPARE supports only v2f64, v4f32 or v8f16.
26857 // For BF type we need to fall back.
26858 bool HasAVX10_2_COMX_Ty = (LHS.getSimpleValueType() != MVT::v8bf16);
26859
26860 auto ComiOpCode = IntrData->Opc0;
26861 auto isUnordered = (ComiOpCode == X86ISD::UCOMI);
26862
26863 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty)
26864 ComiOpCode = isUnordered ? X86ISD::UCOMX : X86ISD::COMX;
26865
26866 SDValue Comi = DAG.getNode(ComiOpCode, dl, MVT::i32, LHS, RHS);
26867
26868 SDValue SetCC;
26869 switch (CC) {
26870 case ISD::SETEQ: {
26871 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26872 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1
26873 break;
26874 // (ZF = 1 and PF = 0)
26875 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26876 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26877 break;
26878 }
26879 case ISD::SETNE: {
26880 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26881 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0
26882 break;
26883 // (ZF = 0 or PF = 1)
26884 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26885 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26886 break;
26887 }
26888 case ISD::SETGT: // (CF = 0 and ZF = 0)
26889 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26890 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26891 break;
26892 }
26893 case ISD::SETGE: // CF = 0
26894 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26895 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26896 break;
26897 default:
26898 llvm_unreachable("Unexpected illegal condition!");
26899 }
26900 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26901 }
26902 case COMI_RM: { // Comparison intrinsics with Sae
26903 SDValue LHS = Op.getOperand(1);
26904 SDValue RHS = Op.getOperand(2);
26905 unsigned CondVal = Op.getConstantOperandVal(3);
26906 SDValue Sae = Op.getOperand(4);
26907
26908 SDValue FCmp;
26909 if (isRoundModeCurDirection(Sae))
26910 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26911 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26912 else if (isRoundModeSAE(Sae))
26913 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26914 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26915 else
26916 return SDValue();
26917 // Need to fill with zeros to ensure the bitcast will produce zeroes
26918 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26919 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26920 DAG.getConstant(0, dl, MVT::v16i1), FCmp,
26921 DAG.getVectorIdxConstant(0, dl));
26922 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26923 DAG.getBitcast(MVT::i16, Ins));
26924 }
26925 case VSHIFT: {
26926 SDValue SrcOp = Op.getOperand(1);
26927 SDValue ShAmt = Op.getOperand(2);
26928 assert(ShAmt.getValueType() == MVT::i32 &&
26929 "Unexpected VSHIFT amount type");
26930
26931 // Catch shift-by-constant.
26932 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26933 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26934 Op.getSimpleValueType(), SrcOp,
26935 CShAmt->getZExtValue(), DAG);
26936
26937 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26938 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26939 SrcOp, ShAmt, 0, Subtarget, DAG);
26940 }
26941 case COMPRESS_EXPAND_IN_REG: {
26942 SDValue Mask = Op.getOperand(3);
26943 SDValue DataToCompress = Op.getOperand(1);
26944 SDValue PassThru = Op.getOperand(2);
26945 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26946 return Op.getOperand(1);
26947
26948 // Avoid false dependency.
26949 if (PassThru.isUndef())
26950 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26951
26952 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26953 Mask);
26954 }
26955 case FIXUPIMM:
26956 case FIXUPIMM_MASKZ: {
26957 SDValue Src1 = Op.getOperand(1);
26958 SDValue Src2 = Op.getOperand(2);
26959 SDValue Src3 = Op.getOperand(3);
26960 SDValue Imm = Op.getOperand(4);
26961 SDValue Mask = Op.getOperand(5);
26962 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26963 ? Src1
26964 : getZeroVector(VT, Subtarget, DAG, dl);
26965
26966 unsigned Opc = IntrData->Opc0;
26967 if (IntrData->Opc1 != 0) {
26968 SDValue Sae = Op.getOperand(6);
26969 if (isRoundModeSAE(Sae))
26970 Opc = IntrData->Opc1;
26971 else if (!isRoundModeCurDirection(Sae))
26972 return SDValue();
26973 }
26974
26975 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26976
26977 if (VT.isVector())
26978 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26979
26980 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26981 }
26982 case ROUNDP: {
26983 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26984 // Clear the upper bits of the rounding immediate so that the legacy
26985 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26986 uint64_t Round = Op.getConstantOperandVal(2);
26987 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26988 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26989 Op.getOperand(1), RoundingMode);
26990 }
26991 case ROUNDS: {
26992 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26993 // Clear the upper bits of the rounding immediate so that the legacy
26994 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26995 uint64_t Round = Op.getConstantOperandVal(3);
26996 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26997 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26998 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26999 }
27000 case BEXTRI: {
27001 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
27002
27003 uint64_t Imm = Op.getConstantOperandVal(2);
27004 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
27005 Op.getValueType());
27006 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27007 Op.getOperand(1), Control);
27008 }
27009 // ADC/SBB
27010 case ADX: {
27011 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
27012 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
27013
27014 SDValue Res;
27015 // If the carry in is zero, then we should just use ADD/SUB instead of
27016 // ADC/SBB.
27017 if (isNullConstant(Op.getOperand(1))) {
27018 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
27019 Op.getOperand(3));
27020 } else {
27021 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
27022 DAG.getAllOnesConstant(dl, MVT::i8));
27023 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
27024 Op.getOperand(3), GenCF.getValue(1));
27025 }
27026 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
27027 SDValue Results[] = { SetCC, Res };
27028 return DAG.getMergeValues(Results, dl);
27029 }
27030 case CVTPD2PS_MASK:
27031 case CVTPD2DQ_MASK:
27032 case CVTQQ2PS_MASK:
27033 case TRUNCATE_TO_REG: {
27034 SDValue Src = Op.getOperand(1);
27035 SDValue PassThru = Op.getOperand(2);
27036 SDValue Mask = Op.getOperand(3);
27037
27038 if (isAllOnesConstant(Mask))
27039 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27040
27041 MVT SrcVT = Src.getSimpleValueType();
27042 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27043 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27044 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27045 {Src, PassThru, Mask});
27046 }
27047 case TRUNCATE2_TO_REG: {
27048 SDValue Src = Op.getOperand(1);
27049 SDValue Src2 = Op.getOperand(2);
27050 SDValue PassThru = Op.getOperand(3);
27051 SDValue Mask = Op.getOperand(4);
27052
27053 if (isAllOnesConstant(Mask))
27054 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2});
27055
27056 MVT Src2VT = Src2.getSimpleValueType();
27057 MVT MaskVT = MVT::getVectorVT(MVT::i1, Src2VT.getVectorNumElements());
27058 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27059 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27060 {Src, Src2, PassThru, Mask});
27061 }
27062 case CVTPS2PH_MASK: {
27063 SDValue Src = Op.getOperand(1);
27064 SDValue Rnd = Op.getOperand(2);
27065 SDValue PassThru = Op.getOperand(3);
27066 SDValue Mask = Op.getOperand(4);
27067
27068 unsigned RC = 0;
27069 unsigned Opc = IntrData->Opc0;
27070 bool SAE = Src.getValueType().is512BitVector() &&
27071 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
27072 if (SAE) {
27073 Opc = X86ISD::CVTPS2PH_SAE;
27074 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
27075 }
27076
27077 if (isAllOnesConstant(Mask))
27078 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
27079
27080 if (SAE)
27081 Opc = X86ISD::MCVTPS2PH_SAE;
27082 else
27083 Opc = IntrData->Opc1;
27084 MVT SrcVT = Src.getSimpleValueType();
27085 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27086 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27087 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
27088 }
27089 case CVTNEPS2BF16_MASK: {
27090 SDValue Src = Op.getOperand(1);
27091 SDValue PassThru = Op.getOperand(2);
27092 SDValue Mask = Op.getOperand(3);
27093
27094 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
27095 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27096
27097 // Break false dependency.
27098 if (PassThru.isUndef())
27099 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
27100
27101 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
27102 Mask);
27103 }
27104 default:
27105 break;
27106 }
27107 }
27108
27109 switch (IntNo) {
27110 default: return SDValue(); // Don't custom lower most intrinsics.
27111
27112 // ptest and testp intrinsics. The intrinsic these come from are designed to
27113 // return an integer value, not just an instruction so lower it to the ptest
27114 // or testp pattern and a setcc for the result.
27115 case Intrinsic::x86_avx512_ktestc_b:
27116 case Intrinsic::x86_avx512_ktestc_w:
27117 case Intrinsic::x86_avx512_ktestc_d:
27118 case Intrinsic::x86_avx512_ktestc_q:
27119 case Intrinsic::x86_avx512_ktestz_b:
27120 case Intrinsic::x86_avx512_ktestz_w:
27121 case Intrinsic::x86_avx512_ktestz_d:
27122 case Intrinsic::x86_avx512_ktestz_q:
27123 case Intrinsic::x86_sse41_ptestz:
27124 case Intrinsic::x86_sse41_ptestc:
27125 case Intrinsic::x86_sse41_ptestnzc:
27126 case Intrinsic::x86_avx_ptestz_256:
27127 case Intrinsic::x86_avx_ptestc_256:
27128 case Intrinsic::x86_avx_ptestnzc_256:
27129 case Intrinsic::x86_avx_vtestz_ps:
27130 case Intrinsic::x86_avx_vtestc_ps:
27131 case Intrinsic::x86_avx_vtestnzc_ps:
27132 case Intrinsic::x86_avx_vtestz_pd:
27133 case Intrinsic::x86_avx_vtestc_pd:
27134 case Intrinsic::x86_avx_vtestnzc_pd:
27135 case Intrinsic::x86_avx_vtestz_ps_256:
27136 case Intrinsic::x86_avx_vtestc_ps_256:
27137 case Intrinsic::x86_avx_vtestnzc_ps_256:
27138 case Intrinsic::x86_avx_vtestz_pd_256:
27139 case Intrinsic::x86_avx_vtestc_pd_256:
27140 case Intrinsic::x86_avx_vtestnzc_pd_256: {
27141 unsigned TestOpc = X86ISD::PTEST;
27142 X86::CondCode X86CC;
27143 switch (IntNo) {
27144 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
27145 case Intrinsic::x86_avx512_ktestc_b:
27146 case Intrinsic::x86_avx512_ktestc_w:
27147 case Intrinsic::x86_avx512_ktestc_d:
27148 case Intrinsic::x86_avx512_ktestc_q:
27149 // CF = 1
27150 TestOpc = X86ISD::KTEST;
27151 X86CC = X86::COND_B;
27152 break;
27153 case Intrinsic::x86_avx512_ktestz_b:
27154 case Intrinsic::x86_avx512_ktestz_w:
27155 case Intrinsic::x86_avx512_ktestz_d:
27156 case Intrinsic::x86_avx512_ktestz_q:
27157 TestOpc = X86ISD::KTEST;
27158 X86CC = X86::COND_E;
27159 break;
27160 case Intrinsic::x86_avx_vtestz_ps:
27161 case Intrinsic::x86_avx_vtestz_pd:
27162 case Intrinsic::x86_avx_vtestz_ps_256:
27163 case Intrinsic::x86_avx_vtestz_pd_256:
27164 TestOpc = X86ISD::TESTP;
27165 [[fallthrough]];
27166 case Intrinsic::x86_sse41_ptestz:
27167 case Intrinsic::x86_avx_ptestz_256:
27168 // ZF = 1
27169 X86CC = X86::COND_E;
27170 break;
27171 case Intrinsic::x86_avx_vtestc_ps:
27172 case Intrinsic::x86_avx_vtestc_pd:
27173 case Intrinsic::x86_avx_vtestc_ps_256:
27174 case Intrinsic::x86_avx_vtestc_pd_256:
27175 TestOpc = X86ISD::TESTP;
27176 [[fallthrough]];
27177 case Intrinsic::x86_sse41_ptestc:
27178 case Intrinsic::x86_avx_ptestc_256:
27179 // CF = 1
27180 X86CC = X86::COND_B;
27181 break;
27182 case Intrinsic::x86_avx_vtestnzc_ps:
27183 case Intrinsic::x86_avx_vtestnzc_pd:
27184 case Intrinsic::x86_avx_vtestnzc_ps_256:
27185 case Intrinsic::x86_avx_vtestnzc_pd_256:
27186 TestOpc = X86ISD::TESTP;
27187 [[fallthrough]];
27188 case Intrinsic::x86_sse41_ptestnzc:
27189 case Intrinsic::x86_avx_ptestnzc_256:
27190 // ZF and CF = 0
27191 X86CC = X86::COND_A;
27192 break;
27193 }
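// For example, ptestz(a, b) becomes PTEST a, b followed by a SETE of ZF that
// is zero-extended to the i32 result; the vtest*_ps/pd variants use TESTP,
// which performs the same flag computation on the packed sign bits.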
27194
27195 SDValue LHS = Op.getOperand(1);
27196 SDValue RHS = Op.getOperand(2);
27197 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
27198 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
27199 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27200 }
27201
27202 case Intrinsic::x86_sse42_pcmpistria128:
27203 case Intrinsic::x86_sse42_pcmpestria128:
27204 case Intrinsic::x86_sse42_pcmpistric128:
27205 case Intrinsic::x86_sse42_pcmpestric128:
27206 case Intrinsic::x86_sse42_pcmpistrio128:
27207 case Intrinsic::x86_sse42_pcmpestrio128:
27208 case Intrinsic::x86_sse42_pcmpistris128:
27209 case Intrinsic::x86_sse42_pcmpestris128:
27210 case Intrinsic::x86_sse42_pcmpistriz128:
27211 case Intrinsic::x86_sse42_pcmpestriz128: {
27212 unsigned Opcode;
27213 X86::CondCode X86CC;
27214 switch (IntNo) {
27215 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27216 case Intrinsic::x86_sse42_pcmpistria128:
27217 Opcode = X86ISD::PCMPISTR;
27218 X86CC = X86::COND_A;
27219 break;
27220 case Intrinsic::x86_sse42_pcmpestria128:
27221 Opcode = X86ISD::PCMPESTR;
27222 X86CC = X86::COND_A;
27223 break;
27224 case Intrinsic::x86_sse42_pcmpistric128:
27225 Opcode = X86ISD::PCMPISTR;
27226 X86CC = X86::COND_B;
27227 break;
27228 case Intrinsic::x86_sse42_pcmpestric128:
27229 Opcode = X86ISD::PCMPESTR;
27230 X86CC = X86::COND_B;
27231 break;
27232 case Intrinsic::x86_sse42_pcmpistrio128:
27233 Opcode = X86ISD::PCMPISTR;
27234 X86CC = X86::COND_O;
27235 break;
27236 case Intrinsic::x86_sse42_pcmpestrio128:
27237 Opcode = X86ISD::PCMPESTR;
27238 X86CC = X86::COND_O;
27239 break;
27240 case Intrinsic::x86_sse42_pcmpistris128:
27241 Opcode = X86ISD::PCMPISTR;
27242 X86CC = X86::COND_S;
27243 break;
27244 case Intrinsic::x86_sse42_pcmpestris128:
27245 Opcode = X86ISD::PCMPESTR;
27246 X86CC = X86::COND_S;
27247 break;
27248 case Intrinsic::x86_sse42_pcmpistriz128:
27249 Opcode = X86ISD::PCMPISTR;
27250 X86CC = X86::COND_E;
27251 break;
27252 case Intrinsic::x86_sse42_pcmpestriz128:
27253 Opcode = X86ISD::PCMPESTR;
27254 X86CC = X86::COND_E;
27255 break;
27256 }
27257 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27258 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27259 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
27260 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
27261 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27262 }
27263
27264 case Intrinsic::x86_sse42_pcmpistri128:
27265 case Intrinsic::x86_sse42_pcmpestri128: {
27266 unsigned Opcode;
27267 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
27268 Opcode = X86ISD::PCMPISTR;
27269 else
27270 Opcode = X86ISD::PCMPESTR;
27271
27272 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27273 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27274 return DAG.getNode(Opcode, dl, VTs, NewOps);
27275 }
27276
27277 case Intrinsic::x86_sse42_pcmpistrm128:
27278 case Intrinsic::x86_sse42_pcmpestrm128: {
27279 unsigned Opcode;
27280 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
27281 Opcode = X86ISD::PCMPISTR;
27282 else
27283 Opcode = X86ISD::PCMPESTR;
27284
27285 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27286 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27287 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
27288 }
27289
27290 case Intrinsic::eh_sjlj_lsda: {
27291 MachineFunction &MF = DAG.getMachineFunction();
27292 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27293 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27294 auto &Context = MF.getContext();
27295 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
27296 Twine(MF.getFunctionNumber()));
27297 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
27298 DAG.getMCSymbol(S, PtrVT));
27299 }
27300
27301 case Intrinsic::x86_seh_lsda: {
27302 // Compute the symbol for the LSDA. We know it'll get emitted later.
27303 MachineFunction &MF = DAG.getMachineFunction();
27304 SDValue Op1 = Op.getOperand(1);
27305 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
27308
27309 // Generate a simple absolute symbol reference. This intrinsic is only
27310 // supported on 32-bit Windows, which isn't PIC.
27311 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
27312 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
27313 }
27314
27315 case Intrinsic::eh_recoverfp: {
27316 SDValue FnOp = Op.getOperand(1);
27317 SDValue IncomingFPOp = Op.getOperand(2);
27318 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
27319 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
27320 if (!Fn)
27322 "llvm.eh.recoverfp must take a function as the first argument");
27323 return recoverFramePointer(DAG, Fn, IncomingFPOp);
27324 }
27325
27326 case Intrinsic::localaddress: {
27327 // Returns one of the stack, base, or frame pointer registers, depending on
27328 // which is used to reference local variables.
27329 MachineFunction &MF = DAG.getMachineFunction();
27330 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27331 Register Reg;
27332 if (RegInfo->hasBasePointer(MF))
27333 Reg = RegInfo->getBaseRegister();
27334 else { // Handles the SP or FP case.
27335 bool CantUseFP = RegInfo->hasStackRealignment(MF);
27336 if (CantUseFP)
27337 Reg = RegInfo->getPtrSizedStackRegister(MF);
27338 else
27339 Reg = RegInfo->getPtrSizedFrameRegister(MF);
27340 }
27341 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
27342 }
27343 case Intrinsic::x86_avx512_vp2intersect_q_512:
27344 case Intrinsic::x86_avx512_vp2intersect_q_256:
27345 case Intrinsic::x86_avx512_vp2intersect_q_128:
27346 case Intrinsic::x86_avx512_vp2intersect_d_512:
27347 case Intrinsic::x86_avx512_vp2intersect_d_256:
27348 case Intrinsic::x86_avx512_vp2intersect_d_128: {
27349 SDLoc DL(Op);
27350 MVT MaskVT = Op.getSimpleValueType();
27351 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27352 SDValue Operation = DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
27353 Op.getOperand(1), Op.getOperand(2));
27354 SDValue Result0 =
27355 DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, MaskVT, Operation);
27356 SDValue Result1 =
27357 DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, MaskVT, Operation);
27358 return DAG.getMergeValues({Result0, Result1}, DL);
27359 }
27360 case Intrinsic::x86_mmx_pslli_w:
27361 case Intrinsic::x86_mmx_pslli_d:
27362 case Intrinsic::x86_mmx_pslli_q:
27363 case Intrinsic::x86_mmx_psrli_w:
27364 case Intrinsic::x86_mmx_psrli_d:
27365 case Intrinsic::x86_mmx_psrli_q:
27366 case Intrinsic::x86_mmx_psrai_w:
27367 case Intrinsic::x86_mmx_psrai_d: {
27368 SDLoc DL(Op);
27369 SDValue ShAmt = Op.getOperand(2);
27370 // If the argument is a constant, convert it to a target constant.
27371 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27372 // Clamp out-of-bounds shift amounts since they will otherwise be masked
27373 // to 8 bits, which may make them no longer out of bounds.
27374 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
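// For example, an amount of 256 would be truncated to 0 by the 8-bit
// immediate encoding, silently turning an out-of-range shift into a no-op;
// clamping to 255 keeps it out of range so the expected result is produced.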
27375 if (ShiftAmount == 0)
27376 return Op.getOperand(1);
27377
27378 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27379 Op.getOperand(0), Op.getOperand(1),
27380 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27381 }
27382
27383 unsigned NewIntrinsic;
27384 switch (IntNo) {
27385 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27386 case Intrinsic::x86_mmx_pslli_w:
27387 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27388 break;
27389 case Intrinsic::x86_mmx_pslli_d:
27390 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27391 break;
27392 case Intrinsic::x86_mmx_pslli_q:
27393 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27394 break;
27395 case Intrinsic::x86_mmx_psrli_w:
27396 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27397 break;
27398 case Intrinsic::x86_mmx_psrli_d:
27399 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27400 break;
27401 case Intrinsic::x86_mmx_psrli_q:
27402 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27403 break;
27404 case Intrinsic::x86_mmx_psrai_w:
27405 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27406 break;
27407 case Intrinsic::x86_mmx_psrai_d:
27408 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27409 break;
27410 }
27411
27412 // The vector shift intrinsics with scalars use 32-bit shift amounts, but
27413 // the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
27414 // MMX register.
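// For example, x86_mmx_psrli_w with a variable amount becomes x86_mmx_psrl_w
// with the amount moved into an MMX register via MMX_MOVW2D (a movd).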
27415 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27416 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27417 DAG.getTargetConstant(NewIntrinsic, DL,
27418 getPointerTy(DAG.getDataLayout())),
27419 Op.getOperand(1), ShAmt);
27420 }
27421 case Intrinsic::thread_pointer: {
27422 if (Subtarget.isTargetELF()) {
27423 SDLoc dl(Op);
27424 EVT PtrVT = Op.getValueType();
27425 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27427 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27428 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27429 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27430 }
27432 "Target OS doesn't support __builtin_thread_pointer() yet.");
27433 }
27434 }
27435}
27436
27438 SDValue Src, SDValue Mask, SDValue Base,
27439 SDValue Index, SDValue ScaleOp, SDValue Chain,
27440 const X86Subtarget &Subtarget) {
27441 SDLoc dl(Op);
27442 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27443 // Scale must be constant.
27444 if (!C)
27445 return SDValue();
27446 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27447 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27448 TLI.getPointerTy(DAG.getDataLayout()));
27449 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27450 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27451 // If source is undef or we know it won't be used, use a zero vector
27452 // to break register dependency.
27453 // TODO: use undef instead and let BreakFalseDeps deal with it?
27454 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27455 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27456
27457 // Cast mask to an integer type.
27458 Mask = DAG.getBitcast(MaskVT, Mask);
27459
27461
27462 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27463 SDValue Res =
27465 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27466 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27467}
27468
27470 SDValue Src, SDValue Mask, SDValue Base,
27471 SDValue Index, SDValue ScaleOp, SDValue Chain,
27472 const X86Subtarget &Subtarget) {
27473 MVT VT = Op.getSimpleValueType();
27474 SDLoc dl(Op);
27475 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27476 // Scale must be constant.
27477 if (!C)
27478 return SDValue();
27479 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27480 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27481 TLI.getPointerTy(DAG.getDataLayout()));
27482 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27483 Src.getSimpleValueType().getVectorNumElements());
27484 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27485
27486 // We support two versions of the gather intrinsics: one with a scalar mask
27487 // and one with a vXi1 mask. Convert scalar to vXi1 if necessary.
27488 if (Mask.getValueType() != MaskVT)
27489 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27490
27491 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27492 // If source is undef or we know it won't be used, use a zero vector
27493 // to break register dependency.
27494 // TODO: use undef instead and let BreakFalseDeps deal with it?
27495 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27496 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27497
27499
27500 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27501 SDValue Res =
27503 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27504 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27505}
27506
27508 SDValue Src, SDValue Mask, SDValue Base,
27509 SDValue Index, SDValue ScaleOp, SDValue Chain,
27510 const X86Subtarget &Subtarget) {
27511 SDLoc dl(Op);
27512 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27513 // Scale must be constant.
27514 if (!C)
27515 return SDValue();
27516 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27517 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27518 TLI.getPointerTy(DAG.getDataLayout()));
27519 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27520 Src.getSimpleValueType().getVectorNumElements());
27521 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27522
27523 // We support two versions of the scatter intrinsics: one with a scalar mask
27524 // and one with a vXi1 mask. Convert scalar to vXi1 if necessary.
27525 if (Mask.getValueType() != MaskVT)
27526 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27527
27529
27530 SDVTList VTs = DAG.getVTList(MVT::Other);
27531 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27532 SDValue Res =
27534 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27535 return Res;
27536}
27537
27539 SDValue Mask, SDValue Base, SDValue Index,
27540 SDValue ScaleOp, SDValue Chain,
27541 const X86Subtarget &Subtarget) {
27542 SDLoc dl(Op);
27543 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27544 // Scale must be constant.
27545 if (!C)
27546 return SDValue();
27547 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27548 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27549 TLI.getPointerTy(DAG.getDataLayout()));
27550 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27551 SDValue Segment = DAG.getRegister(0, MVT::i32);
27552 MVT MaskVT =
27553 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27554 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27555 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27556 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27557 return SDValue(Res, 0);
27558}
27559
27560/// Handles the lowering of builtin intrinsics with chain that return their
27561/// value in registers EDX:EAX.
27562/// If operand SrcReg is a valid register identifier, then operand 2 of N is
27563/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
27564/// TargetOpcode.
27565/// Returns a Glue value which can be used to add an extra copy-from-reg if the
27566/// expanded intrinsic implicitly defines extra registers (i.e. not just
27567/// EDX:EAX).
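/// For example, for XGETBV on a 64-bit target the helper copies RAX and RDX
/// and returns (HI << 32) | LO; on 32-bit targets the EAX/EDX halves are
/// combined with a BUILD_PAIR instead (see the body below).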
27568static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
27569 SelectionDAG &DAG,
27570 unsigned TargetOpcode,
27571 unsigned SrcReg,
27572 const X86Subtarget &Subtarget,
27573 SmallVectorImpl<SDValue> &Results) {
27574 SDValue Chain = N->getOperand(0);
27575 SDValue Glue;
27576
27577 if (SrcReg) {
27578 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27579 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27580 Glue = Chain.getValue(1);
27581 }
27582
27583 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27584 SDValue N1Ops[] = {Chain, Glue};
27585 SDNode *N1 = DAG.getMachineNode(
27586 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27587 Chain = SDValue(N1, 0);
27588
27589 // Read the result that the expanded instruction leaves in registers EDX:EAX.
27590 SDValue LO, HI;
27591 if (Subtarget.is64Bit()) {
27592 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27593 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27594 LO.getValue(2));
27595 } else {
27596 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27597 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27598 LO.getValue(2));
27599 }
27600 Chain = HI.getValue(1);
27601 Glue = HI.getValue(2);
27602
27603 if (Subtarget.is64Bit()) {
27604 // Merge the two 32-bit values into a 64-bit one.
27605 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27606 DAG.getConstant(32, DL, MVT::i8));
27607 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27608 Results.push_back(Chain);
27609 return Glue;
27610 }
27611
27612 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27613 SDValue Ops[] = { LO, HI };
27614 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27615 Results.push_back(Pair);
27616 Results.push_back(Chain);
27617 return Glue;
27618}
27619
27620/// Handles the lowering of builtin intrinsics that read the time stamp counter
27621/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
27622/// READCYCLECOUNTER nodes.
27623static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
27624 SelectionDAG &DAG,
27625 const X86Subtarget &Subtarget,
27626 SmallVectorImpl<SDValue> &Results) {
27627 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27628 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27629 // and the EAX register is loaded with the low-order 32 bits.
27630 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
27631 /* NoRegister */0, Subtarget,
27632 Results);
27633 if (Opcode != X86::RDTSCP)
27634 return;
27635
27636 SDValue Chain = Results[1];
27637 // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
27638 // the ECX register. Add 'ecx' explicitly to the chain.
27639 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
27640 Results[1] = ecx;
27641 Results.push_back(ecx.getValue(1));
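// Results now holds {TSC value, IA32_TSC_AUX value (from ECX), chain}.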
27642}
27643
27645 SelectionDAG &DAG) {
27647 SDLoc DL(Op);
27648 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27649 Results);
27650 return DAG.getMergeValues(Results, DL);
27651}
27652
27655 SDValue Chain = Op.getOperand(0);
27656 SDValue RegNode = Op.getOperand(2);
27657 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27658 if (!EHInfo)
27659 report_fatal_error("EH registrations only live in functions using WinEH");
27660
27661 // Cast the operand to an alloca, and remember the frame index.
27662 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27663 if (!FINode)
27664 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27665 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27666
27667 // Return the chain operand without making any DAG nodes.
27668 return Chain;
27669}
27670
27673 SDValue Chain = Op.getOperand(0);
27674 SDValue EHGuard = Op.getOperand(2);
27675 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27676 if (!EHInfo)
27677 report_fatal_error("EHGuard only live in functions using WinEH");
27678
27679 // Cast the operand to an alloca, and remember the frame index.
27680 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27681 if (!FINode)
27682 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27683 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27684
27685 // Return the chain operand without making any DAG nodes.
27686 return Chain;
27687}
27688
27689/// Emit Truncating Store with signed or unsigned saturation.
27690static SDValue
27691EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
27692 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27693 SelectionDAG &DAG) {
27694 SDVTList VTs = DAG.getVTList(MVT::Other);
27695 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27696 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27697 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27698 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27699}
27700
27701/// Emit Masked Truncating Store with signed or unsigned saturation.
27702static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
27703 const SDLoc &DL,
27704 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27705 MachineMemOperand *MMO, SelectionDAG &DAG) {
27706 SDVTList VTs = DAG.getVTList(MVT::Other);
27707 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27708 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27709 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27710}
27711
27713 const MachineFunction &MF) {
27714 if (!Subtarget.is64Bit())
27715 return false;
27716 // 64-bit targets support extended Swift async frame setup,
27717 // except for targets that use the Windows 64 prologue.
27718 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
27719}
27720
27722 SelectionDAG &DAG) {
27723 unsigned IntNo = Op.getConstantOperandVal(1);
27724 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27725 if (!IntrData) {
27726 switch (IntNo) {
27727
27728 case Intrinsic::swift_async_context_addr: {
27729 SDLoc dl(Op);
27730 auto &MF = DAG.getMachineFunction();
27731 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
27732 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
27734 X86FI->setHasSwiftAsyncContext(true);
27735 SDValue Chain = Op->getOperand(0);
27736 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
27737 SDValue Result =
27738 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
27739 DAG.getTargetConstant(8, dl, MVT::i32)),
27740 0);
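// In the extended frame, the Swift async context slot sits immediately
// below the saved frame pointer, so its address is simply RBP - 8.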
27741 // Return { result, chain }.
27742 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27743 CopyRBP.getValue(1));
27744 } else {
27745 // No special extended frame, create or reuse an existing stack slot.
27746 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
27747 if (!X86FI->getSwiftAsyncContextFrameIdx())
27748 X86FI->setSwiftAsyncContextFrameIdx(
27749 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
27750 false));
27751 SDValue Result =
27752 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
27753 PtrSize == 8 ? MVT::i64 : MVT::i32);
27754 // Return { result, chain }.
27755 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27756 Op->getOperand(0));
27757 }
27758 }
27759
27760 case llvm::Intrinsic::x86_seh_ehregnode:
27761 return MarkEHRegistrationNode(Op, DAG);
27762 case llvm::Intrinsic::x86_seh_ehguard:
27763 return MarkEHGuard(Op, DAG);
27764 case llvm::Intrinsic::x86_rdpkru: {
27765 SDLoc dl(Op);
27766 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27767 // Create a RDPKRU node and pass 0 to the ECX parameter.
27768 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27769 DAG.getConstant(0, dl, MVT::i32));
27770 }
27771 case llvm::Intrinsic::x86_wrpkru: {
27772 SDLoc dl(Op);
27773 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27774 // to the EDX and ECX parameters.
27775 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27776 Op.getOperand(0), Op.getOperand(2),
27777 DAG.getConstant(0, dl, MVT::i32),
27778 DAG.getConstant(0, dl, MVT::i32));
27779 }
27780 case llvm::Intrinsic::asan_check_memaccess: {
27781 // Mark this as adjustsStack because it will be lowered to a call.
27782 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
27783 // Don't do anything here; we will expand these intrinsics out later.
27784 return Op;
27785 }
27786 case llvm::Intrinsic::x86_flags_read_u32:
27787 case llvm::Intrinsic::x86_flags_read_u64:
27788 case llvm::Intrinsic::x86_flags_write_u32:
27789 case llvm::Intrinsic::x86_flags_write_u64: {
27790 // We need a frame pointer because this will get lowered to a PUSH/POP
27791 // sequence.
27792 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27793 MFI.setHasCopyImplyingStackAdjustment(true);
27794 // Don't do anything here; we will expand these intrinsics out later
27795 // during FinalizeISel, in EmitInstrWithCustomInserter.
27796 return Op;
27797 }
27798 case Intrinsic::x86_lwpins32:
27799 case Intrinsic::x86_lwpins64:
27800 case Intrinsic::x86_umwait:
27801 case Intrinsic::x86_tpause: {
27802 SDLoc dl(Op);
27803 SDValue Chain = Op->getOperand(0);
27804 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27805 unsigned Opcode;
27806
27807 switch (IntNo) {
27808 default: llvm_unreachable("Impossible intrinsic");
27809 case Intrinsic::x86_umwait:
27810 Opcode = X86ISD::UMWAIT;
27811 break;
27812 case Intrinsic::x86_tpause:
27813 Opcode = X86ISD::TPAUSE;
27814 break;
27815 case Intrinsic::x86_lwpins32:
27816 case Intrinsic::x86_lwpins64:
27817 Opcode = X86ISD::LWPINS;
27818 break;
27819 }
27820
27821 SDValue Operation =
27822 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27823 Op->getOperand(3), Op->getOperand(4));
27824 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27825 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27826 Operation.getValue(1));
27827 }
27828 case Intrinsic::x86_enqcmd:
27829 case Intrinsic::x86_enqcmds: {
27830 SDLoc dl(Op);
27831 SDValue Chain = Op.getOperand(0);
27832 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27833 unsigned Opcode;
27834 switch (IntNo) {
27835 default: llvm_unreachable("Impossible intrinsic!");
27836 case Intrinsic::x86_enqcmd:
27837 Opcode = X86ISD::ENQCMD;
27838 break;
27839 case Intrinsic::x86_enqcmds:
27840 Opcode = X86ISD::ENQCMDS;
27841 break;
27842 }
27843 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27844 Op.getOperand(3));
27845 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27846 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27847 Operation.getValue(1));
27848 }
27849 case Intrinsic::x86_aesenc128kl:
27850 case Intrinsic::x86_aesdec128kl:
27851 case Intrinsic::x86_aesenc256kl:
27852 case Intrinsic::x86_aesdec256kl: {
27853 SDLoc DL(Op);
27854 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27855 SDValue Chain = Op.getOperand(0);
27856 unsigned Opcode;
27857
27858 switch (IntNo) {
27859 default: llvm_unreachable("Impossible intrinsic");
27860 case Intrinsic::x86_aesenc128kl:
27861 Opcode = X86ISD::AESENC128KL;
27862 break;
27863 case Intrinsic::x86_aesdec128kl:
27864 Opcode = X86ISD::AESDEC128KL;
27865 break;
27866 case Intrinsic::x86_aesenc256kl:
27867 Opcode = X86ISD::AESENC256KL;
27868 break;
27869 case Intrinsic::x86_aesdec256kl:
27870 Opcode = X86ISD::AESDEC256KL;
27871 break;
27872 }
27873
27875 MachineMemOperand *MMO = MemIntr->getMemOperand();
27876 EVT MemVT = MemIntr->getMemoryVT();
27877 SDValue Operation = DAG.getMemIntrinsicNode(
27878 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27879 MMO);
27880 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27881
27882 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27883 {ZF, Operation.getValue(0), Operation.getValue(2)});
27884 }
27885 case Intrinsic::x86_aesencwide128kl:
27886 case Intrinsic::x86_aesdecwide128kl:
27887 case Intrinsic::x86_aesencwide256kl:
27888 case Intrinsic::x86_aesdecwide256kl: {
27889 SDLoc DL(Op);
27890 SDVTList VTs = DAG.getVTList(
27891 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27892 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27893 SDValue Chain = Op.getOperand(0);
27894 unsigned Opcode;
27895
27896 switch (IntNo) {
27897 default: llvm_unreachable("Impossible intrinsic");
27898 case Intrinsic::x86_aesencwide128kl:
27899 Opcode = X86ISD::AESENCWIDE128KL;
27900 break;
27901 case Intrinsic::x86_aesdecwide128kl:
27902 Opcode = X86ISD::AESDECWIDE128KL;
27903 break;
27904 case Intrinsic::x86_aesencwide256kl:
27905 Opcode = X86ISD::AESENCWIDE256KL;
27906 break;
27907 case Intrinsic::x86_aesdecwide256kl:
27908 Opcode = X86ISD::AESDECWIDE256KL;
27909 break;
27910 }
27911
27913 MachineMemOperand *MMO = MemIntr->getMemOperand();
27914 EVT MemVT = MemIntr->getMemoryVT();
27915 SDValue Operation = DAG.getMemIntrinsicNode(
27916 Opcode, DL, VTs,
27917 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27918 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27919 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27920 MemVT, MMO);
27921 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27922
27923 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27924 {ZF, Operation.getValue(1), Operation.getValue(2),
27925 Operation.getValue(3), Operation.getValue(4),
27926 Operation.getValue(5), Operation.getValue(6),
27927 Operation.getValue(7), Operation.getValue(8),
27928 Operation.getValue(9)});
27929 }
27930 case Intrinsic::x86_testui: {
27931 SDLoc dl(Op);
27932 SDValue Chain = Op.getOperand(0);
27933 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27934 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27935 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27936 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27937 Operation.getValue(1));
27938 }
27939 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27940 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27941 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27942 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27943 case Intrinsic::x86_t2rpntlvwz0_internal:
27944 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27945 case Intrinsic::x86_t2rpntlvwz1_internal:
27946 case Intrinsic::x86_t2rpntlvwz1t1_internal: {
27947 auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
27949 unsigned IntNo = Op.getConstantOperandVal(1);
27950 unsigned Opc = 0;
27951 switch (IntNo) {
27952 default:
27953 llvm_unreachable("Unexpected intrinsic!");
27954 case Intrinsic::x86_t2rpntlvwz0_internal:
27955 Opc = X86::PT2RPNTLVWZ0V;
27956 break;
27957 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27958 Opc = X86::PT2RPNTLVWZ0T1V;
27959 break;
27960 case Intrinsic::x86_t2rpntlvwz1_internal:
27961 Opc = X86::PT2RPNTLVWZ1V;
27962 break;
27963 case Intrinsic::x86_t2rpntlvwz1t1_internal:
27964 Opc = X86::PT2RPNTLVWZ1T1V;
27965 break;
27966 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27967 Opc = X86::PT2RPNTLVWZ0RSV;
27968 break;
27969 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27970 Opc = X86::PT2RPNTLVWZ0RST1V;
27971 break;
27972 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27973 Opc = X86::PT2RPNTLVWZ1RSV;
27974 break;
27975 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27976 Opc = X86::PT2RPNTLVWZ1RST1V;
27977 break;
27978 }
27979
27980 SDLoc DL(Op);
27981 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27982
27983 SDValue Ops[] = {Op.getOperand(2), // Row
27984 Op.getOperand(3), // Col0
27985 Op.getOperand(4), // Col1
27986 Op.getOperand(5), // Base
27987 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27988 Op.getOperand(6), // Index
27989 DAG.getTargetConstant(0, DL, MVT::i32), // Disp
27990 DAG.getRegister(0, MVT::i16), // Segment
27991 Op.getOperand(0)}; // Chain
27992
27993 MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
27994 SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
27995 SDValue(Res, 0));
27996 SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
27997 SDValue(Res, 0));
27998 return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
27999 }
28000 case Intrinsic::x86_atomic_bts_rm:
28001 case Intrinsic::x86_atomic_btc_rm:
28002 case Intrinsic::x86_atomic_btr_rm: {
28003 SDLoc DL(Op);
28004 MVT VT = Op.getSimpleValueType();
28005 SDValue Chain = Op.getOperand(0);
28006 SDValue Op1 = Op.getOperand(2);
28007 SDValue Op2 = Op.getOperand(3);
28008 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
28009 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
28010 : X86ISD::LBTR_RM;
28011 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28012 SDValue Res =
28013 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28014 {Chain, Op1, Op2}, VT, MMO);
28015 Chain = Res.getValue(1);
28016 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28017 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28018 }
28019 case Intrinsic::x86_atomic_bts:
28020 case Intrinsic::x86_atomic_btc:
28021 case Intrinsic::x86_atomic_btr: {
28022 SDLoc DL(Op);
28023 MVT VT = Op.getSimpleValueType();
28024 SDValue Chain = Op.getOperand(0);
28025 SDValue Op1 = Op.getOperand(2);
28026 SDValue Op2 = Op.getOperand(3);
28027 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
28028 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
28029 : X86ISD::LBTR;
28030 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
28031 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28032 SDValue Res =
28033 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28034 {Chain, Op1, Op2, Size}, VT, MMO);
28035 Chain = Res.getValue(1);
28036 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
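// CF holds the original value of the tested bit. For example, for
// x86_atomic_bts(p, 3) the SETB result is shifted left by 3 below, so the
// return value has the same shape as (old & (1 << 3)).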
28037 unsigned Imm = Op2->getAsZExtVal();
28038 if (Imm)
28039 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
28040 DAG.getShiftAmountConstant(Imm, VT, DL));
28041 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28042 }
28043 case Intrinsic::x86_cmpccxadd32:
28044 case Intrinsic::x86_cmpccxadd64: {
28045 SDLoc DL(Op);
28046 SDValue Chain = Op.getOperand(0);
28047 SDValue Addr = Op.getOperand(2);
28048 SDValue Src1 = Op.getOperand(3);
28049 SDValue Src2 = Op.getOperand(4);
28050 SDValue CC = Op.getOperand(5);
28051 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28052 SDValue Operation = DAG.getMemIntrinsicNode(
28053 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
28054 MVT::i32, MMO);
28055 return Operation;
28056 }
28057 case Intrinsic::x86_aadd32:
28058 case Intrinsic::x86_aadd64:
28059 case Intrinsic::x86_aand32:
28060 case Intrinsic::x86_aand64:
28061 case Intrinsic::x86_aor32:
28062 case Intrinsic::x86_aor64:
28063 case Intrinsic::x86_axor32:
28064 case Intrinsic::x86_axor64: {
28065 SDLoc DL(Op);
28066 SDValue Chain = Op.getOperand(0);
28067 SDValue Op1 = Op.getOperand(2);
28068 SDValue Op2 = Op.getOperand(3);
28069 MVT VT = Op2.getSimpleValueType();
28070 unsigned Opc = 0;
28071 switch (IntNo) {
28072 default:
28073 llvm_unreachable("Unknown Intrinsic");
28074 case Intrinsic::x86_aadd32:
28075 case Intrinsic::x86_aadd64:
28076 Opc = X86ISD::AADD;
28077 break;
28078 case Intrinsic::x86_aand32:
28079 case Intrinsic::x86_aand64:
28080 Opc = X86ISD::AAND;
28081 break;
28082 case Intrinsic::x86_aor32:
28083 case Intrinsic::x86_aor64:
28084 Opc = X86ISD::AOR;
28085 break;
28086 case Intrinsic::x86_axor32:
28087 case Intrinsic::x86_axor64:
28088 Opc = X86ISD::AXOR;
28089 break;
28090 }
28091 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
28092 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
28093 {Chain, Op1, Op2}, VT, MMO);
28094 }
28095 case Intrinsic::x86_atomic_add_cc:
28096 case Intrinsic::x86_atomic_sub_cc:
28097 case Intrinsic::x86_atomic_or_cc:
28098 case Intrinsic::x86_atomic_and_cc:
28099 case Intrinsic::x86_atomic_xor_cc: {
28100 SDLoc DL(Op);
28101 SDValue Chain = Op.getOperand(0);
28102 SDValue Op1 = Op.getOperand(2);
28103 SDValue Op2 = Op.getOperand(3);
28104 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
28105 MVT VT = Op2.getSimpleValueType();
28106 unsigned Opc = 0;
28107 switch (IntNo) {
28108 default:
28109 llvm_unreachable("Unknown Intrinsic");
28110 case Intrinsic::x86_atomic_add_cc:
28111 Opc = X86ISD::LADD;
28112 break;
28113 case Intrinsic::x86_atomic_sub_cc:
28114 Opc = X86ISD::LSUB;
28115 break;
28116 case Intrinsic::x86_atomic_or_cc:
28117 Opc = X86ISD::LOR;
28118 break;
28119 case Intrinsic::x86_atomic_and_cc:
28120 Opc = X86ISD::LAND;
28121 break;
28122 case Intrinsic::x86_atomic_xor_cc:
28123 Opc = X86ISD::LXOR;
28124 break;
28125 }
28126 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28127 SDValue LockArith =
28128 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28129 {Chain, Op1, Op2}, VT, MMO);
28130 Chain = LockArith.getValue(1);
28131 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
28132 }
28133 }
28134 return SDValue();
28135 }
28136
28137 SDLoc dl(Op);
28138 switch(IntrData->Type) {
28139 default: llvm_unreachable("Unknown Intrinsic Type");
28140 case RDSEED:
28141 case RDRAND: {
28142 // Emit the node with the right value type.
28143 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
28144 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28145
28146 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
28147 // Otherwise return the RDRAND/RDSEED result, which is defined to be 0 in
28148 // that case, cast to i32.
28148 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
28149 DAG.getConstant(1, dl, Op->getValueType(1)),
28150 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
28151 SDValue(Result.getNode(), 1)};
28152 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
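// The CMOV yields the constant 1 when COND_B (CF=1) holds and otherwise
// falls back to the zero-extended RDRAND/RDSEED result, which is 0 on failure.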
28153
28154 // Return { result, isValid, chain }.
28155 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
28156 SDValue(Result.getNode(), 2));
28157 }
28158 case GATHER_AVX2: {
28159 SDValue Chain = Op.getOperand(0);
28160 SDValue Src = Op.getOperand(2);
28161 SDValue Base = Op.getOperand(3);
28162 SDValue Index = Op.getOperand(4);
28163 SDValue Mask = Op.getOperand(5);
28164 SDValue Scale = Op.getOperand(6);
28165 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28166 Scale, Chain, Subtarget);
28167 }
28168 case GATHER: {
28169 // gather(v1, mask, index, base, scale);
28170 SDValue Chain = Op.getOperand(0);
28171 SDValue Src = Op.getOperand(2);
28172 SDValue Base = Op.getOperand(3);
28173 SDValue Index = Op.getOperand(4);
28174 SDValue Mask = Op.getOperand(5);
28175 SDValue Scale = Op.getOperand(6);
28176 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
28177 Chain, Subtarget);
28178 }
28179 case SCATTER: {
28180 // scatter(base, mask, index, v1, scale);
28181 SDValue Chain = Op.getOperand(0);
28182 SDValue Base = Op.getOperand(2);
28183 SDValue Mask = Op.getOperand(3);
28184 SDValue Index = Op.getOperand(4);
28185 SDValue Src = Op.getOperand(5);
28186 SDValue Scale = Op.getOperand(6);
28187 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28188 Scale, Chain, Subtarget);
28189 }
28190 case PREFETCH: {
28191 const APInt &HintVal = Op.getConstantOperandAPInt(6);
28192 assert((HintVal == 2 || HintVal == 3) &&
28193 "Wrong prefetch hint in intrinsic: should be 2 or 3");
28194 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
28195 SDValue Chain = Op.getOperand(0);
28196 SDValue Mask = Op.getOperand(2);
28197 SDValue Index = Op.getOperand(3);
28198 SDValue Base = Op.getOperand(4);
28199 SDValue Scale = Op.getOperand(5);
28200 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
28201 Subtarget);
28202 }
28203 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
28204 case RDTSC: {
28206 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
28207 Results);
28208 return DAG.getMergeValues(Results, dl);
28209 }
28210 // Read Performance Monitoring Counters.
28211 case RDPMC:
28212 // Read Processor Register.
28213 case RDPRU:
28214 // GetExtended Control Register.
28215 case XGETBV: {
28217
28218 // RDPMC uses ECX to select the index of the performance counter to read.
28219 // RDPRU uses ECX to select the processor register to read.
28220 // XGETBV uses ECX to select the index of the XCR register to return.
28221 // The result is stored into registers EDX:EAX.
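// For example, XGETBV with ECX = 0 reads XCR0; the helper below recombines
// the EDX:EAX halves into the intrinsic's i64 result.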
28222 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
28223 Subtarget, Results);
28224 return DAG.getMergeValues(Results, dl);
28225 }
28226 // XTEST intrinsics.
28227 case XTEST: {
28228 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
28229 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28230
28231 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
28232 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
28233 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
28234 Ret, SDValue(InTrans.getNode(), 1));
28235 }
28236 case TRUNCATE_TO_MEM_VI8:
28237 case TRUNCATE_TO_MEM_VI16:
28238 case TRUNCATE_TO_MEM_VI32: {
28239 SDValue Mask = Op.getOperand(4);
28240 SDValue DataToTruncate = Op.getOperand(3);
28241 SDValue Addr = Op.getOperand(2);
28242 SDValue Chain = Op.getOperand(0);
28243
28244 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
28245 assert(MemIntr && "Expected MemIntrinsicSDNode!");
28246
28247 EVT MemVT = MemIntr->getMemoryVT();
28248
28249 uint16_t TruncationOp = IntrData->Opc0;
28250 switch (TruncationOp) {
28251 case X86ISD::VTRUNC: {
28252 if (isAllOnesConstant(Mask)) // return just a truncate store
28253 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
28254 MemIntr->getMemOperand());
28255
28256 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28257 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28258 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
28259
28260 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
28261 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
28262 true /* truncating */);
28263 }
28264 case X86ISD::VTRUNCUS:
28265 case X86ISD::VTRUNCS: {
28266 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
28267 if (isAllOnesConstant(Mask))
28268 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
28269 MemIntr->getMemOperand(), DAG);
28270
28271 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28272 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28273
28274 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
28275 VMask, MemVT, MemIntr->getMemOperand(), DAG);
28276 }
28277 default:
28278 llvm_unreachable("Unsupported truncstore intrinsic");
28279 }
28280 }
28281 case INTR_TYPE_CAST_MMX:
28282 return SDValue(); // handled in combineINTRINSIC_*
28283 }
28284}
28285
28286SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
28287 SelectionDAG &DAG) const {
28288 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28289 MFI.setReturnAddressIsTaken(true);
28290
28291 unsigned Depth = Op.getConstantOperandVal(0);
28292 SDLoc dl(Op);
28293 EVT PtrVT = Op.getValueType();
28294
28295 if (Depth > 0) {
28296 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
28297 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28298 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
28299 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28300 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
28301 MachinePointerInfo());
28302 }
28303
28304 // Just load the return address.
28305 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
28306 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
28307 MachinePointerInfo());
28308}
28309
28310SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
28311 SelectionDAG &DAG) const {
28313 return getReturnAddressFrameIndex(DAG);
28314}
28315
28316SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
28317 MachineFunction &MF = DAG.getMachineFunction();
28318 MachineFrameInfo &MFI = MF.getFrameInfo();
28319 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
28320 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28321 EVT VT = Op.getValueType();
28322
28323 MFI.setFrameAddressIsTaken(true);
28324
28325 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
28326 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
28327 // is not possible to crawl up the stack without looking at the unwind codes
28328 // simultaneously.
28329 int FrameAddrIndex = FuncInfo->getFAIndex();
28330 if (!FrameAddrIndex) {
28331 // Set up a frame object for the return address.
28332 unsigned SlotSize = RegInfo->getSlotSize();
28333 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
28334 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
28335 FuncInfo->setFAIndex(FrameAddrIndex);
28336 }
28337 return DAG.getFrameIndex(FrameAddrIndex, VT);
28338 }
28339
28340 Register FrameReg =
28341 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
28342 SDLoc dl(Op); // FIXME probably not meaningful
28343 unsigned Depth = Op.getConstantOperandVal(0);
28344 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
28345 (FrameReg == X86::EBP && VT == MVT::i32)) &&
28346 "Invalid Frame Register!");
28347 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
28348 while (Depth--)
28349 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
28350 MachinePointerInfo());
28351 return FrameAddr;
28352}
28353
28354// FIXME? Maybe this could be a TableGen attribute on some registers and
28355// this table could be generated automatically from RegInfo.
28357 const MachineFunction &MF) const {
28358 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
28359
28361 .Case("esp", X86::ESP)
28362 .Case("rsp", X86::RSP)
28363 .Case("ebp", X86::EBP)
28364 .Case("rbp", X86::RBP)
28365 .Case("r14", X86::R14)
28366 .Case("r15", X86::R15)
28367 .Default(0);
28368
28369 if (Reg == X86::EBP || Reg == X86::RBP) {
28370 if (!TFI.hasFP(MF))
28371 report_fatal_error("register " + StringRef(RegName) +
28372 " is allocatable: function has no frame pointer");
28373#ifndef NDEBUG
28374 else {
28375 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28376 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
28377 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
28378 "Invalid Frame Register!");
28379 }
28380#endif
28381 }
28382
28383 return Reg;
28384}
28385
28386SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
28387 SelectionDAG &DAG) const {
28388 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28389 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28390}
28391
28393 const Constant *PersonalityFn) const {
28394 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
28395 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28396
28397 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
28398}
28399
28401 const Constant *PersonalityFn) const {
28402 // Funclet personalities don't use selectors (the runtime does the selection).
28404 return X86::NoRegister;
28405 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28406}
28407
28409 return Subtarget.isTargetWin64();
28410}
28411
28412SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
28413 SDValue Chain = Op.getOperand(0);
28414 SDValue Offset = Op.getOperand(1);
28415 SDValue Handler = Op.getOperand(2);
28416 SDLoc dl (Op);
28417
28418 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28419 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28420 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28421 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
28422 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
28423 "Invalid Frame Register!");
28424 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
28425 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
28426
28427 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
28428 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28429 dl));
28430 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
28431 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
28432 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
28433
28434 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
28435 DAG.getRegister(StoreAddrReg, PtrVT));
28436}
28437
28438SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28439 SelectionDAG &DAG) const {
28440 SDLoc DL(Op);
28441 // If the subtarget is not 64-bit, we may need the global base reg
28442 // after isel pseudo expansion, i.e., after the CGBR pass has run.
28443 // Therefore, ask for the GlobalBaseReg now, so that the pass
28444 // inserts the code for us in case we need it.
28445 // Otherwise, we will end up in a situation where we will
28446 // reference a virtual register that is not defined!
28447 if (!Subtarget.is64Bit()) {
28448 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28449 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28450 }
28451 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28452 DAG.getVTList(MVT::i32, MVT::Other),
28453 Op.getOperand(0), Op.getOperand(1));
28454}
28455
28456SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28457 SelectionDAG &DAG) const {
28458 SDLoc DL(Op);
28459 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28460 Op.getOperand(0), Op.getOperand(1));
28461}
28462
28463SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28464 SelectionDAG &DAG) const {
28465 SDLoc DL(Op);
28466 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28467 Op.getOperand(0));
28468}
28469
28471 return Op.getOperand(0);
28472}
28473
28474SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
28475 SelectionDAG &DAG) const {
28476 SDValue Root = Op.getOperand(0);
28477 SDValue Trmp = Op.getOperand(1); // trampoline
28478 SDValue FPtr = Op.getOperand(2); // nested function
28479 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
28480 SDLoc dl (Op);
28481
28482 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
28483 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
28484
28485 if (Subtarget.is64Bit()) {
28486 SDValue OutChains[6];
28487
28488 // Large code-model.
28489 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
28490 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
28491
28492 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
28493 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
28494
28495 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
28496
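// The six stores below assemble a 23-byte thunk:
//   movabsq $FPtr, %r11   (offsets 0-9)
//   movabsq $Nest, %r10   (offsets 10-19)
//   jmpq    *%r11         (offsets 20-22)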
28497 // Load the pointer to the nested function into R11.
28498 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
28499 SDValue Addr = Trmp;
28500 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28501 Addr, MachinePointerInfo(TrmpAddr));
28502
28503 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28504 DAG.getConstant(2, dl, MVT::i64));
28505 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
28506 MachinePointerInfo(TrmpAddr, 2), Align(2));
28507
28508 // Load the 'nest' parameter value into R10.
28509 // R10 is specified in X86CallingConv.td
28510 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
28511 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28512 DAG.getConstant(10, dl, MVT::i64));
28513 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28514 Addr, MachinePointerInfo(TrmpAddr, 10));
28515
28516 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28517 DAG.getConstant(12, dl, MVT::i64));
28518 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
28519 MachinePointerInfo(TrmpAddr, 12), Align(2));
28520
28521 // Jump to the nested function.
28522 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
28523 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28524 DAG.getConstant(20, dl, MVT::i64));
28525 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28526 Addr, MachinePointerInfo(TrmpAddr, 20));
28527
28528 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
28529 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28530 DAG.getConstant(22, dl, MVT::i64));
28531 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
28532 Addr, MachinePointerInfo(TrmpAddr, 22));
28533
28534 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28535 } else {
28536 const Function *Func =
28537 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28538 CallingConv::ID CC = Func->getCallingConv();
28539 unsigned NestReg;
28540
28541 switch (CC) {
28542 default:
28543 llvm_unreachable("Unsupported calling convention");
28544 case CallingConv::C:
28546 // Pass 'nest' parameter in ECX.
28547 // Must be kept in sync with X86CallingConv.td
28548 NestReg = X86::ECX;
28549
28550 // Check that ECX wasn't needed by an 'inreg' parameter.
28551 FunctionType *FTy = Func->getFunctionType();
28552 const AttributeList &Attrs = Func->getAttributes();
28553
28554 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28555 unsigned InRegCount = 0;
28556 unsigned Idx = 0;
28557
28558 for (FunctionType::param_iterator I = FTy->param_begin(),
28559 E = FTy->param_end(); I != E; ++I, ++Idx)
28560 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28561 const DataLayout &DL = DAG.getDataLayout();
28562 // FIXME: should only count parameters that are lowered to integers.
28563 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
28564 }
28565
28566 if (InRegCount > 2) {
28567 report_fatal_error("Nest register in use - reduce number of inreg"
28568 " parameters!");
28569 }
28570 }
28571 break;
28572 }
28575 case CallingConv::Fast:
28576 case CallingConv::Tail:
28578 // Pass 'nest' parameter in EAX.
28579 // Must be kept in sync with X86CallingConv.td
28580 NestReg = X86::EAX;
28581 break;
28582 }
28583
28584 SDValue OutChains[4];
28585 SDValue Addr, Disp;
28586
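// The four stores below assemble a 10-byte thunk:
//   movl $Nest, %ecx/%eax   (offsets 0-4)
//   jmp  rel32              (offsets 5-9, where Disp = FPtr - (Trmp + 10))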
28587 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28588 DAG.getConstant(10, dl, MVT::i32));
28589 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
28590
28591 // This is storing the opcode for MOV32ri.
28592 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
28593 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28594 OutChains[0] =
28595 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
28596 Trmp, MachinePointerInfo(TrmpAddr));
28597
28598 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28599 DAG.getConstant(1, dl, MVT::i32));
28600 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
28601 MachinePointerInfo(TrmpAddr, 1), Align(1));
28602
28603 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
28604 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28605 DAG.getConstant(5, dl, MVT::i32));
28606 OutChains[2] =
28607 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
28608 MachinePointerInfo(TrmpAddr, 5), Align(1));
28609
28610 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28611 DAG.getConstant(6, dl, MVT::i32));
28612 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
28613 MachinePointerInfo(TrmpAddr, 6), Align(1));
28614
28615 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28616 }
28617}
28618
28619SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
28620 SelectionDAG &DAG) const {
28621 /*
28622 The rounding mode is in bits 11:10 of the FP Control Word, and has the following
28623 settings:
28624 00 Round to nearest
28625 01 Round to -inf
28626 10 Round to +inf
28627 11 Round to 0
28628
28629 GET_ROUNDING, on the other hand, expects the following:
28630 -1 Undefined
28631 0 Round to 0
28632 1 Round to nearest
28633 2 Round to +inf
28634 3 Round to -inf
28635
28636 To perform the conversion, we use a packed lookup table of the four 2-bit
28637 values that we can index by FPCW[11:10]
28638 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPCW[11:10]
28639
28640 (0x2d >> ((FPCW & 0xc00) >> 9)) & 3
28641 */
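// For example, with RC = 10b (round to +inf): (FPCW & 0xc00) >> 9 == 4 and
// (0x2d >> 4) & 3 == 2, which is the GET_ROUNDING encoding for round to +inf.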
28642
28643 MachineFunction &MF = DAG.getMachineFunction();
28644 MVT VT = Op.getSimpleValueType();
28645 SDLoc DL(Op);
28646
28647 // Save FP Control Word to stack slot
28648 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
28649 SDValue StackSlot =
28650 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
28651
28652 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
28653
28654 SDValue Chain = Op.getOperand(0);
28655 SDValue Ops[] = {Chain, StackSlot};
28656   Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
28657                                   DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
28658                                   Align(2), MachineMemOperand::MOStore);
28659
28660 // Load FP Control Word from stack slot
28661 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
28662 Chain = CWD.getValue(1);
28663
28664 // Mask and turn the control bits into a shift for the lookup table.
28665 SDValue Shift =
28666 DAG.getNode(ISD::SRL, DL, MVT::i16,
28667 DAG.getNode(ISD::AND, DL, MVT::i16,
28668 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
28669 DAG.getConstant(9, DL, MVT::i8));
28670 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
28671
28672 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
28673 SDValue RetVal =
28674 DAG.getNode(ISD::AND, DL, MVT::i32,
28675 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
28676 DAG.getConstant(3, DL, MVT::i32));
28677
28678 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
28679
28680 return DAG.getMergeValues({RetVal, Chain}, DL);
28681}
28682
28683SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
28684 SelectionDAG &DAG) const {
28685 MachineFunction &MF = DAG.getMachineFunction();
28686 SDLoc DL(Op);
28687 SDValue Chain = Op.getNode()->getOperand(0);
28688
28689 // FP control word may be set only from data in memory. So we need to allocate
28690 // stack space to save/load FP control word.
28691 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
28692 SDValue StackSlot =
28693 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
28694 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
28695   MachineMemOperand *MMO =
28696       MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
28697
28698 // Store FP control word into memory.
28699 SDValue Ops[] = {Chain, StackSlot};
28700 Chain = DAG.getMemIntrinsicNode(
28701 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
28702
28703 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
28704 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
28705 Chain = CWD.getValue(1);
28706 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
28707 DAG.getConstant(0xf3ff, DL, MVT::i16));
28708
28709 // Calculate new rounding mode.
28710 SDValue NewRM = Op.getNode()->getOperand(1);
28711 SDValue RMBits;
28712 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
28713 uint64_t RM = CVal->getZExtValue();
28714 int FieldVal = X86::getRoundingModeX86(RM);
28715
28716 if (FieldVal == X86::rmInvalid) {
28717 FieldVal = X86::rmToNearest;
28718 LLVMContext &C = MF.getFunction().getContext();
28719 C.diagnose(DiagnosticInfoUnsupported(
28720 MF.getFunction(), "rounding mode is not supported by X86 hardware",
28721 DiagnosticLocation(DL.getDebugLoc()), DS_Error));
28722 }
28723 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28724 } else {
28725     // Need to convert the argument into bits of the control word:
28726 // 0 Round to 0 -> 11
28727 // 1 Round to nearest -> 00
28728 // 2 Round to +inf -> 10
28729 // 3 Round to -inf -> 01
28730     // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
28731 // To make the conversion, put all these values into a value 0xc9 and shift
28732 // it left depending on the rounding mode:
28733 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28734 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
28735 // ...
28736 // (0xc9 << (2 * NewRM + 4)) & 0xc00
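    // For example, NewRM = 2 (round to +inf) gives a shift of 2*2+4 = 8, and
    // (0xc9 << 8) & 0xc00 = 0x800, i.e. bits 11:10 = 10, matching the table
    // above.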
28737 SDValue ShiftValue =
28738 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28739 DAG.getNode(ISD::ADD, DL, MVT::i32,
28740 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28741 DAG.getConstant(1, DL, MVT::i8)),
28742 DAG.getConstant(4, DL, MVT::i32)));
28743 SDValue Shifted =
28744 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28745 ShiftValue);
28746 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28747 DAG.getConstant(0xc00, DL, MVT::i16));
28748 }
28749
28750 // Update rounding mode bits and store the new FP Control Word into stack.
28751 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28752 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
28753
28754 // Load FP control word from the slot.
28755 SDValue OpsLD[] = {Chain, StackSlot};
28756   MachineMemOperand *MMOL =
28757       MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
28758 Chain = DAG.getMemIntrinsicNode(
28759 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28760
28761 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28762 // same way but in bits 14:13.
28763 if (Subtarget.hasSSE1()) {
28764 // Store MXCSR into memory.
28765 Chain = DAG.getNode(
28766 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28767 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28768 StackSlot);
28769
28770 // Load MXCSR from stack slot and clear RM field (bits 14:13).
28771 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28772 Chain = CWD.getValue(1);
28773 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28774 DAG.getConstant(0xffff9fff, DL, MVT::i32));
28775
28776 // Shift X87 RM bits from 11:10 to 14:13.
28777 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28778 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28779 DAG.getConstant(3, DL, MVT::i8));
28780
28781 // Update rounding mode bits and store the new FP Control Word into stack.
28782 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28783 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
28784
28785 // Load MXCSR from the slot.
28786 Chain = DAG.getNode(
28787 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28788 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28789 StackSlot);
28790 }
28791
28792 return Chain;
28793}
28794
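// Memory layout of the FP environment used by GET_FPENV_MEM/SET_FPENV_MEM:
// the 28-byte protected-mode x87 FNSTENV/FLDENV image, followed by the 4-byte
// MXCSR at offset X87StateSize, for FPStateSize = 32 bytes in total.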
28795const unsigned X87StateSize = 28;
28796const unsigned FPStateSize = 32;
28797[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
28798
28799SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
28800                                               SelectionDAG &DAG) const {
28801   MachineFunction &MF = DAG.getMachineFunction();
28802 SDLoc DL(Op);
28803 SDValue Chain = Op->getOperand(0);
28804   SDValue Ptr = Op->getOperand(1);
28805   auto *Node = cast<FPStateAccessSDNode>(Op);
28806 EVT MemVT = Node->getMemoryVT();
28808 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28809
28810   // Get x87 state, if present.
28811 if (Subtarget.hasX87()) {
28812 Chain =
28813 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
28814 {Chain, Ptr}, MemVT, MMO);
28815
28816 // FNSTENV changes the exception mask, so load back the stored environment.
28817     MachineMemOperand::Flags NewFlags =
28818         MachineMemOperand::MOLoad |
28819         (MMO->getFlags() & ~MachineMemOperand::MOStore);
28820 MMO = MF.getMachineMemOperand(MMO, NewFlags);
28821 Chain =
28822 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28823 {Chain, Ptr}, MemVT, MMO);
28824 }
28825
28826 // If target supports SSE, get MXCSR as well.
28827 if (Subtarget.hasSSE1()) {
28828     // Get pointer to the MXCSR location in memory.
28829     MVT PtrVT = Ptr.getSimpleValueType();
28830 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28831 DAG.getConstant(X87StateSize, DL, PtrVT));
28832 // Store MXCSR into memory.
28833 Chain = DAG.getNode(
28834 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28835 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28836 MXCSRAddr);
28837 }
28838
28839 return Chain;
28840}
28841
28842 static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, SDLoc DL,
28843                                    EVT MemVT, MachineMemOperand *MMO,
28844 SelectionDAG &DAG,
28845 const X86Subtarget &Subtarget) {
28846   // Set x87 state, if present.
28847 if (Subtarget.hasX87())
28848 Chain =
28849 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28850 {Chain, Ptr}, MemVT, MMO);
28851 // If target supports SSE, set MXCSR as well.
28852 if (Subtarget.hasSSE1()) {
28853     // Get pointer to the MXCSR location in memory.
28854     MVT PtrVT = Ptr.getSimpleValueType();
28855 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28856 DAG.getConstant(X87StateSize, DL, PtrVT));
28857 // Load MXCSR from memory.
28858 Chain = DAG.getNode(
28859 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28860 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28861 MXCSRAddr);
28862 }
28863 return Chain;
28864}
28865
28866SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
28867 SelectionDAG &DAG) const {
28868 SDLoc DL(Op);
28869 SDValue Chain = Op->getOperand(0);
28870   SDValue Ptr = Op->getOperand(1);
28871   auto *Node = cast<FPStateAccessSDNode>(Op);
28872 EVT MemVT = Node->getMemoryVT();
28874 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28875 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
28876}
28877
28878SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
28879 SelectionDAG &DAG) const {
28880 MachineFunction &MF = DAG.getMachineFunction();
28881 SDLoc DL(Op);
28882 SDValue Chain = Op.getNode()->getOperand(0);
28883
28884 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
28885   ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
28886   SmallVector<Constant *, 8> FPEnvVals;
28887
28888   // x87 FPU Control Word: mask all floating-point exceptions, set rounding to
28889 // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
28890 // for compatibility with glibc.
28891 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
28892 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
28893 Constant *Zero = ConstantInt::get(ItemTy, 0);
28894 for (unsigned I = 0; I < 6; ++I)
28895 FPEnvVals.push_back(Zero);
28896
28897   // MXCSR: mask all floating-point exceptions, set rounding to nearest, clear
28898   // all exception flags, and set DAZ and FTZ to 0.
28899 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
28900 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
28901 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28902 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
28903   MachinePointerInfo MPI =
28904       MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
28905 MachineMemOperand *MMO = MF.getMachineMemOperand(
28907
28908 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
28909}
28910
28911// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28912uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
28913 assert((Amt < 8) && "Shift/Rotation amount out of range");
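  // For example, getGFNICtrlImm(ISD::SHL, 1) evaluates to
  // (0x0102040810204080 >> 1) & (0x0101010101010101 * 0x7F) =
  // 0x0081020408102040 & 0x7F7F7F7F7F7F7F7F = 0x0001020408102040.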
28914 switch (Opcode) {
28915 case ISD::BITREVERSE:
28916 return 0x8040201008040201ULL;
28917 case ISD::SHL:
28918 return ((0x0102040810204080ULL >> (Amt)) &
28919 (0x0101010101010101ULL * (0xFF >> (Amt))));
28920 case ISD::SRL:
28921 return ((0x0102040810204080ULL << (Amt)) &
28922 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
28923 case ISD::SRA:
28924 return (getGFNICtrlImm(ISD::SRL, Amt) |
28925 (0x8080808080808080ULL >> (64 - (8 * Amt))));
28926 case ISD::ROTL:
28927 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
28928 case ISD::ROTR:
28929 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
28930 }
28931 llvm_unreachable("Unsupported GFNI opcode");
28932}
28933
28934// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28935SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL,
28936 MVT VT, unsigned Amt = 0) {
28937 assert(VT.getVectorElementType() == MVT::i8 &&
28938 (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
28939 uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
28940 SmallVector<SDValue> MaskBits;
28941 for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
28942 uint64_t Bits = (Imm >> (I % 64)) & 255;
28943 MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
28944 }
28945 return DAG.getBuildVector(VT, DL, MaskBits);
28946}
28947
28948 /// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
28949//
28950// i8/i16 vector implemented using dword LZCNT vector instruction
28951// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
28952 // split the vector, perform the operation on its Lo and Hi parts and
28953 // concatenate the results.
28954 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
28955                                          const X86Subtarget &Subtarget) {
28956 assert(Op.getOpcode() == ISD::CTLZ);
28957 SDLoc dl(Op);
28958 MVT VT = Op.getSimpleValueType();
28959 MVT EltVT = VT.getVectorElementType();
28960 unsigned NumElems = VT.getVectorNumElements();
28961
28962 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28963 "Unsupported element type");
28964
28965   // Split the vector; its Lo and Hi parts will be handled in the next iteration.
28966 if (NumElems > 16 ||
28967 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28968 return splitVectorIntUnary(Op, DAG, dl);
28969
28970 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28971 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28972 "Unsupported value type for operation");
28973
28974   // Use the natively supported vector instruction vplzcntd.
28975 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28976 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28977 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
28978 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28979
28980 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28981}
28982
28983// Lower CTLZ using a PSHUFB lookup table implementation.
28984 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
28985                                        const X86Subtarget &Subtarget,
28986 SelectionDAG &DAG) {
28987 MVT VT = Op.getSimpleValueType();
28988 int NumElts = VT.getVectorNumElements();
28989 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28990 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
28991
28992 // Per-nibble leading zero PSHUFB lookup table.
28993 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
28994 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
28995 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
28996 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
28997
28998   SmallVector<SDValue, 64> LUTVec;
28999   for (int i = 0; i < NumBytes; ++i)
29000 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29001 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
29002
29003 // Begin by bitcasting the input to byte vector, then split those bytes
29004 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
29005 // If the hi input nibble is zero then we add both results together, otherwise
29006 // we just take the hi result (by masking the lo result to zero before the
29007 // add).
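  // For example, for the byte 0x1F the hi nibble is 0x1, so the result is
  // LUT[1] = 3 = ctlz(0x1F); for 0x05 the hi nibble is zero, so the result is
  // LUT[0] + LUT[5] = 4 + 1 = 5 = ctlz(0x05).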
29008 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
29009 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
29010
29011 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29012 SDValue Lo = Op0;
29013 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29014 SDValue HiZ;
29015 if (CurrVT.is512BitVector()) {
29016 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29017 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29018 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29019 } else {
29020 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29021 }
29022
29023 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29024 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29025 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29026 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29027
29028   // Merge the result from vXi8 back to VT, working on the lo/hi halves
29029 // of the current vector width in the same way we did for the nibbles.
29030 // If the upper half of the input element is zero then add the halves'
29031 // leading zero counts together, otherwise just use the upper half's.
29032 // Double the width of the result until we are at target width.
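  // For example, for a v8i16 element 0x001F the upper byte is zero, so the
  // counts add: ctlz8(0x00) + ctlz8(0x1F) = 8 + 3 = 11 = ctlz16(0x001F).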
29033 while (CurrVT != VT) {
29034 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29035 int CurrNumElts = CurrVT.getVectorNumElements();
29036 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29037 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29038 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29039
29040 // Check if the upper half of the input element is zero.
29041 if (CurrVT.is512BitVector()) {
29042 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29043 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29044 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29045 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29046 } else {
29047 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29048 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29049 }
29050 HiZ = DAG.getBitcast(NextVT, HiZ);
29051
29052 // Move the upper/lower halves to the lower bits as we'll be extending to
29053 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29054 // together.
29055 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29056 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29057 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29058 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29059 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29060 CurrVT = NextVT;
29061 }
29062
29063 return Res;
29064}
29065
29066 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
29067                                const X86Subtarget &Subtarget,
29068 SelectionDAG &DAG) {
29069 MVT VT = Op.getSimpleValueType();
29070
29071 if (Subtarget.hasCDI() &&
29072 // vXi8 vectors need to be promoted to 512-bits for vXi32.
29073 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29074 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29075
29076 // Decompose 256-bit ops into smaller 128-bit ops.
29077 if (VT.is256BitVector() && !Subtarget.hasInt256())
29078 return splitVectorIntUnary(Op, DAG, DL);
29079
29080 // Decompose 512-bit ops into smaller 256-bit ops.
29081 if (VT.is512BitVector() && !Subtarget.hasBWI())
29082 return splitVectorIntUnary(Op, DAG, DL);
29083
29084 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29085 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29086}
29087
29088 static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL,
29089                                     SelectionDAG &DAG,
29090 const X86Subtarget &Subtarget) {
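  // Strategy: bit-reverse the input so the leading-zero count becomes a
  // trailing-zero count, isolate the lowest set bit with x & -x (its position
  // now equals the original leading-zero count), then use a single
  // GF2P8AFFINEQB with a constant matrix to convert that one-hot (or zero)
  // byte into the count itself (a zero byte maps to 8).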
29091 MVT VT = Op.getSimpleValueType();
29092 SDValue Input = Op.getOperand(0);
29093
29094 assert(VT.isVector() && VT.getVectorElementType() == MVT::i8 &&
29095 "Expected vXi8 input for GFNI-based CTLZ lowering");
29096
29097 SDValue Reversed = DAG.getNode(ISD::BITREVERSE, DL, VT, Input);
29098
29099 SDValue Neg = DAG.getNegative(Reversed, DL, VT);
29100 SDValue Filtered = DAG.getNode(ISD::AND, DL, VT, Reversed, Neg);
29101
29102 MVT VT64 = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
29103 SDValue CTTZConst = DAG.getConstant(0xAACCF0FF00000000ULL, DL, VT64);
29104 SDValue CTTZMatrix = DAG.getBitcast(VT, CTTZConst);
29105
29106 SDValue LZCNT =
29107 DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, Filtered, CTTZMatrix,
29108 DAG.getTargetConstant(8, DL, MVT::i8));
29109 return LZCNT;
29110}
29111
29112static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29113 SelectionDAG &DAG) {
29114 MVT VT = Op.getSimpleValueType();
29115 MVT OpVT = VT;
29116 unsigned NumBits = VT.getSizeInBits();
29117 SDLoc dl(Op);
29118 unsigned Opc = Op.getOpcode();
29119
29120 if (VT.isVector() && VT.getScalarType() == MVT::i8 && Subtarget.hasGFNI())
29121 return LowerVectorCTLZ_GFNI(Op, dl, DAG, Subtarget);
29122
29123 if (VT.isVector())
29124 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29125
29126 Op = Op.getOperand(0);
29127 if (VT == MVT::i8) {
29128     // Zero extend to i32 since there is no i8 bsr.
29129 OpVT = MVT::i32;
29130 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29131 }
29132
29133   // Check if we can safely pass a result through BSR for zero sources.
29134 SDValue PassThru = DAG.getUNDEF(OpVT);
29135 if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
29136 !DAG.isKnownNeverZero(Op))
29137 PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
29138
29139 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29140 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29141 Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);
29142
29143 // Skip CMOV if we're using a pass through value.
29144 if (Opc == ISD::CTLZ && PassThru.isUndef()) {
29145 // If src is zero (i.e. bsr sets ZF), returns NumBits.
29146 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29147 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29148 Op.getValue(1)};
29149 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29150 }
29151
29152 // Finally xor with NumBits-1.
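  // For example, for i32: BSR(0x00010000) = 16 and 16 ^ 31 = 15, which is
  // ctlz(0x00010000).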
29153 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
29154 DAG.getConstant(NumBits - 1, dl, OpVT));
29155
29156 if (VT == MVT::i8)
29157 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
29158 return Op;
29159}
29160
29161static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
29162 SelectionDAG &DAG) {
29163 MVT VT = Op.getSimpleValueType();
29164 unsigned NumBits = VT.getScalarSizeInBits();
29165 SDValue N0 = Op.getOperand(0);
29166 SDLoc dl(Op);
29167 bool NonZeroSrc = DAG.isKnownNeverZero(N0);
29168
29169 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
29170 "Only scalar CTTZ requires custom lowering");
29171
29172   // Check if we can safely pass a result through BSF for zero sources.
29173 SDValue PassThru = DAG.getUNDEF(VT);
29174 if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
29175 PassThru = DAG.getConstant(NumBits, dl, VT);
29176
29177 // Issue a bsf (scan bits forward) which also sets EFLAGS.
29178 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29179 Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);
29180
29181 // Skip CMOV if src is never zero or we're using a pass through value.
29182 if (NonZeroSrc || !PassThru.isUndef())
29183 return Op;
29184
29185 // If src is zero (i.e. bsf sets ZF), returns NumBits.
29186 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
29187 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29188 Op.getValue(1)};
29189 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
29190}
29191
29192 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
29193                            const X86Subtarget &Subtarget) {
29194 MVT VT = Op.getSimpleValueType();
29195 SDLoc DL(Op);
29196
29197 if (VT == MVT::i16 || VT == MVT::i32)
29198 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
29199
29200 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29201 return splitVectorIntBinary(Op, DAG, DL);
29202
29203 assert(Op.getSimpleValueType().is256BitVector() &&
29204 Op.getSimpleValueType().isInteger() &&
29205 "Only handle AVX 256-bit vector integer operation");
29206 return splitVectorIntBinary(Op, DAG, DL);
29207}
29208
29209 static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
29210                                   const X86Subtarget &Subtarget) {
29211 MVT VT = Op.getSimpleValueType();
29212 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
29213 unsigned Opcode = Op.getOpcode();
29214 SDLoc DL(Op);
29215
29216 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
29217 (VT.is256BitVector() && !Subtarget.hasInt256())) {
29218 assert(Op.getSimpleValueType().isInteger() &&
29219 "Only handle AVX vector integer operation");
29220 return splitVectorIntBinary(Op, DAG, DL);
29221 }
29222
29223 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
29224 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29225 EVT SetCCResultType =
29226 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29227
29228 unsigned BitWidth = VT.getScalarSizeInBits();
29229 if (Opcode == ISD::USUBSAT) {
29230 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
29231 // Handle a special-case with a bit-hack instead of cmp+select:
29232 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
29233 // If the target can use VPTERNLOG, DAGToDAG will match this as
29234 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
29235 // "broadcast" constant load.
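      // For example, for i8 (SMIN = 0x80): X = 0x90 gives
      // (0x90 ^ 0x80) & (0x90 s>> 7) = 0x10 & 0xFF = 0x10 = usubsat(0x90, 0x80),
      // while X = 0x30 gives 0xB0 & 0x00 = 0 = usubsat(0x30, 0x80).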
29236       ConstantSDNode *C = ISD::isConstOrConstSplat(Y, true);
29237       if (C && C->getAPIntValue().isSignMask()) {
29238 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
29239 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
29240 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
29241 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
29242 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
29243 }
29244 }
29245 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
29246 // usubsat X, Y --> (X >u Y) ? X - Y : 0
29247 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
29248 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
29249 // TODO: Move this to DAGCombiner?
29250 if (SetCCResultType == VT &&
29251 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
29252 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
29253 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
29254 }
29255 }
29256
29257 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
29258       (!VT.isVector() || VT == MVT::v2i64)) {
29259     APInt MinVal = APInt::getSignedMinValue(BitWidth);
29260     APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
29261 SDValue Zero = DAG.getConstant(0, DL, VT);
29262 SDValue Result =
29263 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
29264 DAG.getVTList(VT, SetCCResultType), X, Y);
29265 SDValue SumDiff = Result.getValue(0);
29266 SDValue Overflow = Result.getValue(1);
29267 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
29268 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
29269 SDValue SumNeg =
29270 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
29271 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
29272 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
29273 }
29274
29275 // Use default expansion.
29276 return SDValue();
29277}
29278
29279static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
29280 SelectionDAG &DAG) {
29281 MVT VT = Op.getSimpleValueType();
29282 SDLoc DL(Op);
29283
29284 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
29285 // Since X86 does not have CMOV for 8-bit integer, we don't convert
29286 // 8-bit integer abs to NEG and CMOV.
29287 SDValue N0 = Op.getOperand(0);
29288 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
29289 DAG.getConstant(0, DL, VT), N0);
29290 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
29291 SDValue(Neg.getNode(), 1)};
29292 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
29293 }
29294
29295 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
29296 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
29297 SDValue Src = Op.getOperand(0);
29298 SDValue Neg = DAG.getNegative(Src, DL, VT);
29299 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
29300 }
29301
29302 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
29303 assert(VT.isInteger() &&
29304 "Only handle AVX 256-bit vector integer operation");
29305 return splitVectorIntUnary(Op, DAG, DL);
29306 }
29307
29308 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29309 return splitVectorIntUnary(Op, DAG, DL);
29310
29311 // Default to expand.
29312 return SDValue();
29313}
29314
29315static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
29316 SelectionDAG &DAG) {
29317 MVT VT = Op.getSimpleValueType();
29318 SDLoc DL(Op);
29319
29320 // For AVX1 cases, split to use legal ops.
29321 if (VT.is256BitVector() && !Subtarget.hasInt256())
29322 return splitVectorIntBinary(Op, DAG, DL);
29323
29324 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29325 return splitVectorIntBinary(Op, DAG, DL);
29326
29327 // Default to expand.
29328 return SDValue();
29329}
29330
29331static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
29332 SelectionDAG &DAG) {
29333 MVT VT = Op.getSimpleValueType();
29334 SDLoc DL(Op);
29335
29336 // For AVX1 cases, split to use legal ops.
29337 if (VT.is256BitVector() && !Subtarget.hasInt256())
29338 return splitVectorIntBinary(Op, DAG, DL);
29339
29340 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29341 return splitVectorIntBinary(Op, DAG, DL);
29342
29343 // Default to expand.
29344 return SDValue();
29345}
29346
29347 static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
29348                                       SelectionDAG &DAG) {
29349 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29350 EVT VT = Op.getValueType();
29351 SDValue X = Op.getOperand(0);
29352 SDValue Y = Op.getOperand(1);
29353 SDLoc DL(Op);
29354 bool IsMaxOp =
29355 Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29356 bool IsNum =
29357 Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29358 if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
29359 unsigned Opc = 0;
29360 if (VT.isVector())
29362 else if (VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64)
29364
29365 if (Opc) {
29366 SDValue Imm =
29367 DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
29368 return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
29369 }
29370 }
29371
29372 uint64_t SizeInBits = VT.getScalarSizeInBits();
29373 APInt PreferredZero = APInt::getZero(SizeInBits);
29374 APInt OppositeZero = PreferredZero;
29375 EVT IVT = VT.changeTypeToInteger();
29376 X86ISD::NodeType MinMaxOp;
29377 if (IsMaxOp) {
29378 MinMaxOp = X86ISD::FMAX;
29379 OppositeZero.setSignBit();
29380 } else {
29381 PreferredZero.setSignBit();
29382 MinMaxOp = X86ISD::FMIN;
29383 }
29384 EVT SetCCType =
29385 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29386
29387 // The tables below show the expected result of Max in cases of NaN and
29388 // signed zeros.
29389 //
29390 // Y Y
29391 // Num xNaN +0 -0
29392 // --------------- ---------------
29393 // Num | Max | Y | +0 | +0 | +0 |
29394 // X --------------- X ---------------
29395 // xNaN | X | X/Y | -0 | +0 | -0 |
29396 // --------------- ---------------
29397 //
29398 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
29399 // reordering.
29400 //
29401   // We check if any of the operands is NaN and return NaN. Then we check if any
29402   // of the operands is zero or negative zero (for fmaximum and fminimum
29403   // respectively) to ensure the correct zero is returned.
29404   auto MatchesZero = [](SDValue Op, APInt Zero) {
29405     Op = peekThroughBitcasts(Op);
29406 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
29407 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
29408 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
29409 return CstOp->getAPIntValue() == Zero;
29410 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
29411 Op->getOpcode() == ISD::SPLAT_VECTOR) {
29412 for (const SDValue &OpVal : Op->op_values()) {
29413 if (OpVal.isUndef())
29414 continue;
29415 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
29416 if (!CstOp)
29417 return false;
29418 if (!CstOp->getValueAPF().isZero())
29419 continue;
29420 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
29421 return false;
29422 }
29423 return true;
29424 }
29425 return false;
29426 };
29427
29428 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
29429 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
29430 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
29431 Op->getFlags().hasNoSignedZeros() ||
29432                           DAG.isKnownNeverZeroFloat(X) ||
29433                           DAG.isKnownNeverZeroFloat(Y);
29434 SDValue NewX, NewY;
29435 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
29436 MatchesZero(X, OppositeZero)) {
29437 // Operands are already in right order or order does not matter.
29438 NewX = X;
29439 NewY = Y;
29440 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
29441 NewX = Y;
29442 NewY = X;
29443 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
29444 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
29445 if (IsXNeverNaN)
29446 std::swap(X, Y);
29447     // VFPCLASSS consumes a vector type, so provide a minimal one
29448     // corresponding to an xmm register.
29449     MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
29450     SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
29451 // Bits of classes:
29452 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
29453 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
29454 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
29455 DL, MVT::i32);
29456 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
29457 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
29458 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
29459 DAG.getVectorIdxConstant(0, DL));
29460 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
29461 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
29462 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
29463 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29464 } else {
29465 SDValue IsXSigned;
29466 if (Subtarget.is64Bit() || VT != MVT::f64) {
29467 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
29468 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
29469 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
29470 } else {
29471 assert(VT == MVT::f64);
29472 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
29473 DAG.getConstantFP(0, DL, MVT::v2f64), X,
29474 DAG.getVectorIdxConstant(0, DL));
29475 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
29476 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
29477 DAG.getVectorIdxConstant(1, DL));
29478 Hi = DAG.getBitcast(MVT::i32, Hi);
29479 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
29480 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
29481 *DAG.getContext(), MVT::i32);
29482 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
29483 }
29484 if (MinMaxOp == X86ISD::FMAX) {
29485 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29486 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29487 } else {
29488 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29489 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29490 }
29491 }
29492
29493 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
29494 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
29495
29496   // If we did not reorder the operands for signed-zero handling, but we still
29497   // need to handle NaN, and we know that one of the operands is not NaN, then:
29498   //   - For minimum/maximum, put it in the first operand,
29499   //   - For minimumnum/maximumnum, put it in the second operand,
29500   // and we will not need to post-process NaN after the max/min.
29501 if (IgnoreSignedZero && !IgnoreNaN &&
29502 DAG.isKnownNeverNaN(IsNum ? NewX : NewY))
29503 std::swap(NewX, NewY);
29504
29505 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29506
29507 if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
29508 return MinMax;
29509
29510 if (DAG.isKnownNeverNaN(NewX))
29511 NewX = NewY;
29512
29513 SDValue IsNaN =
29514 DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
29515
29516 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
29517}
29518
29519static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
29520 SelectionDAG &DAG) {
29521 MVT VT = Op.getSimpleValueType();
29522 SDLoc dl(Op);
29523
29524 // For AVX1 cases, split to use legal ops.
29525 if (VT.is256BitVector() && !Subtarget.hasInt256())
29526 return splitVectorIntBinary(Op, DAG, dl);
29527
29528 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
29529 return splitVectorIntBinary(Op, DAG, dl);
29530
29531 bool IsSigned = Op.getOpcode() == ISD::ABDS;
29532 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29533
29534 if (Subtarget.canUseCMOV() && VT.isScalarInteger()) {
29535 X86::CondCode CC = IsSigned ? X86::COND_L : X86::COND_B;
29536 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29537
29538 // abds(lhs, rhs) -> select(slt(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29539 // abdu(lhs, rhs) -> select(ult(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29540 if (VT.bitsGE(MVT::i32)) {
29541 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29542 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
29543 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
29544 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, VTs, LHS, RHS);
29545 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, VTs, RHS, LHS);
29546 return DAG.getNode(X86ISD::CMOV, dl, VT, Diff1, Diff0,
29547 DAG.getTargetConstant(CC, dl, MVT::i8),
29548 Diff1.getValue(1));
29549 }
29550
29551 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
29552 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
29553 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
29554 MVT WideVT = MVT::getIntegerVT(WideBits);
29555 if (TLI.isTypeLegal(WideVT)) {
29556 SDVTList WideVTs = DAG.getVTList(WideVT, MVT::i32);
29557 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
29558 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
29559 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, WideVTs, LHS, RHS);
29560 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, WideVTs, RHS, LHS);
29561 SDValue AbsDiff = DAG.getNode(X86ISD::CMOV, dl, WideVT, Diff1, Diff0,
29562 DAG.getTargetConstant(CC, dl, MVT::i8),
29563 Diff1.getValue(1));
29564 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
29565 }
29566 }
29567
29568 // Default to expand.
29569 return SDValue();
29570}
29571
29572static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
29573 SelectionDAG &DAG) {
29574 SDLoc dl(Op);
29575 MVT VT = Op.getSimpleValueType();
29576
29577 // Decompose 256-bit ops into 128-bit ops.
29578 if (VT.is256BitVector() && !Subtarget.hasInt256())
29579 return splitVectorIntBinary(Op, DAG, dl);
29580
29581 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29582 return splitVectorIntBinary(Op, DAG, dl);
29583
29584 SDValue A = Op.getOperand(0);
29585 SDValue B = Op.getOperand(1);
29586
29587 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29588 // vector pairs, multiply and truncate.
29589 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
29590 unsigned NumElts = VT.getVectorNumElements();
29591 unsigned NumLanes = VT.getSizeInBits() / 128;
29592 unsigned NumEltsPerLane = NumElts / NumLanes;
29593
29594 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29595 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29596 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29597 return DAG.getNode(
29598 ISD::TRUNCATE, dl, VT,
29599 DAG.getNode(ISD::MUL, dl, ExVT,
29600 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
29601 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
29602 }
29603
29604 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29605
29606 // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
29607 // Don't do this if we only need to unpack one half.
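    // VPMADDUBSW multiplies unsigned bytes from the first operand by signed
    // bytes from the second and adds adjacent byte products into each i16
    // lane. By zeroing every other byte of B (BLo keeps the low byte of each
    // word, BHi the high byte), each i16 lane receives exactly one product,
    // and its low 8 bits equal the wrapped i8 product regardless of
    // signedness.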
29608 if (Subtarget.hasSSSE3()) {
29609 bool BIsBuildVector = isa<BuildVectorSDNode>(B);
29610 bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
29611 bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
29612 if (BIsBuildVector) {
29613 for (auto [Idx, Val] : enumerate(B->ops())) {
29614 if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
29615 IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29616 else
29617 IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29618 }
29619 }
29620 if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
29621 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
29622 SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
29623 SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
29624 SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
29625 SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
29626 RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
29627 RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
29628 DAG.getTargetConstant(8, dl, MVT::i8));
29629 return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
29630 }
29631 }
29632
29633     // Extract the lo/hi parts and any-extend them to i16.
29634 // We're going to mask off the low byte of each result element of the
29635 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29636 // element.
29637 SDValue Undef = DAG.getUNDEF(VT);
29638 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
29639 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
29640
29641 SDValue BLo, BHi;
29642 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29643 // If the RHS is a constant, manually unpackl/unpackh.
29644 SmallVector<SDValue, 16> LoOps, HiOps;
29645 for (unsigned i = 0; i != NumElts; i += 16) {
29646 for (unsigned j = 0; j != 8; ++j) {
29647 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
29648 MVT::i16));
29649 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
29650 MVT::i16));
29651 }
29652 }
29653
29654 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29655 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29656 } else {
29657 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
29658 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
29659 }
29660
29661 // Multiply, mask the lower 8bits of the lo/hi results and pack.
29662 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
29663 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
29664 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29665 }
29666
29667 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
29668 if (VT == MVT::v4i32) {
29669 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
29670 "Should not custom lower when pmulld is available!");
29671
29672 // Extract the odd parts.
29673 static const int UnpackMask[] = {1, 1, 3, 3};
29674 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
29675 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
29676
29677 // Multiply the even parts.
29678 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29679 DAG.getBitcast(MVT::v2i64, A),
29680 DAG.getBitcast(MVT::v2i64, B));
29681 // Now multiply odd parts.
29682 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29683 DAG.getBitcast(MVT::v2i64, Aodds),
29684 DAG.getBitcast(MVT::v2i64, Bodds));
29685
29686 Evens = DAG.getBitcast(VT, Evens);
29687 Odds = DAG.getBitcast(VT, Odds);
29688
29689 // Merge the two vectors back together with a shuffle. This expands into 2
29690 // shuffles.
29691 static const int ShufMask[] = { 0, 4, 2, 6 };
29692 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
29693 }
29694
29695 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
29696 "Only know how to lower V2I64/V4I64/V8I64 multiply");
29697 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
29698
29699 // Ahi = psrlqi(a, 32);
29700 // Bhi = psrlqi(b, 32);
29701 //
29702 // AloBlo = pmuludq(a, b);
29703 // AloBhi = pmuludq(a, Bhi);
29704 // AhiBlo = pmuludq(Ahi, b);
29705 //
29706 // Hi = psllqi(AloBhi + AhiBlo, 32);
29707 // return AloBlo + Hi;
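  //
  // This is schoolbook multiplication from 32-bit halves:
  //   (Alo + 2^32*Ahi) * (Blo + 2^32*Bhi)
  //     = Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo) + 2^64*Ahi*Bhi,
  // and the Ahi*Bhi term only affects bits >= 64, so it is dropped mod 2^64.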
29708 KnownBits AKnown = DAG.computeKnownBits(A);
29709 KnownBits BKnown = DAG.computeKnownBits(B);
29710
29711 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
29712 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
29713 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
29714
29715 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
29716 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
29717 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
29718
29719 SDValue Zero = DAG.getConstant(0, dl, VT);
29720
29721 // Only multiply lo/hi halves that aren't known to be zero.
29722 SDValue AloBlo = Zero;
29723 if (!ALoIsZero && !BLoIsZero)
29724 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
29725
29726 SDValue AloBhi = Zero;
29727 if (!ALoIsZero && !BHiIsZero) {
29728 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
29729 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
29730 }
29731
29732 SDValue AhiBlo = Zero;
29733 if (!AHiIsZero && !BLoIsZero) {
29734 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
29735 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
29736 }
29737
29738 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
29739 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
29740
29741 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
29742}
29743
29744 static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
29745                                      MVT VT, bool IsSigned,
29746 const X86Subtarget &Subtarget,
29747 SelectionDAG &DAG,
29748 SDValue *Low = nullptr) {
29749 unsigned NumElts = VT.getVectorNumElements();
29750
29751 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
29752 // to a vXi16 type. Do the multiplies, shift the results and pack the half
29753 // lane results back together.
29754
29755 // We'll take different approaches for signed and unsigned.
29756   // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
29757   // and use pmullw to calculate the full 16-bit product.
29758   // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
29759 // shift them left into the upper byte of each word. This allows us to use
29760 // pmulhw to calculate the full 16-bit product. This trick means we don't
29761 // need to sign extend the bytes to use pmullw.
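  // This works because (A << 8) * (B << 8) = (A * B) << 16, so pmulhw of the
  // byte-in-high-half words returns exactly the signed 16-bit product A * B
  // (which always fits, since |A * B| <= 2^14).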
29762
29763 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29764 SDValue Zero = DAG.getConstant(0, dl, VT);
29765
29766 SDValue ALo, AHi;
29767 if (IsSigned) {
29768 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29769 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29770 } else {
29771 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29772 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29773 }
29774
29775 SDValue BLo, BHi;
29776 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29777 // If the RHS is a constant, manually unpackl/unpackh and extend.
29778 SmallVector<SDValue, 16> LoOps, HiOps;
29779 for (unsigned i = 0; i != NumElts; i += 16) {
29780 for (unsigned j = 0; j != 8; ++j) {
29781 SDValue LoOp = B.getOperand(i + j);
29782 SDValue HiOp = B.getOperand(i + j + 8);
29783
29784 if (IsSigned) {
29785 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
29786 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
29787 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
29788 DAG.getConstant(8, dl, MVT::i16));
29789 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
29790 DAG.getConstant(8, dl, MVT::i16));
29791 } else {
29792 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
29793 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
29794 }
29795
29796 LoOps.push_back(LoOp);
29797 HiOps.push_back(HiOp);
29798 }
29799 }
29800
29801 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29802 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29803 } else if (IsSigned) {
29804 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29805 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
29806 } else {
29807 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29808 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
29809 }
29810
29811 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
29812 // pack back to vXi8.
29813 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
29814 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
29815 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
29816
29817 if (Low)
29818 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29819
29820 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
29821}
29822
29823static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
29824 SelectionDAG &DAG) {
29825 SDLoc dl(Op);
29826 MVT VT = Op.getSimpleValueType();
29827 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29828 unsigned NumElts = VT.getVectorNumElements();
29829 SDValue A = Op.getOperand(0);
29830 SDValue B = Op.getOperand(1);
29831
29832 // Decompose 256-bit ops into 128-bit ops.
29833 if (VT.is256BitVector() && !Subtarget.hasInt256())
29834 return splitVectorIntBinary(Op, DAG, dl);
29835
29836 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29837 return splitVectorIntBinary(Op, DAG, dl);
29838
29839 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
29840 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
29841 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
29842 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
29843
29844 // PMULxD operations multiply each even value (starting at 0) of LHS with
29845     // the related value of RHS and produce a widened result.
29846 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29847 // => <2 x i64> <ae|cg>
29848 //
29849     // In other words, to have all the results, we need to perform two PMULxD:
29850     // 1. one with the even values.
29851     // 2. one with the odd values.
29852     // To achieve #2, we need to place the odd values at an even position.
29853 //
29854 // Place the odd value at an even position (basically, shift all values 1
29855 // step to the left):
29856 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29857 9, -1, 11, -1, 13, -1, 15, -1};
29858 // <a|b|c|d> => <b|undef|d|undef>
29859 SDValue Odd0 =
29860 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
29861 // <e|f|g|h> => <f|undef|h|undef>
29862 SDValue Odd1 =
29863 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
29864
29865 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
29866 // ints.
29867 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
29868 unsigned Opcode =
29869 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
29870 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29871 // => <2 x i64> <ae|cg>
29872 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29873 DAG.getBitcast(MulVT, A),
29874 DAG.getBitcast(MulVT, B)));
29875 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
29876 // => <2 x i64> <bf|dh>
29877 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29878 DAG.getBitcast(MulVT, Odd0),
29879 DAG.getBitcast(MulVT, Odd1)));
29880
29881 // Shuffle it back into the right order.
29882 SmallVector<int, 16> ShufMask(NumElts);
29883 for (int i = 0; i != (int)NumElts; ++i)
29884 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
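    // For v4i32 this produces the mask {1, 5, 3, 7}, i.e. the high 32-bit
    // halves <hi(ae), hi(bf), hi(cg), hi(dh)> of the four 64-bit products.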
29885
29886 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
29887
29888 // If we have a signed multiply but no PMULDQ fix up the result of an
29889 // unsigned multiply.
29890 if (IsSigned && !Subtarget.hasSSE41()) {
29891 SDValue Zero = DAG.getConstant(0, dl, VT);
29892 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
29893 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
29894 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
29895 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
29896
29897 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
29898 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
29899 }
29900
29901 return Res;
29902 }
29903
29904 // Only i8 vectors should need custom lowering after this.
29905 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29906 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29907 "Unsupported vector type");
29908
29909 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
29910 // logical shift down the upper half and pack back to i8.
29911
29912 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29913 // and then ashr/lshr the upper bits down to the lower bits before multiply.
29914
29915 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29916 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29917 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29918 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29919 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29920 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29921 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29922 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29923 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29924 }
29925
29926 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
29927}
29928
29929// Custom lowering for SMULO/UMULO.
29930static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
29931 SelectionDAG &DAG) {
29932 MVT VT = Op.getSimpleValueType();
29933
29934 // Scalars defer to LowerXALUO.
29935 if (!VT.isVector())
29936 return LowerXALUO(Op, DAG);
29937
29938 SDLoc dl(Op);
29939 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29940 SDValue A = Op.getOperand(0);
29941 SDValue B = Op.getOperand(1);
29942 EVT OvfVT = Op->getValueType(1);
29943
29944 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
29945 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
29946 // Extract the LHS Lo/Hi vectors
29947 SDValue LHSLo, LHSHi;
29948 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
29949
29950 // Extract the RHS Lo/Hi vectors
29951 SDValue RHSLo, RHSHi;
29952 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
29953
29954 EVT LoOvfVT, HiOvfVT;
29955 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
29956 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
29957 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
29958
29959 // Issue the split operations.
29960 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
29961 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
29962
29963 // Join the separate data results and the overflow results.
29964 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29965 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
29966 Hi.getValue(1));
29967
29968 return DAG.getMergeValues({Res, Ovf}, dl);
29969 }
29970
29971 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29972 EVT SetccVT =
29973 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29974
29975 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29976 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29977 unsigned NumElts = VT.getVectorNumElements();
29978 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29979 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29980 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29981 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29982 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29983
29984 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29985
29986 SDValue Ovf;
29987 if (IsSigned) {
29988 SDValue High, LowSign;
29989 if (OvfVT.getVectorElementType() == MVT::i1 &&
29990 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29991         // Rather than truncating, try to do the compare on vXi16 or vXi32.
29992 // Shift the high down filling with sign bits.
29993 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29994 // Fill all 16 bits with the sign bit from the low.
29995 LowSign =
29996 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29997 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29998 15, DAG);
29999 SetccVT = OvfVT;
30000 if (!Subtarget.hasBWI()) {
30001 // We can't do a vXi16 compare so sign extend to v16i32.
30002 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
30003 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
30004 }
30005 } else {
30006 // Otherwise do the compare at vXi8.
30007 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30008 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30009 LowSign =
30010 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30011 }
30012
30013 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30014 } else {
30015 SDValue High =
30016 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30017 if (OvfVT.getVectorElementType() == MVT::i1 &&
30018 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30019         // Rather than truncating, try to do the compare on vXi16 or vXi32.
30020 SetccVT = OvfVT;
30021 if (!Subtarget.hasBWI()) {
30022 // We can't do a vXi16 compare so sign extend to v16i32.
30023 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30024 }
30025 } else {
30026 // Otherwise do the compare at vXi8.
30027 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30028 }
30029
30030 Ovf =
30031 DAG.getSetCC(dl, SetccVT, High,
30032 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30033 }
30034
30035 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30036
30037 return DAG.getMergeValues({Low, Ovf}, dl);
30038 }
30039
30040 SDValue Low;
30041 SDValue High =
30042 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30043
30044 SDValue Ovf;
30045 if (IsSigned) {
30046 // SMULO overflows if the high bits don't match the sign of the low.
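    // For example, for i8: 64 * 2 = 0x0080, the high byte is 0x00 but the
    // sign-extension of the low byte 0x80 is 0xFF, so the multiply overflows.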
30047 SDValue LowSign =
30048 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30049 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30050 } else {
30051 // UMULO overflows if the high bits are non-zero.
30052 Ovf =
30053 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30054 }
30055
30056 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30057
30058 return DAG.getMergeValues({Low, Ovf}, dl);
30059}
30060
30061SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30062 assert(Subtarget.isTargetWin64() && "Unexpected target");
30063 EVT VT = Op.getValueType();
30064 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30065 "Unexpected return type for lowering");
30066
30067 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30068 SmallVector<SDValue> Result;
30069 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30070 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30071 }
30072
30073 RTLIB::Libcall LC;
30074 bool isSigned;
30075 switch (Op->getOpcode()) {
30076 // clang-format off
30077 default: llvm_unreachable("Unexpected request for libcall!");
30078 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30079 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30080 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30081 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30082 // clang-format on
30083 }
30084
30085 SDLoc dl(Op);
30086 SDValue InChain = DAG.getEntryNode();
30087
30088 TargetLowering::ArgListTy Args;
30089 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30090 EVT ArgVT = Op->getOperand(i).getValueType();
30091 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30092 "Unexpected argument type for lowering");
30093 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30094 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30095 MachinePointerInfo MPI =
30096 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30097 InChain =
30098 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30099 Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0));
30100 }
30101
30102 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30103 getPointerTy(DAG.getDataLayout()));
30104
30105 TargetLowering::CallLoweringInfo CLI(DAG);
30106 CLI.setDebugLoc(dl)
30107 .setChain(InChain)
30108 .setLibCallee(
30109 getLibcallCallingConv(LC),
30110 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30111 std::move(Args))
30112 .setInRegister()
30113 .setSExtResult(isSigned)
30114 .setZExtResult(!isSigned);
30115
30116 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30117 return DAG.getBitcast(VT, CallInfo.first);
30118}
30119
30120SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30121 SelectionDAG &DAG,
30122 SDValue &Chain) const {
30123 assert(Subtarget.isTargetWin64() && "Unexpected target");
30124 EVT VT = Op.getValueType();
30125 bool IsStrict = Op->isStrictFPOpcode();
30126
30127 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30128 EVT ArgVT = Arg.getValueType();
30129
30130 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30131 "Unexpected return type for lowering");
30132
30133 RTLIB::Libcall LC;
30134 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30135 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30136 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30137 else
30138 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30139 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30140
30141 SDLoc dl(Op);
30142 MakeLibCallOptions CallOptions;
30143 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30144
30146 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
30147 // expected VT (i128).
30148 std::tie(Result, Chain) =
30149 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30150 Result = DAG.getBitcast(VT, Result);
30151 return Result;
30152}
30153
30154SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30155 SelectionDAG &DAG) const {
30156 assert(Subtarget.isTargetWin64() && "Unexpected target");
30157 EVT VT = Op.getValueType();
30158 bool IsStrict = Op->isStrictFPOpcode();
30159
30160 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30161 EVT ArgVT = Arg.getValueType();
30162
30163 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30164 "Unexpected argument type for lowering");
30165
30166 RTLIB::Libcall LC;
30167 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30168 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30169 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30170 else
30171 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30172 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30173
30174 SDLoc dl(Op);
30175 MakeLibCallOptions CallOptions;
30176 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30177
30178 // Pass the i128 argument as an indirect argument on the stack.
30179 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30180 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30181 MachinePointerInfo MPI =
30182 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30183 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30184
30185 SDValue Result;
30186 std::tie(Result, Chain) =
30187 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30188 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30189}
30190
30191// Return true if the required (according to Opcode) shift-imm form is natively
30192// supported by the Subtarget
30193static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
30194 unsigned Opcode) {
30195 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30196 "Unexpected shift opcode");
30197
30198 if (!VT.isSimple())
30199 return false;
30200
30201 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30202 return false;
30203
30204 if (VT.getScalarSizeInBits() < 16)
30205 return false;
30206
30207 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30208 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30209 return true;
30210
30211 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30212 (VT.is256BitVector() && Subtarget.hasInt256());
30213
30214 bool AShift = LShift && (Subtarget.hasAVX512() ||
30215 (VT != MVT::v2i64 && VT != MVT::v4i64));
30216 return (Opcode == ISD::SRA) ? AShift : LShift;
30217}
30218
30219// The shift amount is a variable, but it is the same for all vector lanes.
30220// These instructions are defined together with shift-immediate.
30221static
30222 bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
30223 unsigned Opcode) {
30224 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30225}
30226
30227// Return true if the required (according to Opcode) variable-shift form is
30228// natively supported by the Subtarget
30229static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
30230 unsigned Opcode) {
30231 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30232 "Unexpected shift opcode");
30233
30234 if (!VT.isSimple())
30235 return false;
30236
30237 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30238 return false;
30239
30240 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30241 return false;
30242
30243 // vXi16 supported only on AVX-512, BWI
30244 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30245 return false;
30246
30247 if (Subtarget.hasAVX512() &&
30248 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30249 return true;
30250
30251 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30252 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30253 return (Opcode == ISD::SRA) ? AShift : LShift;
30254}
30255
30256 static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30257 const X86Subtarget &Subtarget) {
30258 MVT VT = Op.getSimpleValueType();
30259 SDLoc dl(Op);
30260 SDValue R = Op.getOperand(0);
30261 SDValue Amt = Op.getOperand(1);
30262 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30263 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30264
30265 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30266 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30267 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30268 SDValue Ex = DAG.getBitcast(ExVT, R);
30269
30270 // ashr(R, 63) === cmp_slt(R, 0)
30271 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30272 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30273 "Unsupported PCMPGT op");
30274 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30275 }
30276
30277 if (ShiftAmt >= 32) {
30278 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
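// For ShiftAmt >= 32 every result bit comes from the original upper i32,
// so each i64 lane becomes {sra(hi32, ShiftAmt - 32), sra(hi32, 31)}
// (low half, high half), e.g. sra(x, 40) == {sra(hi32, 8), sign of hi32}.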
30279 SDValue Upper =
30280 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30281 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30282 ShiftAmt - 32, DAG);
30283 if (VT == MVT::v2i64)
30284 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30285 if (VT == MVT::v4i64)
30286 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30287 {9, 1, 11, 3, 13, 5, 15, 7});
30288 } else {
30289 // SRA upper i32, SRL whole i64 and select lower i32.
30290 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30291 ShiftAmt, DAG);
30292 SDValue Lower =
30293 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30294 Lower = DAG.getBitcast(ExVT, Lower);
30295 if (VT == MVT::v2i64)
30296 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30297 if (VT == MVT::v4i64)
30298 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30299 {8, 1, 10, 3, 12, 5, 14, 7});
30300 }
30301 return DAG.getBitcast(VT, Ex);
30302 };
30303
30304 // Optimize shl/srl/sra with constant shift amount.
30305 APInt APIntShiftAmt;
30306 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30307 return SDValue();
30308
30309 // If the shift amount is out of range, return undef.
30310 if (APIntShiftAmt.uge(EltSizeInBits))
30311 return DAG.getUNDEF(VT);
30312
30313 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30314
30315 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
30316 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30317
30318 // i64 SRA needs to be performed as partial shifts.
30319 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30320 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30321 Op.getOpcode() == ISD::SRA)
30322 return ArithmeticShiftRight64(ShiftAmt);
30323
30326 // If we're logical shifting an all-signbits value then we can just perform
30327 // it as a mask.
30326 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
30327 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
30328 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
30329 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
30330 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
30331 }
30332
30333 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30334 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30335 unsigned NumElts = VT.getVectorNumElements();
30336 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30337
30338 // Simple i8 add case
30339 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30340 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30341 // must be 0). (add undef, undef) however can be any value. To make this
30342 // safe, we must freeze R to ensure that register allocation uses the same
30343 // register for an undefined value. This ensures that the result will
30344 // still be even and preserves the original semantics.
30345 R = DAG.getFreeze(R);
30346 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30347 }
30348
30349 // ashr(R, 7) === cmp_slt(R, 0)
30350 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30351 SDValue Zeros = DAG.getConstant(0, dl, VT);
30352 if (VT.is512BitVector()) {
30353 assert(VT == MVT::v64i8 && "Unexpected element type!");
30354 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30355 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30356 }
30357 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30358 }
30359
30360 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30361 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30362 return SDValue();
30363
30364 if (Subtarget.hasGFNI()) {
30365 SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt);
30366 return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
30367 DAG.getTargetConstant(0, dl, MVT::i8));
30368 }
30369
30370 if (Op.getOpcode() == ISD::SHL) {
30371 // Make a large shift.
30372 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30373 ShiftAmt, DAG);
30374 SHL = DAG.getBitcast(VT, SHL);
30375 // Zero out the rightmost bits.
30376 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30377 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30378 }
30379 if (Op.getOpcode() == ISD::SRL) {
30380 // Make a large shift.
30381 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30382 ShiftAmt, DAG);
30383 SRL = DAG.getBitcast(VT, SRL);
30384 // Zero out the leftmost bits.
30385 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30386 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30387 }
30388 if (Op.getOpcode() == ISD::SRA) {
30389 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
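// e.g. R = 0xF0 (-16), ShiftAmt = 4: lshr gives 0x0F, Mask = 0x08, and
// (0x0F ^ 0x08) - 0x08 = 0xFF = -1 == ashr(-16, 4).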
30390 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30391
30392 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30393 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30394 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30395 return Res;
30396 }
30397 llvm_unreachable("Unknown shift opcode.");
30398 }
30399
30400 return SDValue();
30401}
30402
30403 static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
30404 const X86Subtarget &Subtarget) {
30405 MVT VT = Op.getSimpleValueType();
30406 SDLoc dl(Op);
30407 SDValue R = Op.getOperand(0);
30408 SDValue Amt = Op.getOperand(1);
30409 unsigned Opcode = Op.getOpcode();
30410 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
30411
30412 int BaseShAmtIdx = -1;
30413 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
30414 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
30415 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
30416 Subtarget, DAG);
30417
30418 // vXi8 shifts - shift as v8i16 + mask result.
30419 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30420 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30421 VT == MVT::v64i8) &&
30422 !Subtarget.hasXOP()) {
30423 unsigned NumElts = VT.getVectorNumElements();
30424 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30425 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
30426 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30427 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30428
30429 // Create the mask using vXi16 shifts. For shift-rights we need to move
30430 // the upper byte down before splatting the vXi8 mask.
30431 SDValue BitMask = DAG.getAllOnesConstant(dl, ExtVT);
30432 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30433 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30434 if (Opcode != ISD::SHL)
30435 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30436 8, DAG);
30437 BitMask = DAG.getBitcast(VT, BitMask);
30438 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30439 SmallVector<int, 64>(NumElts, 0));
30440
30441 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30442 DAG.getBitcast(ExtVT, R), BaseShAmt,
30443 BaseShAmtIdx, Subtarget, DAG);
30444 Res = DAG.getBitcast(VT, Res);
30445 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30446
30447 if (Opcode == ISD::SRA) {
30448 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30449 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30450 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30451 SignMask =
30452 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30453 BaseShAmtIdx, Subtarget, DAG);
30454 SignMask = DAG.getBitcast(VT, SignMask);
30455 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30456 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30457 }
30458 return Res;
30459 }
30460 }
30461 }
30462
30463 return SDValue();
30464}
30465
30466// Convert a shift/rotate left amount to a multiplication scale factor.
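// i.e. shl x, k == mul x, (1 << k), so a vector of amounts <k0, k1, ...>
// becomes the constant vector <1 << k0, 1 << k1, ...>.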
30467 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
30468 const X86Subtarget &Subtarget,
30469 SelectionDAG &DAG) {
30470 MVT VT = Amt.getSimpleValueType();
30471 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30472 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30473 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30474 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30475 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30476 (Subtarget.hasBWI() && VT == MVT::v64i8)))
30477 return SDValue();
30478
30479 MVT SVT = VT.getVectorElementType();
30480 unsigned SVTBits = SVT.getSizeInBits();
30481 unsigned NumElems = VT.getVectorNumElements();
30482
30483 APInt UndefElts;
30484 SmallVector<APInt> EltBits;
30485 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30486 APInt One(SVTBits, 1);
30487 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30488 for (unsigned I = 0; I != NumElems; ++I) {
30489 if (UndefElts[I] || EltBits[I].uge(SVTBits))
30490 continue;
30491 uint64_t ShAmt = EltBits[I].getZExtValue();
30492 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30493 }
30494 return DAG.getBuildVector(VT, dl, Elts);
30495 }
30496
30497 // If the target doesn't support variable shifts, use either FP conversion
30498 // or integer multiplication to avoid shifting each element individually.
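// For v4i32 the code below writes each amount into the exponent field of
// 1.0f: (Amt << 23) + 0x3f800000 is the IEEE-754 encoding of 2^Amt, so
// FP_TO_SINT recovers the integer scale, e.g. Amt = 5 gives 0x42000000 =
// 32.0f -> 32 = 1 << 5.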
30499 if (VT == MVT::v4i32) {
30500 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30501 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30502 DAG.getConstant(0x3f800000U, dl, VT));
30503 Amt = DAG.getBitcast(MVT::v4f32, Amt);
30504 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30505 }
30506
30507 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
30508 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30509 SDValue Z = DAG.getConstant(0, dl, VT);
30510 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30511 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30512 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30513 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30514 if (Subtarget.hasSSE41())
30515 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30516 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30517 }
30518
30519 return SDValue();
30520}
30521
30522static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30523 SelectionDAG &DAG) {
30524 MVT VT = Op.getSimpleValueType();
30525 SDLoc dl(Op);
30526 SDValue R = Op.getOperand(0);
30527 SDValue Amt = Op.getOperand(1);
30528 unsigned NumElts = VT.getVectorNumElements();
30529 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30530 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30531
30532 unsigned Opc = Op.getOpcode();
30533 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30534 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30535
30536 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30537 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30538
30539 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30540 return V;
30541
30542 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30543 return V;
30544
30545 if (supportedVectorVarShift(VT, Subtarget, Opc))
30546 return Op;
30547
30548 // i64 vector arithmetic shift can be emulated with the transform:
30549 // M = lshr(SIGN_MASK, Amt)
30550 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
30551 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30552 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30553 Opc == ISD::SRA) {
30554 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30555 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30556 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30557 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30558 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30559 return R;
30560 }
30561
30562 // XOP has 128-bit variable logical/arithmetic shifts.
30563 // +ve/-ve Amt = shift left/right.
30564 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30565 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30566 if (Opc == ISD::SRL || Opc == ISD::SRA)
30567 Amt = DAG.getNegative(Amt, dl, VT);
30568 if (Opc == ISD::SHL || Opc == ISD::SRL)
30569 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30570 if (Opc == ISD::SRA)
30571 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30572 }
30573
30574 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
30575 // shifts per-lane and then shuffle the partial results back together.
30576 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30577 // Splat the shift amounts so the scalar shifts above will catch it.
30578 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30579 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30580 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30581 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30582 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30583 }
30584
30585 // Build a map of in-range constant amounts with an element mask of where they occur.
30586 SmallDenseMap<unsigned, APInt, 8> UniqueCstAmt;
30587 if (ConstantAmt) {
30588 for (unsigned I = 0; I != NumElts; ++I) {
30589 SDValue A = Amt.getOperand(I);
30590 if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
30591 continue;
30592 unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
30593 auto [It, Inserted] = UniqueCstAmt.try_emplace(CstAmt);
30594 if (!Inserted) {
30595 It->second.setBit(I);
30596 continue;
30597 }
30598 It->second = APInt::getOneBitSet(NumElts, I);
30599 }
30600 assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
30601 }
30602
30603 // If possible, lower this shift as a sequence of two shifts by
30604 // constant plus a BLENDing shuffle instead of scalarizing it.
30605 // Example:
30606 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30607 //
30608 // Could be rewritten as:
30609 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30610 //
30611 // The advantage is that the two shifts from the example would be
30612 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30613 if (UniqueCstAmt.size() == 2 &&
30614 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30615 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30616 unsigned AmtA = UniqueCstAmt.begin()->first;
30617 unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
30618 const APInt &MaskA = UniqueCstAmt.begin()->second;
30619 const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
30620 SmallVector<int, 8> ShuffleMask(NumElts, SM_SentinelUndef);
30621 for (unsigned I = 0; I != NumElts; ++I) {
30622 if (MaskA[I])
30623 ShuffleMask[I] = I;
30624 if (MaskB[I])
30625 ShuffleMask[I] = I + NumElts;
30626 }
30627
30628 // Only perform this blend if we can perform it without loading a mask.
30629 if ((VT != MVT::v16i16 ||
30630 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30631 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30632 canWidenShuffleElements(ShuffleMask))) {
30633 SDValue Shift1 =
30634 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtA, dl, VT));
30635 SDValue Shift2 =
30636 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtB, dl, VT));
30637 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30638 }
30639 }
30640
30641 // Constant ISD::SRA/SRL/SHL can be performed efficiently on vXiN vectors by
30642 // using vYiM vector operations where X*N == Y*M and M > N.
30643 if (ConstantAmt &&
30644 (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30645 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16) &&
30646 !Subtarget.hasXOP()) {
30647 MVT NarrowScalarVT = VT.getScalarType();
30648 // We can do this extra fast, SWAR style, if each pair of narrow elements is
30649 // shifted by the same amount: use a single wider shift to move the valid
30650 // bits into position, then mask out any bits which crossed from one
30651 // element to the other.
30652 // This optimized lowering is only valid if the elements in a pair can
30653 // be treated identically.
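// e.g. if every adjacent pair of v16i8 lanes uses the same amount, a srl
// by 3 becomes a v8i16 srl by 3 followed by an AND with a splat of 0x1f to
// clear the bits that crossed a byte boundary.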
30654 SmallVector<SDValue, 32> AmtWideElts(Amt->ops());
30655 SmallVector<SDValue, 32> TmpAmtWideElts;
30656 int WideEltSizeInBits = EltSizeInBits;
30657 while (WideEltSizeInBits < 32) {
30658 // AVX1 does not have psrlvd, etc. which makes interesting 32-bit shifts
30659 // unprofitable.
30660 if (WideEltSizeInBits >= 16 && !Subtarget.hasAVX2()) {
30661 break;
30662 }
30663 TmpAmtWideElts.resize(AmtWideElts.size() / 2);
30664 bool SameShifts = true;
30665 for (unsigned SrcI = 0, E = AmtWideElts.size(); SrcI != E; SrcI += 2) {
30666 unsigned DstI = SrcI / 2;
30667 // Both elements are undef? Make a note and keep going.
30668 if (AmtWideElts[SrcI].isUndef() && AmtWideElts[SrcI + 1].isUndef()) {
30669 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30670 continue;
30671 }
30672 // Even element is undef? We will shift it by the same shift amount as
30673 // the odd element.
30674 if (AmtWideElts[SrcI].isUndef()) {
30675 TmpAmtWideElts[DstI] = AmtWideElts[SrcI + 1];
30676 continue;
30677 }
30678 // Odd element is undef? We will shift it by the same shift amount as
30679 // the even element.
30680 if (AmtWideElts[SrcI + 1].isUndef()) {
30681 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30682 continue;
30683 }
30684 // Both elements are equal.
30685 if (AmtWideElts[SrcI].getNode()->getAsAPIntVal() ==
30686 AmtWideElts[SrcI + 1].getNode()->getAsAPIntVal()) {
30687 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30688 continue;
30689 }
30690 // One of the provisional wide elements will not have the same shift
30691 // amount. Let's bail.
30692 SameShifts = false;
30693 break;
30694 }
30695 if (!SameShifts) {
30696 break;
30697 }
30698 WideEltSizeInBits *= 2;
30699 std::swap(TmpAmtWideElts, AmtWideElts);
30700 }
30701 APInt APIntShiftAmt;
30702 bool IsConstantSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30703 bool Profitable = WidenShift;
30704 // AVX512BW brings support for vpsllvw.
30705 if (WideEltSizeInBits * AmtWideElts.size() >= 512 &&
30706 WideEltSizeInBits < 32 && !Subtarget.hasBWI()) {
30707 Profitable = false;
30708 }
30709 // Leave AVX512 uniform arithmetic shifts alone; they can be implemented
30710 // fairly cheaply in other ways.
30711 if (WideEltSizeInBits * AmtWideElts.size() >= 512 && IsConstantSplat) {
30712 Profitable = false;
30713 }
30714 // Leave it up to GFNI if we have it around.
30715 // TODO: gf2p8affine is usually higher latency and more port restricted. It
30716 // is probably a win to use other strategies in some cases.
30717 if (EltSizeInBits == 8 && Subtarget.hasGFNI()) {
30718 Profitable = false;
30719 }
30720
30721 // AVX1 does not have vpand, which makes our masking impractical. It does
30722 // have vandps, but that is an FP instruction, and crossing FP<->int
30723 // typically has some cost.
30724 if (WideEltSizeInBits * AmtWideElts.size() >= 256 &&
30725 (WideEltSizeInBits < 32 || IsConstantSplat) && !Subtarget.hasAVX2()) {
30726 Profitable = false;
30727 }
30728 unsigned WideNumElts = AmtWideElts.size();
30729 // We are only dealing with identical pairs.
30730 if (Profitable && WideNumElts != NumElts) {
30731 MVT WideScalarVT = MVT::getIntegerVT(WideEltSizeInBits);
30732 MVT WideVT = MVT::getVectorVT(WideScalarVT, WideNumElts);
30733 // Cast the operand to vXiM.
30734 SDValue RWide = DAG.getBitcast(WideVT, R);
30735 // Create our new vector of shift amounts.
30736 SDValue AmtWide = DAG.getBuildVector(
30737 MVT::getVectorVT(NarrowScalarVT, WideNumElts), dl, AmtWideElts);
30738 AmtWide = DAG.getZExtOrTrunc(AmtWide, dl, WideVT);
30739 // Perform the actual shift.
30740 unsigned LogicalOpc = Opc == ISD::SRA ? (unsigned)ISD::SRL : Opc;
30741 SDValue ShiftedR = DAG.getNode(LogicalOpc, dl, WideVT, RWide, AmtWide);
30742 // Now we need to construct a mask which will "drop" bits that get
30743 // shifted past the LSB/MSB. For a logical shift left, it will look
30744 // like:
30745 // FullMask = (1 << EltSizeInBits) - 1
30746 // Mask = FullMask << Amt
30747 //
30748 // This masking ensures that bits cannot migrate from one narrow lane to
30749 // another. The construction of this mask will be constant folded.
30750 // The mask for a logical right shift is nearly identical, the only
30751 // difference is that the all ones mask is shifted right instead of left.
30752 SDValue SplatFullMask = DAG.getAllOnesConstant(dl, VT);
30753 SDValue Mask = DAG.getNode(LogicalOpc, dl, VT, SplatFullMask, Amt);
30754 Mask = DAG.getBitcast(WideVT, Mask);
30755 // Finally, we mask the shifted vector with the SWAR mask.
30756 SDValue Masked = DAG.getNode(ISD::AND, dl, WideVT, ShiftedR, Mask);
30757 Masked = DAG.getBitcast(VT, Masked);
30758 if (Opc != ISD::SRA) {
30759 // Logical shifts are complete at this point.
30760 return Masked;
30761 }
30762 // At this point, we have done a *logical* shift right. We now need to
30763 // sign extend the result so that we get behavior equivalent to an
30764 // arithmetic shift right. Post-shifting by AmtWide, our narrow elements
30765 // are `EltSizeInBits-AmtWide` bits wide.
30766 //
30767 // To convert our `EltSizeInBits-AmtWide` bit unsigned numbers to signed
30768 // numbers as wide as `EltSizeInBits`, we need to replicate the bit at
30769 // position `EltSizeInBits-AmtWide` into the MSBs of each narrow lane. We
30770 // can use the following trick to accomplish this:
30771 // SignBitMask = 1 << (EltSizeInBits-AmtWide-1)
30772 // (Masked ^ SignBitMask) - SignBitMask
30773 //
30774 // When the sign bit is already clear, this will compute:
30775 // Masked + SignBitMask - SignBitMask
30776 //
30777 // This is equal to Masked which is what we want: the sign bit was clear
30778 // so sign extending should be a no-op.
30779 //
30780 // When the sign bit is set, this will compute:
30781 // Masked - SignBitmask - SignBitMask
30782 //
30783 // This is equal to Masked - 2*SignBitMask which will correctly sign
30784 // extend our result.
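// e.g. an i8 lane 0x80 (-128) shifted right arithmetically by 3: the
// logical shift and mask give 0x10, SignBitMask = 0x80 >> 3 = 0x10, and
// (0x10 ^ 0x10) - 0x10 = 0xF0 = -16 == ashr(-128, 3).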
30785 SDValue SplatHighBit =
30786 DAG.getConstant(APInt::getSignMask(EltSizeInBits), dl, VT);
30787 // This does not induce recursion, all operands are constants.
30788 SDValue SignBitMask = DAG.getNode(LogicalOpc, dl, VT, SplatHighBit, Amt);
30789 SDValue FlippedSignBit =
30790 DAG.getNode(ISD::XOR, dl, VT, Masked, SignBitMask);
30791 SDValue Subtraction =
30792 DAG.getNode(ISD::SUB, dl, VT, FlippedSignBit, SignBitMask);
30793 return Subtraction;
30794 }
30795 }
30796
30797 // If possible, lower this packed shift into a vector multiply instead of
30798 // expanding it into a sequence of scalar shifts.
30799 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30800 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30801 Subtarget.canExtendTo512BW())))
30802 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30803 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30804
30805 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30806 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
30807 if (Opc == ISD::SRL && ConstantAmt &&
30808 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30809 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30810 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30811 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30812 SDValue Zero = DAG.getConstant(0, dl, VT);
30813 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30814 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30815 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30816 }
30817 }
30818
30819 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30820 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30821 // TODO: Special case handling for shift by 0/1, really we can afford either
30822 // of these cases in pre-SSE41/XOP/AVX512 but not both.
30823 if (Opc == ISD::SRA && ConstantAmt &&
30824 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30825 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30826 !Subtarget.hasAVX512()) ||
30827 DAG.isKnownNeverZero(Amt))) {
30828 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30829 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30830 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30831 SDValue Amt0 =
30832 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30833 SDValue Amt1 =
30834 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30835 SDValue Sra1 =
30836 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30837 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30838 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30839 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30840 }
30841 }
30842
30843 // v4i32 Non Uniform Shifts.
30844 // If the shift amount is constant we can shift each lane using the SSE2
30845 // immediate shifts, else we need to zero-extend each lane to the lower i64
30846 // and shift using the SSE2 variable shifts.
30847 // The separate results can then be blended together.
30848 if (VT == MVT::v4i32) {
30849 SDValue Amt0, Amt1, Amt2, Amt3;
30850 if (ConstantAmt) {
30851 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30852 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30853 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30854 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30855 } else {
30856 // The SSE2 shifts use the lower i64 as the same shift amount for
30857 // all lanes and the upper i64 is ignored. On AVX we're better off
30858 // just zero-extending, but for SSE just duplicating the top 16-bits is
30859 // cheaper and has the same effect for out of range values.
30860 if (Subtarget.hasAVX()) {
30861 SDValue Z = DAG.getConstant(0, dl, VT);
30862 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30863 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30864 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30865 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30866 } else {
30867 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30868 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30869 {4, 5, 6, 7, -1, -1, -1, -1});
30870 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30871 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30872 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30873 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30874 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30875 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30876 }
30877 }
30878
30879 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30880 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30881 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30882 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30883 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30884
30885 // Merge the shifted lane results optimally with/without PBLENDW.
30886 // TODO - ideally shuffle combining would handle this.
30887 if (Subtarget.hasSSE41()) {
30888 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30889 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30890 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30891 }
30892 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30893 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30894 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30895 }
30896
30897 // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30898 // look up the pre-computed shift values.
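// Per 128-bit lane the table holds the splat pre-shifted by 0..7 in bytes
// 0-7 and zeros in bytes 8-15, so each amount byte selects its pre-shifted
// value and out-of-range amounts land in the don't-care half.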
30899 if ((VT == MVT::v16i8 && Subtarget.hasSSSE3()) ||
30900 (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30901 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30902 unsigned NumLanes = VT.getSizeInBits() / 128u;
30903 unsigned NumEltsPerLane = NumElts / NumLanes;
30904 SmallVector<APInt, 64> LUT;
30905 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
30906 unsigned LoElt = Lane * NumEltsPerLane;
30907 APInt EltMask = APInt::getBitsSet(NumElts, LoElt, LoElt + NumEltsPerLane);
30908 KnownBits KnownLane = DAG.computeKnownBits(R, EltMask);
30909 if (!KnownLane.isConstant())
30910 break;
30911 const APInt &LaneSplat = KnownLane.getConstant();
30912 for (unsigned I = 0; I != 8; ++I) {
30913 if (Opc == ISD::SHL)
30914 LUT.push_back(LaneSplat.shl(I));
30915 else if (Opc == ISD::SRL)
30916 LUT.push_back(LaneSplat.lshr(I));
30917 else if (Opc == ISD::SRA)
30918 LUT.push_back(LaneSplat.ashr(I));
30919 }
30920 LUT.append(8, APInt::getZero(8));
30921 }
30922 if (LUT.size() == NumElts) {
30923 APInt Undefs = APInt::getSplat(NumElts, APInt(16, 0xFF00));
30924 SDValue Mask = getConstVector(LUT, Undefs, VT, DAG, dl);
30925 return DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
30926 }
30927 }
30928
30929 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30930 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30931 // make the existing SSE solution better.
30932 // NOTE: We honor the preferred vector width before promoting to 512-bits.
30933 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30934 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30935 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30936 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30937 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30938 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30939 "Unexpected vector type");
30940 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30941 MVT ExtVT = MVT::getVectorVT(EvtSVT, NumElts);
30942 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30943 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30944 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30945 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30946 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30947 }
30948
30949 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30950 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
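// e.g. srl x, 3: zero-extend x to i16, multiply by 1 << (8 - 3) = 32 and
// keep the high byte of the product, since (x * 32) >> 8 == x >> 3; for
// sra the input is sign-extended instead.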
30951 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30952 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30953 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30954 !Subtarget.hasXOP()) {
30955 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30956 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30957
30958 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30959 // isn't legal).
30960 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30961 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30962 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30963 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30964 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
30965 "Constant build vector expected");
30966
30967 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30968 bool IsSigned = Opc == ISD::SRA;
30969 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
30970 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30971 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30972 return DAG.getZExtOrTrunc(R, dl, VT);
30973 }
30974
30975 SmallVector<SDValue, 16> LoAmt, HiAmt;
30976 for (unsigned i = 0; i != NumElts; i += 16) {
30977 for (int j = 0; j != 8; ++j) {
30978 LoAmt.push_back(Amt.getOperand(i + j));
30979 HiAmt.push_back(Amt.getOperand(i + j + 8));
30980 }
30981 }
30982
30983 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30984 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
30985
30986 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
30987 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
30988 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
30989 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
30990 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
30991 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
30992 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
30993 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
30994 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
30995 }
30996
30997 if (VT == MVT::v16i8 ||
30998 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
30999 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
31000 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31001
31002 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31003 if (VT.is512BitVector()) {
31004 // On AVX512BW targets we make use of the fact that VSELECT lowers
31005 // to a masked blend which selects bytes based just on the sign bit
31006 // extracted to a mask.
31007 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31008 V0 = DAG.getBitcast(VT, V0);
31009 V1 = DAG.getBitcast(VT, V1);
31010 Sel = DAG.getBitcast(VT, Sel);
31011 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31012 ISD::SETGT);
31013 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31014 } else if (Subtarget.hasSSE41()) {
31015 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31016 // on the sign bit.
31017 V0 = DAG.getBitcast(VT, V0);
31018 V1 = DAG.getBitcast(VT, V1);
31019 Sel = DAG.getBitcast(VT, Sel);
31020 return DAG.getBitcast(SelVT,
31021 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31022 }
31023 // On pre-SSE41 targets we test for the sign bit by comparing to
31024 // zero - a negative value will set all bits of the lanes to true
31025 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31026 SDValue Z = DAG.getConstant(0, dl, SelVT);
31027 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31028 return DAG.getSelect(dl, SelVT, C, V0, V1);
31029 };
31030
31031 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31032 // We can safely do this using i16 shifts as we're only interested in
31033 // the 3 lower bits of each byte.
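// After the << 5, bit 2 of each amount sits in its byte's sign bit; each
// 'a += a' below moves the next lower bit up, so the three blend steps
// conditionally apply shifts of 4, 2 and 1 to compose any amount 0-7.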
31034 Amt = DAG.getBitcast(ExtVT, Amt);
31035 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31036 Amt = DAG.getBitcast(VT, Amt);
31037
31038 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31039 // r = VSELECT(r, shift(r, 4), a);
31040 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31041 R = SignBitSelect(VT, Amt, M, R);
31042
31043 // a += a
31044 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31045
31046 // r = VSELECT(r, shift(r, 2), a);
31047 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31048 R = SignBitSelect(VT, Amt, M, R);
31049
31050 // a += a
31051 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31052
31053 // return VSELECT(r, shift(r, 1), a);
31054 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31055 R = SignBitSelect(VT, Amt, M, R);
31056 return R;
31057 }
31058
31059 if (Opc == ISD::SRA) {
31060 // For SRA we need to unpack each byte to the higher byte of an i16 vector
31061 // so we can correctly sign extend. We don't care what happens to the
31062 // lower byte.
31063 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31064 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31065 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31066 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31067 ALo = DAG.getBitcast(ExtVT, ALo);
31068 AHi = DAG.getBitcast(ExtVT, AHi);
31069 RLo = DAG.getBitcast(ExtVT, RLo);
31070 RHi = DAG.getBitcast(ExtVT, RHi);
31071
31072 // r = VSELECT(r, shift(r, 4), a);
31073 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31074 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31075 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31076 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31077
31078 // a += a
31079 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31080 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31081
31082 // r = VSELECT(r, shift(r, 2), a);
31083 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31084 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31085 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31086 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31087
31088 // a += a
31089 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31090 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31091
31092 // r = VSELECT(r, shift(r, 1), a);
31093 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31094 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31095 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31096 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31097
31098 // Logical shift the result back to the lower byte, leaving a zero upper
31099 // byte meaning that we can safely pack with PACKUSWB.
31100 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31101 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31102 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31103 }
31104 }
31105
31106 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31107 MVT ExtVT = MVT::v8i32;
31108 SDValue Z = DAG.getConstant(0, dl, VT);
31109 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31110 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31111 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31112 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31113 ALo = DAG.getBitcast(ExtVT, ALo);
31114 AHi = DAG.getBitcast(ExtVT, AHi);
31115 RLo = DAG.getBitcast(ExtVT, RLo);
31116 RHi = DAG.getBitcast(ExtVT, RHi);
31117 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31118 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31119 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31120 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31121 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31122 }
31123
31124 if (VT == MVT::v8i16) {
31125 // If we have a constant shift amount, the non-SSE41 path is best as
31126 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
31127 bool UseSSE41 = Subtarget.hasSSE41() &&
31128 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31129
31130 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31131 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31132 // the sign bit.
31133 if (UseSSE41) {
31134 MVT ExtVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
31135 V0 = DAG.getBitcast(ExtVT, V0);
31136 V1 = DAG.getBitcast(ExtVT, V1);
31137 Sel = DAG.getBitcast(ExtVT, Sel);
31138 return DAG.getBitcast(
31139 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31140 }
31141 // On pre-SSE41 targets we splat the sign bit - a negative value will
31142 // set all bits of the lanes to true and VSELECT uses that in
31143 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31144 SDValue C =
31145 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31146 return DAG.getSelect(dl, VT, C, V0, V1);
31147 };
31148
31149 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31150 if (UseSSE41) {
31151 // On SSE41 targets we need to replicate the shift mask in both
31152 // bytes for PBLENDVB.
31153 Amt = DAG.getNode(
31154 ISD::OR, dl, VT,
31155 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31156 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31157 } else {
31158 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31159 }
31160
31161 // r = VSELECT(r, shift(r, 8), a);
31162 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31163 R = SignBitSelect(Amt, M, R);
31164
31165 // a += a
31166 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31167
31168 // r = VSELECT(r, shift(r, 4), a);
31169 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31170 R = SignBitSelect(Amt, M, R);
31171
31172 // a += a
31173 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31174
31175 // r = VSELECT(r, shift(r, 2), a);
31176 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31177 R = SignBitSelect(Amt, M, R);
31178
31179 // a += a
31180 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31181
31182 // return VSELECT(r, shift(r, 1), a);
31183 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31184 R = SignBitSelect(Amt, M, R);
31185 return R;
31186 }
31187
31188 // Decompose 256-bit shifts into 128-bit shifts.
31189 if (VT.is256BitVector())
31190 return splitVectorIntBinary(Op, DAG, dl);
31191
31192 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31193 return splitVectorIntBinary(Op, DAG, dl);
31194
31195 return SDValue();
31196}
31197
31198 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31199 SelectionDAG &DAG) {
31200 MVT VT = Op.getSimpleValueType();
31201 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31202 "Unexpected funnel shift opcode!");
31203
31204 SDLoc DL(Op);
31205 SDValue Op0 = Op.getOperand(0);
31206 SDValue Op1 = Op.getOperand(1);
31207 SDValue Amt = Op.getOperand(2);
31208 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31209 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31210
31211 if (VT.isVector()) {
31212 APInt APIntShiftAmt;
31213 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31214 unsigned NumElts = VT.getVectorNumElements();
31215
31216 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31217
31218 if (IsCstSplat) {
31219 if (IsFSHR)
31220 std::swap(Op0, Op1);
31221 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31222 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31223 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31224 {Op0, Op1, Imm}, DAG, Subtarget);
31225 }
31226 return getAVX512Node(IsFSHR ? ISD::FSHR : ISD::FSHL, DL, VT,
31227 {Op0, Op1, Amt}, DAG, Subtarget);
31228 }
31229 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31230 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31231 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31232 "Unexpected funnel shift type!");
31233
31234 // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
31235 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
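// e.g. for vXi8: unpack(y,x) forms an i16 lane with x in the high byte and
// y in the low byte; shifting that lane left by z and keeping the high
// byte yields (x << z) | (y >> (8 - z)) == fshl(x, y, z).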
31236 if (IsCstSplat) {
31237 // TODO: Can't use generic expansion as UNDEF amt elements can be
31238 // converted to other values when folded to shift amounts, losing the
31239 // splat.
31240 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31241 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
31242 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
31243 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
31244 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31245
31246 if (EltSizeInBits == 8 &&
31247 (Subtarget.hasXOP() ||
31248 (useVPTERNLOG(Subtarget, VT) &&
31249 supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
31250 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
31251 // bit-select - lower using vXi16 shifts and then perform the bitmask at
31252 // the original vector width to handle cases where we split.
31253 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
31254 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
31255 SDValue ShX =
31256 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
31257 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
31258 SDValue ShY =
31259 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
31260 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
31261 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
31262 DAG.getConstant(MaskX, DL, VT));
31263 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
31264 DAG.getConstant(MaskY, DL, VT));
31265 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31266 }
31267
31268 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
31269 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
31270 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
31271 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
31272 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31273 }
31274
31275 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31276 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31277 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31278
31279 // Constant vXi16 funnel shifts can be efficiently handled by default.
31280 if (IsCst && EltSizeInBits == 16)
31281 return SDValue();
31282
31283 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31284 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31285 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31286
31287 // Split 256-bit integers on XOP/pre-AVX2 targets.
31288 // Split 512-bit integers on non 512-bit BWI targets.
31289 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31290 !Subtarget.hasAVX2())) ||
31291 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31292 EltSizeInBits < 32)) {
31293 // Pre-mask the amount modulo using the wider vector.
31294 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31295 return splitVectorOp(Op, DAG, DL);
31296 }
31297
31298 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31299 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31300 int ScalarAmtIdx = -1;
31301 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31302 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31303 if (EltSizeInBits == 16)
31304 return SDValue();
31305
31306 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31307 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31308 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31309 ScalarAmtIdx, Subtarget, DAG);
31310 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31311 ScalarAmtIdx, Subtarget, DAG);
31312 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31313 }
31314 }
31315
31316 MVT WideSVT = MVT::getIntegerVT(
31317 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31318 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31319
31320 // If per-element shifts are legal, fallback to generic expansion.
31321 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31322 return SDValue();
31323
31324 // Attempt to fold as:
31325 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31326 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31327 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31328 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31329 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31330 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31331 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31332 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31333 EltSizeInBits, DAG);
31334 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31335 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31336 if (!IsFSHR)
31337 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31338 EltSizeInBits, DAG);
31339 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31340 }
31341
31342 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31343 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31344 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31345 SDValue Z = DAG.getConstant(0, DL, VT);
31346 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31347 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31348 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31349 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31350 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31351 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31352 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31353 }
31354
31355 // Fallback to generic expansion.
31356 return SDValue();
31357 }
31358 assert(
31359 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31360 "Unexpected funnel shift type!");
31361
31362 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31363 bool OptForSize = DAG.shouldOptForSize();
31364 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31365
31366 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31367 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31368 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31369 !isa<ConstantSDNode>(Amt)) {
31370 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31371 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31372 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31373 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31374 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31375 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31376 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31377 if (IsFSHR) {
31378 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31379 } else {
31380 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31381 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31382 }
31383 return DAG.getZExtOrTrunc(Res, DL, VT);
31384 }
31385
31386 if (VT == MVT::i8 || ExpandFunnel)
31387 return SDValue();
31388
31389 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
31390 if (VT == MVT::i16) {
31391 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31392 DAG.getConstant(15, DL, Amt.getValueType()));
31393 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31394 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31395 }
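// e.g. an i16 funnel shift by 17 must behave as a shift by 1; the AND with 15
// above provides that wrap, while the i32/i64 SHLD/SHRD forms get the
// equivalent masking of the count from the hardware itself.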
31396
31397 return Op;
31398}
31399
31400static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31401 SelectionDAG &DAG) {
31402 MVT VT = Op.getSimpleValueType();
31403 assert(VT.isVector() && "Custom lowering only for vector rotates!");
31404
31405 SDLoc DL(Op);
31406 SDValue R = Op.getOperand(0);
31407 SDValue Amt = Op.getOperand(1);
31408 unsigned Opcode = Op.getOpcode();
31409 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31410 int NumElts = VT.getVectorNumElements();
31411 bool IsROTL = Opcode == ISD::ROTL;
31412
31413 // Check for constant splat rotation amount.
31414 APInt CstSplatValue;
31415 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31416
31417 // Check for splat rotate by zero.
31418 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31419 return R;
31420
31421 // AVX512 implicitly uses modulo rotation amounts.
31422 if ((Subtarget.hasVLX() || Subtarget.hasAVX512()) && 32 <= EltSizeInBits) {
31423 // Attempt to rotate by immediate.
31424 if (IsCstSplat) {
31425 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31426 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31427 return DAG.getNode(RotOpc, DL, VT, R,
31428 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31429 }
31430
31431 // Else, fall-back on VPROLV/VPRORV.
31432 return Op;
31433 }
31434
31435 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31436 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31437 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31438 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31439 }
31440
31441 SDValue Z = DAG.getConstant(0, DL, VT);
31442
31443 if (!IsROTL) {
31444 // If the ISD::ROTR amount is constant, we're always better converting to
31445 // ISD::ROTL.
31446 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31447 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31448
31449 // XOP targets always prefer ISD::ROTL.
31450 if (Subtarget.hasXOP())
31451 return DAG.getNode(ISD::ROTL, DL, VT, R,
31452 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31453 }
31454
31455 // Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
31456 if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
31457 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
31458 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31459 SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt);
31460 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
31461 DAG.getTargetConstant(0, DL, MVT::i8));
31462 }
31463
31464 // Split 256-bit integers on XOP/pre-AVX2 targets.
31465 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31466 return splitVectorIntBinary(Op, DAG, DL);
31467
31468 // XOP has 128-bit vector variable + immediate rotates.
31469 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31470 // XOP implicitly uses modulo rotation amounts.
31471 if (Subtarget.hasXOP()) {
31472 assert(IsROTL && "Only ROTL expected");
31473 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31474
31475 // Attempt to rotate by immediate.
31476 if (IsCstSplat) {
31477 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31478 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31479 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31480 }
31481
31482 // Use general rotate by variable (per-element).
31483 return Op;
31484 }
31485
31486 // Rotate by a uniform constant - expand back to shifts.
31487 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
31488 // to other values when folded to shift amounts, losing the splat.
31489 if (IsCstSplat) {
31490 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31491 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
31492 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
31493 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
31494 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
31495 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
31496 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
31497 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
31498 }
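// e.g. for vXi8 with a splat amount of 3 this emits (x << 3) | (x >> 5) for
// rotl and (x >> 3) | (x << 5) for rotr, using only immediate shifts.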
31499
31500 // Split 512-bit integers on non 512-bit BWI targets.
31501 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31502 return splitVectorIntBinary(Op, DAG, DL);
31503
31504 assert(
31505 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31506 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31507 Subtarget.hasAVX2()) ||
31508 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31509 "Only vXi32/vXi16/vXi8 vector rotates supported");
31510
31511 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31512 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31513
31514 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31515 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31516
31517 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31518 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31519 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
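// Illustrative bw = 8 example: unpacking x = 0xB6 with itself forms the
// 16-bit lane 0xB6B6; (0xB6B6 << 3) = 0xB5B0, whose high byte 0xB5 is
// rotl(0xB6,3), and (0xB6B6 >> 3) = 0x16D6, whose low byte 0xD6 is
// rotr(0xB6,3).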
31520 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31521 int BaseRotAmtIdx = -1;
31522 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31523 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31524 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31525 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31526 }
31527 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31528 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31529 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31530 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31531 BaseRotAmtIdx, Subtarget, DAG);
31532 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31533 BaseRotAmtIdx, Subtarget, DAG);
31534 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31535 }
31536 }
31537
31538 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31539 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31540
31541 // Attempt to fold as unpack(x,x) << zext(y):
31542 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31543 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31544 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
31545 if (!(ConstantAmt && EltSizeInBits != 8) &&
31546 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
31547 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
31548 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31549 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31550 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31551 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31552 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31553 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31554 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31555 }
31556
31557 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31558 // the amount bit.
31559 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
31560 if (EltSizeInBits == 8) {
31561 MVT WideVT =
31562 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31563
31564 // Attempt to fold as:
31565 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31566 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31567 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31568 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31569 // If we're rotating by constant, just use default promotion.
31570 if (ConstantAmt)
31571 return SDValue();
31572 // See if we can perform this by widening to vXi16 or vXi32.
31573 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31574 R = DAG.getNode(
31575 ISD::OR, DL, WideVT, R,
31576 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31577 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31578 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31579 if (IsROTL)
31580 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31581 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31582 }
31583
31584 // We don't need ModuloAmt here as we just peek at individual bits.
31585 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31586 if (Subtarget.hasSSE41()) {
31587 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31588 // on the sign bit.
31589 V0 = DAG.getBitcast(VT, V0);
31590 V1 = DAG.getBitcast(VT, V1);
31591 Sel = DAG.getBitcast(VT, Sel);
31592 return DAG.getBitcast(SelVT,
31593 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31594 }
31595 // On pre-SSE41 targets we test for the sign bit by comparing to
31596 // zero - a negative value will set all bits of the lanes to true
31597 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31598 SDValue Z = DAG.getConstant(0, DL, SelVT);
31599 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31600 return DAG.getSelect(DL, SelVT, C, V0, V1);
31601 };
31602
31603 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31604 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31605 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31606 IsROTL = true;
31607 }
31608
31609 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31610 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31611
31612 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31613 // We can safely do this using i16 shifts as we're only interested in
31614 // the 3 lower bits of each byte.
31615 Amt = DAG.getBitcast(ExtVT, Amt);
31616 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31617 Amt = DAG.getBitcast(VT, Amt);
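// e.g. for an amount of 6 (0b110): after << 5 the byte's sign bit is amt[2],
// selecting the rotate-by-4 result; each `a += a` then promotes the next
// lower amount bit into the sign position, so rotate-by-2 is selected and
// rotate-by-1 is skipped, for a total rotation of 6.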
31618
31619 // r = VSELECT(r, rot(r, 4), a);
31620 SDValue M;
31621 M = DAG.getNode(
31622 ISD::OR, DL, VT,
31623 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31624 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31625 R = SignBitSelect(VT, Amt, M, R);
31626
31627 // a += a
31628 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31629
31630 // r = VSELECT(r, rot(r, 2), a);
31631 M = DAG.getNode(
31632 ISD::OR, DL, VT,
31633 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31634 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31635 R = SignBitSelect(VT, Amt, M, R);
31636
31637 // a += a
31638 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31639
31640 // return VSELECT(r, rot(r, 1), a);
31641 M = DAG.getNode(
31642 ISD::OR, DL, VT,
31643 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31644 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31645 return SignBitSelect(VT, Amt, M, R);
31646 }
31647
31648 bool IsSplatAmt = DAG.isSplatValue(Amt);
31649 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31650 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31651
31652 // Fallback for splats + all supported variable shifts.
31653 // Fallback for non-constant AVX2 vXi16 as well.
31654 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31655 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31656 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31657 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31658 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31659 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31660 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31661 }
31662
31663 // Everything below assumes ISD::ROTL.
31664 if (!IsROTL) {
31665 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31666 IsROTL = true;
31667 }
31668
31669 // ISD::ROT* uses modulo rotate amounts.
31670 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31671
31672 assert(IsROTL && "Only ROTL supported");
31673
31674 // As with shifts, attempt to convert the rotation amount to a multiplication
31675 // factor, fallback to general expansion.
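// Illustratively, a per-lane rotate-left by c becomes a multiply by 2^c: the
// low half of the widening product is x << c and the high half is
// x >> (bw - c), so OR-ing the two halves reproduces the rotate (for c == 0
// the high half is zero and the low half is x itself).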
31676 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31677 if (!Scale)
31678 return SDValue();
31679
31680 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
31681 if (EltSizeInBits == 16) {
31682 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31683 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31684 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31685 }
31686
31687 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31688 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31689 // that can then be OR'd with the lower 32-bits.
31690 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31691 static const int OddMask[] = {1, 1, 3, 3};
31692 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31693 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31694
31695 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31696 DAG.getBitcast(MVT::v2i64, R),
31697 DAG.getBitcast(MVT::v2i64, Scale));
31698 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31699 DAG.getBitcast(MVT::v2i64, R13),
31700 DAG.getBitcast(MVT::v2i64, Scale13));
31701 Res02 = DAG.getBitcast(VT, Res02);
31702 Res13 = DAG.getBitcast(VT, Res13);
31703
31704 return DAG.getNode(ISD::OR, DL, VT,
31705 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31706 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31707}
31708
31709/// Returns true if the operand type is exactly twice the native width, and
31710/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31711/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31712/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31713bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31714 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31715
31716 if (OpWidth == 64)
31717 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31718 if (OpWidth == 128)
31719 return Subtarget.canUseCMPXCHG16B();
31720
31721 return false;
31722}
31723
31724TargetLowering::AtomicExpansionKind
31725X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31726 Type *MemType = SI->getValueOperand()->getType();
31727
31728 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31729 !Subtarget.useSoftFloat()) {
31730 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31731 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31732 return AtomicExpansionKind::None;
31733
31734 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31735 Subtarget.hasAVX())
31736 return AtomicExpansionKind::None;
31737 }
31738
31739 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31740 : AtomicExpansionKind::None;
31741}
31742
31743// Note: this turns large loads into lock cmpxchg8b/16b.
31744TargetLowering::AtomicExpansionKind
31745X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31746 Type *MemType = LI->getType();
31747
31748 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31749 !Subtarget.useSoftFloat()) {
31750 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
31751 // can use movq to do the load. If we have X87 we can load into an 80-bit
31752 // X87 register and store it to a stack temporary.
31753 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31754 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31755 return AtomicExpansionKind::None;
31756
31757 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
31758 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31759 Subtarget.hasAVX())
31760 return AtomicExpansionKind::None;
31761 }
31762
31763 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31764 : AtomicExpansionKind::None;
31765}
31766
31767enum BitTestKind : unsigned {
31768 UndefBit,
31769 ConstantBit,
31770 NotConstantBit,
31771 ShiftBit,
31772 NotShiftBit
31773};
31774
31775static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31776 using namespace llvm::PatternMatch;
31777 BitTestKind BTK = UndefBit;
31778 if (auto *C = dyn_cast<ConstantInt>(V)) {
31779 // Check if V is a power of 2 or NOT power of 2.
31780 if (isPowerOf2_64(C->getZExtValue()))
31781 BTK = ConstantBit;
31782 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31783 BTK = NotConstantBit;
31784 return {V, BTK};
31785 }
31786
31787 // Check if V is some power of 2 pattern known to be non-zero
31788 if (auto *I = dyn_cast<Instruction>(V)) {
31789 bool Not = false;
31790 // Check if we have a NOT
31791 Value *PeekI;
31792 if (match(I, m_Not(m_Value(PeekI))) ||
31793 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31794 Not = true;
31795 I = dyn_cast<Instruction>(PeekI);
31796
31797 // If I is constant, it will fold and we can evaluate later. If it's an
31798 // argument or something of that nature, we can't analyze.
31799 if (I == nullptr)
31800 return {nullptr, UndefBit};
31801 }
31802 // We can only use 1 << X without more sophisticated analysis. C << X where
31803 // C is a power of 2 but not 1 can result in zero which cannot be translated
31804 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
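// e.g. `1 << n` is classified as ShiftBit and `~(1 << n)` as NotShiftBit, so
// a following single-bit AND can later become a `lock bts`/`btr`/`btc`;
// `2 << n` is rejected because it can evaluate to zero and therefore cannot
// be expressed as a bit test.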
31805 if (I->getOpcode() == Instruction::Shl) {
31806 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31807 // -X` and some other provable power of 2 patterns that we can use CTZ on
31808 // may be profitable.
31809 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31810 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31811 // be provably a non-zero power of 2.
31812 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31813 // transformable to bittest.
31814 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31815 if (!ShiftVal)
31816 return {nullptr, UndefBit};
31817 if (ShiftVal->equalsInt(1))
31818 BTK = Not ? NotShiftBit : ShiftBit;
31819
31820 if (BTK == UndefBit)
31821 return {nullptr, UndefBit};
31822
31823 Value *BitV = I->getOperand(1);
31824
31825 // Read past a shift-mask instruction to find the count.
31826 Value *AndOp;
31827 uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1;
31828 if (match(BitV, m_c_And(m_Value(AndOp), m_SpecificInt(ShiftMask))))
31829 BitV = AndOp;
31830
31831 return {BitV, BTK};
31832 }
31833 }
31834 return {nullptr, UndefBit};
31835}
31836
31837TargetLowering::AtomicExpansionKind
31838X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31839 using namespace llvm::PatternMatch;
31840 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31841 // prefix to a normal instruction for these operations.
31842 if (AI->use_empty())
31843 return AtomicExpansionKind::None;
31844
31845 if (AI->getOperation() == AtomicRMWInst::Xor) {
31846 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
31847 // preferable to both `cmpxchg` and `btc`.
31848 if (match(AI->getOperand(1), m_SignMask()))
31849 return AtomicExpansionKind::None;
31850 }
31851
31852 // If the atomicrmw's result is used by a single bit AND, we may use
31853 // bts/btr/btc instruction for these operations.
31854 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31855 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31856 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31857 // detect it.
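// e.g. `%old = atomicrmw or ptr %p, i32 %mask` (with %mask == 1 << %b) whose
// only use is `and i32 %old, %mask` can be emitted as a single `lock bts`;
// the matching And/Xor forms map to `lock btr` / `lock btc` via the
// BitTestIntrinsic expansion chosen below.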
31858 Instruction *I = AI->user_back();
31859 auto BitChange = FindSingleBitChange(AI->getValOperand());
31860 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31861 I->getOpcode() != Instruction::And ||
31862 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31863 AI->getParent() != I->getParent())
31864 return AtomicExpansionKind::CmpXChg;
31865
31866 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31867
31868 // This is a redundant AND, it should get cleaned up elsewhere.
31869 if (AI == I->getOperand(OtherIdx))
31870 return AtomicExpansionKind::CmpXChg;
31871
31872 // The following instruction must be an AND with a single bit.
31873 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31874 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31875 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31876 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31877 return AtomicExpansionKind::CmpXChg;
31878 }
31879 if (AI->getOperation() == AtomicRMWInst::And) {
31880 return ~C1->getValue() == C2->getValue()
31881 ? AtomicExpansionKind::BitTestIntrinsic
31882 : AtomicExpansionKind::CmpXChg;
31883 }
31884 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31885 : AtomicExpansionKind::CmpXChg;
31886 }
31887
31888 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31889
31890 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31891 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31892 return AtomicExpansionKind::CmpXChg;
31893
31894 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31895
31896 // If shift amounts are not the same we can't use BitTestIntrinsic.
31897 if (BitChange.first != BitTested.first)
31898 return AtomicExpansionKind::CmpXChg;
31899
31900 // For atomic AND, the operand must mask off exactly one bit (~(1 << x)) and
31901 // the AND must test the one bit that is unset in that mask.
31902 if (AI->getOperation() == AtomicRMWInst::And)
31903 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31904 ? AtomicExpansionKind::BitTestIntrinsic
31905 : AtomicExpansionKind::CmpXChg;
31906
31907 // For atomic XOR/OR, the operand must set and the AND must test the same bit.
31908 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31909 ? AtomicExpansionKind::BitTestIntrinsic
31910 : AtomicExpansionKind::CmpXChg;
31911}
31912
31913void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31914 IRBuilder<> Builder(AI);
31915 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31916 Intrinsic::ID IID_C;
31917 Intrinsic::ID IID_I;
31918 switch (AI->getOperation()) {
31919 default:
31920 llvm_unreachable("Unknown atomic operation");
31921 case AtomicRMWInst::Or:
31922 IID_C = Intrinsic::x86_atomic_bts;
31923 IID_I = Intrinsic::x86_atomic_bts_rm;
31924 break;
31925 case AtomicRMWInst::Xor:
31926 IID_C = Intrinsic::x86_atomic_btc;
31927 IID_I = Intrinsic::x86_atomic_btc_rm;
31928 break;
31929 case AtomicRMWInst::And:
31930 IID_C = Intrinsic::x86_atomic_btr;
31931 IID_I = Intrinsic::x86_atomic_btr_rm;
31932 break;
31933 }
31934 Instruction *I = AI->user_back();
31935 LLVMContext &Ctx = AI->getContext();
31936 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31937 PointerType::getUnqual(Ctx));
31938 Value *Result = nullptr;
31939 auto BitTested = FindSingleBitChange(AI->getValOperand());
31940 assert(BitTested.first != nullptr);
31941
31942 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31943 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31944
31945 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31946 Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
31947 {Addr, Builder.getInt8(Imm)});
31948 } else {
31949 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31950
31951 Value *SI = BitTested.first;
31952 assert(SI != nullptr);
31953
31954 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we need
31955 // to mask it.
31956 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31957 Value *BitPos =
31958 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31959 // Todo(1): In many cases it may be provable that SI is less than
31960 // ShiftBits in which case this mask is unnecessary
31961 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31962 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31963 // favor of just a raw BT{S|R|C}.
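// e.g. `bts` with a memory operand treats the whole register as a bit offset
// and could address bytes outside the i32 being updated; masking the index
// with (bit width - 1) keeps the access within the operand while matching
// the defined range of the original IR shift.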
31964
31965 Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
31966 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31967
31968 // If the result is only used for zero/non-zero status then we don't need to
31969 // shift the value back. Otherwise do so.
31970 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31971 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31972 if (ICmp->isEquality()) {
31973 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31974 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31975 if (C0 || C1) {
31976 assert(C0 == nullptr || C1 == nullptr);
31977 if ((C0 ? C0 : C1)->isZero())
31978 continue;
31979 }
31980 }
31981 }
31982 Result = Builder.CreateShl(Result, BitPos);
31983 break;
31984 }
31985 }
31986
31987 I->replaceAllUsesWith(Result);
31988 I->eraseFromParent();
31989 AI->eraseFromParent();
31990}
31991
31992static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
31993 using namespace llvm::PatternMatch;
31994 if (!AI->hasOneUse())
31995 return false;
31996
31997 Value *Op = AI->getOperand(1);
31998 CmpPredicate Pred;
31999 Instruction *I = AI->user_back();
32000 AtomicRMWInst::BinOp Opc = AI->getOperation();
32001 if (Opc == AtomicRMWInst::Add) {
32002 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
32003 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32004 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
32005 if (match(I->user_back(),
32006 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32007 return true;
32008 if (match(I->user_back(),
32009 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32010 return true;
32011 }
32012 return false;
32013 }
32014 if (Opc == AtomicRMWInst::Sub) {
32015 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32016 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32017 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32018 if (match(I->user_back(),
32019 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32020 return true;
32021 if (match(I->user_back(),
32022 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32023 return true;
32024 }
32025 return false;
32026 }
32027 if ((Opc == AtomicRMWInst::Or &&
32028 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
32029 (Opc == AtomicRMWInst::And &&
32030 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
32031 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32032 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32033 Pred == CmpInst::ICMP_SLT;
32034 if (match(I->user_back(),
32035 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32036 return true;
32037 return false;
32038 }
32039 if (Opc == AtomicRMWInst::Xor) {
32040 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32041 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32042 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32043 if (match(I->user_back(),
32044 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32045 return true;
32046 if (match(I->user_back(),
32047 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32048 return true;
32049 }
32050 return false;
32051 }
32052
32053 return false;
32054}
32055
32056void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32057 AtomicRMWInst *AI) const {
32058 IRBuilder<> Builder(AI);
32059 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32060 Instruction *TempI = nullptr;
32061 LLVMContext &Ctx = AI->getContext();
32062 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32063 if (!ICI) {
32064 TempI = AI->user_back();
32065 assert(TempI->hasOneUse() && "Must have one use");
32066 ICI = cast<ICmpInst>(TempI->user_back());
32067 }
32068 X86::CondCode CC = X86::COND_INVALID;
32069 ICmpInst::Predicate Pred = ICI->getPredicate();
32070 switch (Pred) {
32071 default:
32072 llvm_unreachable("Not supported Pred");
32073 case CmpInst::ICMP_EQ:
32074 CC = X86::COND_E;
32075 break;
32076 case CmpInst::ICMP_NE:
32077 CC = X86::COND_NE;
32078 break;
32079 case CmpInst::ICMP_SLT:
32080 CC = X86::COND_S;
32081 break;
32082 case CmpInst::ICMP_SGT:
32083 CC = X86::COND_NS;
32084 break;
32085 }
32086 Intrinsic::ID IID = Intrinsic::not_intrinsic;
32087 switch (AI->getOperation()) {
32088 default:
32089 llvm_unreachable("Unknown atomic operation");
32090 case AtomicRMWInst::Add:
32091 IID = Intrinsic::x86_atomic_add_cc;
32092 break;
32093 case AtomicRMWInst::Sub:
32094 IID = Intrinsic::x86_atomic_sub_cc;
32095 break;
32096 case AtomicRMWInst::Or:
32097 IID = Intrinsic::x86_atomic_or_cc;
32098 break;
32099 case AtomicRMWInst::And:
32100 IID = Intrinsic::x86_atomic_and_cc;
32101 break;
32102 case AtomicRMWInst::Xor:
32103 IID = Intrinsic::x86_atomic_xor_cc;
32104 break;
32105 }
32106 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32107 PointerType::getUnqual(Ctx));
32108 Value *Call = Builder.CreateIntrinsic(
32109 IID, AI->getType(),
32110 {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32111 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32112 ICI->replaceAllUsesWith(Result);
32113 ICI->eraseFromParent();
32114 if (TempI)
32115 TempI->eraseFromParent();
32116 AI->eraseFromParent();
32117}
32118
32119TargetLowering::AtomicExpansionKind
32120X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32121 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32122 Type *MemType = AI->getType();
32123
32124 // If the operand is too big, we must see if cmpxchg8/16b is available
32125 // and default to library calls otherwise.
32126 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32127 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32128 : AtomicExpansionKind::None;
32129 }
32130
32131 AtomicRMWInst::BinOp Op = AI->getOperation();
32132 switch (Op) {
32133 case AtomicRMWInst::Xchg:
32134 return AtomicExpansionKind::None;
32135 case AtomicRMWInst::Add:
32136 case AtomicRMWInst::Sub:
32137 if (shouldExpandCmpArithRMWInIR(AI))
32138 return AtomicExpansionKind::CmpArithIntrinsic;
32139 // It's better to use xadd, xsub or xchg for these in other cases.
32140 return AtomicExpansionKind::None;
32141 case AtomicRMWInst::Or:
32142 case AtomicRMWInst::And:
32143 case AtomicRMWInst::Xor:
32144 if (shouldExpandCmpArithRMWInIR(AI))
32145 return AtomicExpansionKind::CmpArithIntrinsic;
32146 return shouldExpandLogicAtomicRMWInIR(AI);
32147 case AtomicRMWInst::Nand:
32148 case AtomicRMWInst::Max:
32149 case AtomicRMWInst::Min:
32160 default:
32161 // These always require a non-trivial set of data operations on x86. We must
32162 // use a cmpxchg loop.
32163 return AtomicExpansionKind::CmpXChg;
32164 }
32165}
32166
32167LoadInst *
32168X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32169 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32170 Type *MemType = AI->getType();
32171 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32172 // there is no benefit in turning such RMWs into loads, and it is actually
32173 // harmful as it introduces an mfence.
32174 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32175 return nullptr;
32176
32177 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32178 // lowering available in lowerAtomicArith.
32179 // TODO: push more cases through this path.
32180 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32181 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32182 AI->use_empty())
32183 return nullptr;
32184
32185 IRBuilder<> Builder(AI);
32186 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32187 auto SSID = AI->getSyncScopeID();
32188 // We must restrict the ordering to avoid generating loads with Release or
32189 // ReleaseAcquire orderings.
32190 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32191
32192 // Before the load we need a fence. Here is an example lifted from
32193 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32194 // is required:
32195 // Thread 0:
32196 // x.store(1, relaxed);
32197 // r1 = y.fetch_add(0, release);
32198 // Thread 1:
32199 // y.fetch_add(42, acquire);
32200 // r2 = x.load(relaxed);
32201 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32202 // lowered to just a load without a fence. A mfence flushes the store buffer,
32203 // making the optimization clearly correct.
32204 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
32205 // otherwise, we might be able to be more aggressive on relaxed idempotent
32206 // rmw. In practice, they do not look useful, so we don't try to be
32207 // especially clever.
32208
32209 // Use `fence seq_cst` over `llvm.x86.sse2.mfence` here to get the correct
32210 // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
32211 Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
32212
32213 // Finally we can emit the atomic load.
32214 LoadInst *Loaded = Builder.CreateAlignedLoad(
32215 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32216 Loaded->setAtomic(Order, SSID);
32217 AI->replaceAllUsesWith(Loaded);
32218 AI->eraseFromParent();
32219 return Loaded;
32220}
32221
32222/// Emit a locked operation on a stack location which does not change any
32223/// memory location, but does involve a lock prefix. Location is chosen to be
32224/// a) very likely accessed only by a single thread to minimize cache traffic,
32225/// and b) definitely dereferenceable. Returns the new Chain result.
32226static SDValue emitLockedStackOp(SelectionDAG &DAG,
32227 const X86Subtarget &Subtarget, SDValue Chain,
32228 const SDLoc &DL) {
32229 // Implementation notes:
32230 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32231 // operations issued by the current processor. As such, the location
32232 // referenced is not relevant for the ordering properties of the instruction.
32233 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32234 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32235 // 2) Using an immediate operand appears to be the best encoding choice
32236 // here since it doesn't require an extra register.
32237 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32238 // is small enough it might just be measurement noise.)
32239 // 4) When choosing offsets, there are several contributing factors:
32240 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32241 // line aligned stack object to improve this case.)
32242 // b) To minimize our chances of introducing a false dependence, we prefer
32243 // to offset the stack usage from TOS slightly.
32244 // c) To minimize concerns about cross thread stack usage - in particular,
32245 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32246 // captures state in the TOS frame and accesses it from many threads -
32247 // we want to use an offset such that the offset is in a distinct cache
32248 // line from the TOS frame.
32249 //
32250 // For a general discussion of the tradeoffs and benchmark results, see:
32251 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
32252
32253 auto &MF = DAG.getMachineFunction();
32254 auto &TFL = *Subtarget.getFrameLowering();
32255 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
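// The node built below is effectively `lock orl $0, -64(%rsp)` when a
// 128-byte red zone exists (offset 0 otherwise): a locked no-op on the
// thread's own stack, used as a full-barrier alternative to MFENCE.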
32256
32257 if (Subtarget.is64Bit()) {
32258 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32259 SDValue Ops[] = {
32260 DAG.getRegister(X86::RSP, MVT::i64), // Base
32261 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32262 DAG.getRegister(0, MVT::i64), // Index
32263 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32264 DAG.getRegister(0, MVT::i16), // Segment.
32265 Zero,
32266 Chain};
32267 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32268 MVT::Other, Ops);
32269 return SDValue(Res, 1);
32270 }
32271
32272 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32273 SDValue Ops[] = {
32274 DAG.getRegister(X86::ESP, MVT::i32), // Base
32275 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32276 DAG.getRegister(0, MVT::i32), // Index
32277 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32278 DAG.getRegister(0, MVT::i16), // Segment.
32279 Zero,
32280 Chain
32281 };
32282 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32283 MVT::Other, Ops);
32284 return SDValue(Res, 1);
32285}
32286
32287static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32288 SelectionDAG &DAG) {
32289 SDLoc dl(Op);
32290 AtomicOrdering FenceOrdering =
32291 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32292 SyncScope::ID FenceSSID =
32293 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32294
32295 // The only fence that needs an instruction is a sequentially-consistent
32296 // cross-thread fence.
32297 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32298 FenceSSID == SyncScope::System) {
32299 if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
32300 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32301
32302 SDValue Chain = Op.getOperand(0);
32303 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32304 }
32305
32306 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32307 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32308}
32309
32310static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32311 SelectionDAG &DAG) {
32312 MVT T = Op.getSimpleValueType();
32313 SDLoc DL(Op);
32314 unsigned Reg = 0;
32315 unsigned size = 0;
32316 switch(T.SimpleTy) {
32317 default: llvm_unreachable("Invalid value type!");
32318 case MVT::i8: Reg = X86::AL; size = 1; break;
32319 case MVT::i16: Reg = X86::AX; size = 2; break;
32320 case MVT::i32: Reg = X86::EAX; size = 4; break;
32321 case MVT::i64:
32322 assert(Subtarget.is64Bit() && "Node not type legal!");
32323 Reg = X86::RAX; size = 8;
32324 break;
32325 }
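// e.g. for an i32 cmpxchg the expected value is moved into EAX, the memory
// intrinsic node below encodes `lock cmpxchg`, and success is recovered as
// the COND_E (ZF set) condition read back from EFLAGS.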
32326 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32327 Op.getOperand(2), SDValue());
32328 SDValue Ops[] = { cpIn.getValue(0),
32329 Op.getOperand(1),
32330 Op.getOperand(3),
32331 DAG.getTargetConstant(size, DL, MVT::i8),
32332 cpIn.getValue(1) };
32333 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32334 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32335 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32336 Ops, T, MMO);
32337
32338 SDValue cpOut =
32339 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32340 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32341 MVT::i32, cpOut.getValue(2));
32342 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32343
32344 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32345 cpOut, Success, EFLAGS.getValue(1));
32346}
32347
32348// Create MOVMSKB, taking into account whether we need to split for AVX1.
32349static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32350 const X86Subtarget &Subtarget) {
32351 MVT InVT = V.getSimpleValueType();
32352
32353 if (InVT == MVT::v64i8) {
32354 SDValue Lo, Hi;
32355 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32356 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32357 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32358 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32359 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32360 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32361 DAG.getConstant(32, DL, MVT::i8));
32362 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32363 }
32364 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32365 SDValue Lo, Hi;
32366 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32367 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32368 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32369 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32370 DAG.getConstant(16, DL, MVT::i8));
32371 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32372 }
32373
32374 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32375}
32376
32377static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32378 SelectionDAG &DAG) {
32379 SDValue Src = Op.getOperand(0);
32380 MVT SrcVT = Src.getSimpleValueType();
32381 MVT DstVT = Op.getSimpleValueType();
32382
32383 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32384 // half to v32i1 and concatenating the result.
32385 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32386 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32387 assert(Subtarget.hasBWI() && "Expected BWI target");
32388 SDLoc dl(Op);
32389 SDValue Lo, Hi;
32390 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32391 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32392 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32393 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32394 }
32395
32396 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32397 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32398 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32399 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32400 SDLoc DL(Op);
32401 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32402 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32403 return DAG.getZExtOrTrunc(V, DL, DstVT);
32404 }
32405
32406 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32407 SrcVT == MVT::i64) && "Unexpected VT!");
32408
32409 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32410 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32411 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32412 // This conversion needs to be expanded.
32413 return SDValue();
32414
32415 SDLoc dl(Op);
32416 if (SrcVT.isVector()) {
32417 // Widen the vector in input in the case of MVT::v2i32.
32418 // Example: from MVT::v2i32 to MVT::v4i32.
32419 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32420 SrcVT.getVectorNumElements() * 2);
32421 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32422 DAG.getUNDEF(SrcVT));
32423 } else {
32424 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32425 "Unexpected source type in LowerBITCAST");
32426 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32427 }
32428
32429 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32430 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32431
32432 if (DstVT == MVT::x86mmx)
32433 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32434
32435 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32436 DAG.getVectorIdxConstant(0, dl));
32437}
32438
32439/// Compute the horizontal sum of bytes in V for the elements of VT.
32440///
32441/// Requires V to be a byte vector and VT to be an integer vector type with
32442/// wider elements than V's type. The width of the elements of VT determines
32443/// how many bytes of V are summed horizontally to produce each element of the
32444/// result.
32445static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32446 const X86Subtarget &Subtarget,
32447 SelectionDAG &DAG) {
32448 SDLoc DL(V);
32449 MVT ByteVecVT = V.getSimpleValueType();
32450 MVT EltVT = VT.getVectorElementType();
32451 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32452 "Expected value to have byte element type.");
32453 assert(EltVT != MVT::i8 &&
32454 "Horizontal byte sum only makes sense for wider elements!");
32455 unsigned VecSize = VT.getSizeInBits();
32456 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32457
32458 // The PSADBW instruction horizontally adds all bytes and leaves the result in
32459 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
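// e.g. psadbw of the bytes {1,2,3,4,5,6,7,8} against zero leaves the single
// i64 value 36 in that chunk - exactly the byte sum, and hence the element
// popcount when the bytes are per-byte popcounts.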
32460 if (EltVT == MVT::i64) {
32461 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32462 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32463 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32464 return DAG.getBitcast(VT, V);
32465 }
32466
32467 if (EltVT == MVT::i32) {
32468 // We unpack the low half and high half into i32s interleaved with zeros so
32469 // that we can use PSADBW to horizontally sum them. The most useful part of
32470 // this is that it lines up the results of two PSADBW instructions to be
32471 // two v2i64 vectors which concatenated are the 4 population counts. We can
32472 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32473 SDValue Zeros = DAG.getConstant(0, DL, VT);
32474 SDValue V32 = DAG.getBitcast(VT, V);
32475 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32476 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32477
32478 // Do the horizontal sums into two v2i64s.
32479 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32480 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32481 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32482 DAG.getBitcast(ByteVecVT, Low), Zeros);
32483 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32484 DAG.getBitcast(ByteVecVT, High), Zeros);
32485
32486 // Merge them together.
32487 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32488 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32489 DAG.getBitcast(ShortVecVT, Low),
32490 DAG.getBitcast(ShortVecVT, High));
32491
32492 return DAG.getBitcast(VT, V);
32493 }
32494
32495 // The only element type left is i16.
32496 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32497
32498 // To obtain pop count for each i16 element starting from the pop count for
32499 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
32500 // right by 8. It is important to shift as i16s as i8 vector shift isn't
32501 // directly supported.
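// e.g. an i16 lane holding the byte popcounts [hi|lo]: the i16 shift left by
// 8 gives [lo|0], the i8 add then gives [hi+lo|lo], and the final i16 shift
// right by 8 leaves [0|hi+lo], the popcount of the whole i16 element.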
32502 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32503 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32504 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32505 DAG.getBitcast(ByteVecVT, V));
32506 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32507}
32508
32509static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32510 const X86Subtarget &Subtarget,
32511 SelectionDAG &DAG) {
32512 MVT VT = Op.getSimpleValueType();
32513 MVT EltVT = VT.getVectorElementType();
32514 int NumElts = VT.getVectorNumElements();
32515 (void)EltVT;
32516 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32517
32518 // Implement a lookup table in register by using an algorithm based on:
32519 // http://wm.ite.pl/articles/sse-popcount.html
32520 //
32521 // The general idea is that every lower byte nibble in the input vector is an
32522 // index into an in-register pre-computed pop count table. We then split up the
32523 // input vector into two new ones: (1) a vector with only the shifted-right
32524 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32525 // masked out higher ones) for each byte. PSHUFB is used separately with both
32526 // to index the in-register table. Next, both are added and the result is an
32527 // i8 vector where each element contains the pop count for the input byte.
32528 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32529 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32530 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32531 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
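// e.g. for the input byte 0xA7 the high nibble 0xA looks up 2 and the low
// nibble 0x7 looks up 3; their sum 5 is popcount(0xA7).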
32532
32533 SmallVector<SDValue, 64> LUTVec;
32534 for (int i = 0; i < NumElts; ++i)
32535 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32536 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32537 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32538
32539 // High nibbles
32540 SDValue FourV = DAG.getConstant(4, DL, VT);
32541 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32542
32543 // Low nibbles
32544 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32545
32546 // The input vector is used as the shuffle mask that indexes elements into the
32547 // LUT. After counting low and high nibbles, add the vectors to obtain the
32548 // final pop count per i8 element.
32549 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32550 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32551 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32552}
32553
32554// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32555// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32556static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
32557 const X86Subtarget &Subtarget,
32558 SelectionDAG &DAG) {
32559 MVT VT = Op.getSimpleValueType();
32560 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32561 "Unknown CTPOP type to handle");
32562 SDValue Op0 = Op.getOperand(0);
32563
32564 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32565 if (Subtarget.hasVPOPCNTDQ()) {
32566 unsigned NumElems = VT.getVectorNumElements();
32567 assert((VT.getVectorElementType() == MVT::i8 ||
32568 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32569 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32570 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32571 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32572 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32573 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32574 }
32575 }
32576
32577 // Decompose 256-bit ops into smaller 128-bit ops.
32578 if (VT.is256BitVector() && !Subtarget.hasInt256())
32579 return splitVectorIntUnary(Op, DAG, DL);
32580
32581 // Decompose 512-bit ops into smaller 256-bit ops.
32582 if (VT.is512BitVector() && !Subtarget.hasBWI())
32583 return splitVectorIntUnary(Op, DAG, DL);
32584
32585 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32586 if (VT.getScalarType() != MVT::i8) {
32587 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32588 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32589 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32590 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32591 }
32592
32593 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32594 if (!Subtarget.hasSSSE3())
32595 return SDValue();
32596
32597 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32598}
32599
32600static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
32601 SelectionDAG &DAG) {
32602 MVT VT = N.getSimpleValueType();
32603 SDValue Op = N.getOperand(0);
32604 SDLoc DL(N);
32605
32606 if (VT.isScalarInteger()) {
32607 // Compute the lower/upper bounds of the active bits of the value,
32608 // allowing us to shift the active bits down if necessary to fit into the
32609 // special cases below.
32610 KnownBits Known = DAG.computeKnownBits(Op);
32611 if (Known.isConstant())
32612 return DAG.getConstant(Known.getConstant().popcount(), DL, VT);
32613 unsigned LZ = Known.countMinLeadingZeros();
32614 unsigned TZ = Known.countMinTrailingZeros();
32615 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
32616 unsigned ActiveBits = Known.getBitWidth() - LZ;
32617 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
32618
32619 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
32620 if (ShiftedActiveBits <= 2) {
32621 if (ActiveBits > 2)
32622 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32623 DAG.getShiftAmountConstant(TZ, VT, DL));
32624 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32625 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
32626 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32627 DAG.getShiftAmountConstant(1, VT, DL)));
32628 return DAG.getZExtOrTrunc(Op, DL, VT);
32629 }
32630
32631 // i3 CTPOP - perform LUT into i32 integer.
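// The constant 0b1110100110010100 packs ctpop(7..0) as 2-bit fields; e.g.
// for x = 5 the shift below selects (LUT >> 10) & 3 = 2 = ctpop(5).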
32632 if (ShiftedActiveBits <= 3) {
32633 if (ActiveBits > 3)
32634 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32635 DAG.getShiftAmountConstant(TZ, VT, DL));
32636 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32637 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
32638 DAG.getShiftAmountConstant(1, VT, DL));
32639 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
32640 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
32641 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
32642 DAG.getConstant(0x3, DL, MVT::i32));
32643 return DAG.getZExtOrTrunc(Op, DL, VT);
32644 }
32645
32646 // i4 CTPOP - perform LUT into i64 integer.
32647 if (ShiftedActiveBits <= 4 &&
32648 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
32649 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
32650 if (ActiveBits > 4)
32651 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32652 DAG.getShiftAmountConstant(TZ, VT, DL));
32653 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32654 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32655 DAG.getConstant(4, DL, MVT::i32));
32656 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
32657 DAG.getShiftAmountOperand(MVT::i64, Op));
32658 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
32659 DAG.getConstant(0x7, DL, MVT::i64));
32660 return DAG.getZExtOrTrunc(Op, DL, VT);
32661 }
32662
32663 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
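// Sketch of the trick below: multiplying by 0x08040201 places copies of x at
// bit offsets 0/9/18/27 without overlap, the shift by 3 plus the AND with
// 0x11111111 then isolates one distinct bit of x in each nibble, and the
// final multiply by 0x11111111 sums those nibbles into the top nibble
// (popcount <= 8, so no carry), which the shift by 28 extracts.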
32664 if (ShiftedActiveBits <= 8) {
32665 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
32666 if (ActiveBits > 8)
32667 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32668 DAG.getShiftAmountConstant(TZ, VT, DL));
32669 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32670 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32671 DAG.getConstant(0x08040201U, DL, MVT::i32));
32672 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32673 DAG.getShiftAmountConstant(3, MVT::i32, DL));
32674 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
32675 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
32676 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32677 DAG.getShiftAmountConstant(28, MVT::i32, DL));
32678 return DAG.getZExtOrTrunc(Op, DL, VT);
32679 }
32680
32681 return SDValue(); // fallback to generic expansion.
32682 }
32683
32684 assert(VT.isVector() &&
32685 "We only do custom lowering for vector population count.");
32686 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
32687}
32688
32689static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32690 MVT VT = Op.getSimpleValueType();
32691 SDValue In = Op.getOperand(0);
32692 SDLoc DL(Op);
32693
32694 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32695 // perform the BITREVERSE.
32696 if (!VT.isVector()) {
32697 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32698 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32699 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32700 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32701 DAG.getVectorIdxConstant(0, DL));
32702 }
32703
32704 int NumElts = VT.getVectorNumElements();
32705 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32706
32707 // Decompose 256-bit ops into smaller 128-bit ops.
32708 if (VT.is256BitVector())
32709 return splitVectorIntUnary(Op, DAG, DL);
32710
32711 assert(VT.is128BitVector() &&
32712 "Only 128-bit vector bitreverse lowering supported.");
32713
32714 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32715 // perform the BSWAP in the shuffle.
32716 // It's best to shuffle using the second operand as this will implicitly allow
32717 // memory folding for multiple vectors.
32718 SmallVector<SDValue, 16> MaskElts;
32719 for (int i = 0; i != NumElts; ++i) {
32720 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32721 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32722 int PermuteByte = SourceByte | (2 << 5);
32723 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32724 }
32725 }
32726
32727 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32728 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32729 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32730 Res, Mask);
32731 return DAG.getBitcast(VT, Res);
32732}
32733
32734static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
32735 SelectionDAG &DAG) {
32736 MVT VT = Op.getSimpleValueType();
32737
32738 if (Subtarget.hasXOP() && !VT.is512BitVector())
32739 return LowerBITREVERSE_XOP(Op, DAG);
32740
32741 assert((Subtarget.hasSSSE3() || Subtarget.hasGFNI()) &&
32742 "SSSE3 or GFNI required for BITREVERSE");
32743
32744 SDValue In = Op.getOperand(0);
32745 SDLoc DL(Op);
32746
32747 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
32748 if (VT.is512BitVector() && !Subtarget.hasBWI())
32749 return splitVectorIntUnary(Op, DAG, DL);
32750
32751 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32752 if (VT.is256BitVector() && !Subtarget.hasInt256())
32753 return splitVectorIntUnary(Op, DAG, DL);
32754
32755 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
32756 if (!VT.isVector()) {
32757 assert(
32758 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
32759 "Only tested for i8/i16/i32/i64");
32760 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32761 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32762 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
32763 DAG.getBitcast(MVT::v16i8, Res));
32764 Res =
32765 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(VecVT, Res),
32766 DAG.getVectorIdxConstant(0, DL));
32767 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
32768 }
32769
32770 assert(VT.isVector() && VT.getSizeInBits() >= 128);
32771
32772 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
32773 if (VT.getScalarType() != MVT::i8) {
32774 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32775 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
32776 Res = DAG.getBitcast(ByteVT, Res);
32777 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
32778 return DAG.getBitcast(VT, Res);
32779 }
32780 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
32781 "Only byte vector BITREVERSE supported");
32782
32783 unsigned NumElts = VT.getVectorNumElements();
32784
32785 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32786 if (Subtarget.hasGFNI()) {
32787 SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT);
32788 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32789 DAG.getTargetConstant(0, DL, MVT::i8));
32790 }
32791
32792 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
32793 // two nibbles, and a PSHUFB lookup finds the bitreverse of each
32794 // 0-15 value (moved to the other nibble).
32795 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32796 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32797 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32798
32799 const int LoLUT[16] = {
32800 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32801 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32802 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32803 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32804 const int HiLUT[16] = {
32805 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32806 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32807 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32808 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
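// e.g. In = 0xB6 (0b10110110): LoLUT[0x6] = 0x60, HiLUT[0xB] = 0x0D, and
// 0x60 | 0x0D = 0x6D (0b01101101) = bitreverse(0xB6).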
32809
32810 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32811 for (unsigned i = 0; i < NumElts; ++i) {
32812 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32813 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32814 }
32815
32816 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32817 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32818 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32819 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32820 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32821}
32822
32823static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32824 SelectionDAG &DAG) {
32825 SDLoc DL(Op);
32826 SDValue X = Op.getOperand(0);
32827 MVT VT = Op.getSimpleValueType();
32828
32829 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32830 if (VT == MVT::i8 ||
32831 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
32832 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32833 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32834 DAG.getConstant(0, DL, MVT::i8));
32835 // Copy the inverse of the parity flag into a register with setcc.
32836 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32837 // Extend to the original type.
32838 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32839 }
32840
32841 // If we have POPCNT, use the default expansion.
32842 if (Subtarget.hasPOPCNT())
32843 return SDValue();
32844
32845 if (VT == MVT::i64) {
32846 // Xor the high and low 32 bits together using a 32-bit operation.
32847 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32848 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32849 DAG.getConstant(32, DL, MVT::i8)));
32850 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32851 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32852 }
32853
32854 if (VT != MVT::i16) {
32855 // Xor the high and low 16-bits together using a 32-bit operation.
32856 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32857 DAG.getConstant(16, DL, MVT::i8));
32858 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32859 } else {
32860 // If the input is 16-bits, we need to extend to use an i32 shift below.
32861 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32862 }
32863
32864 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
32865 // This should allow an h-reg to be used to save a shift.
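// Illustrative only (the final sequence is up to isel/RA): for an i32 input
// without POPCNT this ends up roughly as
//   mov %edi, %eax ; shr $16, %eax ; xor %edi, %eax    (fold to 16 bits)
//   xor %ah, %al                                        (fold to 8 bits, sets PF)
//   setnp %al                                           (PF clear <=> odd popcount)
// followed by the zero-extension below.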
32866 SDValue Hi = DAG.getNode(
32867 ISD::TRUNCATE, DL, MVT::i8,
32868 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32869 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32870 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32871 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32872
32873 // Copy the inverse of the parity flag into a register with setcc.
32874 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32875 // Extend to the original type.
32876 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32877}
32878
32879 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
32880 const X86Subtarget &Subtarget) {
32881 unsigned NewOpc = 0;
32882 switch (N->getOpcode()) {
32883 case ISD::ATOMIC_LOAD_ADD:
32884 NewOpc = X86ISD::LADD;
32885 break;
32886 case ISD::ATOMIC_LOAD_SUB:
32887 NewOpc = X86ISD::LSUB;
32888 break;
32889 case ISD::ATOMIC_LOAD_OR:
32890 NewOpc = X86ISD::LOR;
32891 break;
32892 case ISD::ATOMIC_LOAD_XOR:
32893 NewOpc = X86ISD::LXOR;
32894 break;
32895 case ISD::ATOMIC_LOAD_AND:
32896 NewOpc = X86ISD::LAND;
32897 break;
32898 default:
32899 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32900 }
32901
32902 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32903
32904 return DAG.getMemIntrinsicNode(
32905 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32906 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32907 /*MemVT=*/N->getSimpleValueType(0), MMO);
32908}
32909
32910/// Lower atomic_load_ops into LOCK-prefixed operations.
32911 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
32912 const X86Subtarget &Subtarget) {
32913 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32914 SDValue Chain = N->getOperand(0);
32915 SDValue LHS = N->getOperand(1);
32916 SDValue RHS = N->getOperand(2);
32917 unsigned Opc = N->getOpcode();
32918 MVT VT = N->getSimpleValueType(0);
32919 SDLoc DL(N);
32920
32921 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32922 // can only be lowered when the result is unused. They should have already
32923 // been transformed into a cmpxchg loop in AtomicExpand.
32924 if (N->hasAnyUseOfValue(0)) {
32925 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32926 // select LXADD if LOCK_SUB can't be selected.
32927 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
32928 // can use LXADD as opposed to cmpxchg.
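// E.g. "atomicrmw sub ptr %p, i32 5" with a used result becomes an
// atomic_load_add of -5, which can select to LOCK XADD; the returned value is
// still the previous memory contents, so the rewrite preserves the result.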
32929 if (Opc == ISD::ATOMIC_LOAD_SUB ||
32930 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
32931 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32932 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
32933
32934 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
32935 "Used AtomicRMW ops other than Add should have been expanded!");
32936 return N;
32937 }
32938
32939 // Specialized lowering for the canonical form of an idempotent atomicrmw.
32940 // The core idea here is that since the memory location isn't actually
32941 // changing, all we need is a lowering for the *ordering* impacts of the
32942 // atomicrmw. As such, we can choose a different operation and memory
32943 // location to minimize impact on other code.
32944 // The above holds unless the node is marked volatile, in which
32945 // case it must be preserved according to the LangRef.
32946 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
32947 // On X86, the only ordering which actually requires an instruction is
32948 // seq_cst which isn't SingleThread, everything just needs to be preserved
32949 // during codegen and then dropped. Note that we expect (but don't assume)
32950 // that orderings other than seq_cst and acq_rel have been canonicalized to
32951 // a store or load.
32952 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
32953 AN->getSyncScopeID() == SyncScope::System) {
32954 // Prefer a locked operation against a stack location to minimize cache
32955 // traffic. This assumes that stack locations are very likely to be
32956 // accessed only by the owning thread.
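// E.g. a seq_cst "atomicrmw or ptr %p, i32 0" only needs its fence semantics,
// so it can be lowered to a locked RMW on a stack slot (something like
// "lock orl $0, (%rsp)") rather than touching %p at all.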
32957 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32958 assert(!N->hasAnyUseOfValue(0));
32959 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32960 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32961 DAG.getUNDEF(VT), NewChain);
32962 }
32963 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32964 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32965 assert(!N->hasAnyUseOfValue(0));
32966 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32967 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32968 DAG.getUNDEF(VT), NewChain);
32969 }
32970
32971 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32972 // RAUW the chain, but don't worry about the result, as it's unused.
32973 assert(!N->hasAnyUseOfValue(0));
32974 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32975 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32976 DAG.getUNDEF(VT), LockOp.getValue(1));
32977}
32978
32979 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
32980 const X86Subtarget &Subtarget) {
32981 auto *Node = cast<AtomicSDNode>(Op.getNode());
32982 SDLoc dl(Node);
32983 EVT VT = Node->getMemoryVT();
32984
32985 bool IsSeqCst =
32986 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32987 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
32988
32989 // If this store is not sequentially consistent and the type is legal
32990 // we can just keep it.
32991 if (!IsSeqCst && IsTypeLegal)
32992 return Op;
32993
32994 if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
32995 !DAG.getMachineFunction().getFunction().hasFnAttribute(
32996 Attribute::NoImplicitFloat)) {
32997 SDValue Chain;
32998 // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
32999 // vector store.
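// (AVX-capable CPUs perform an aligned 16-byte vector load/store atomically,
// which is what makes a single vector store sufficient here.)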
33000 if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
33001 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
33002 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
33003 Node->getMemOperand());
33004 }
33005
33006 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
33007 // is enabled.
33008 if (VT == MVT::i64) {
33009 if (Subtarget.hasSSE1()) {
33010 SDValue SclToVec =
33011 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
33012 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33013 SclToVec = DAG.getBitcast(StVT, SclToVec);
33014 SDVTList Tys = DAG.getVTList(MVT::Other);
33015 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33016 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33017 MVT::i64, Node->getMemOperand());
33018 } else if (Subtarget.hasX87()) {
33019 // First load this into an 80-bit X87 register using a stack temporary.
33020 // This will put the whole integer into the significand.
33021 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33022 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33023 MachinePointerInfo MPI =
33024 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33025 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
33026 MPI, MaybeAlign(), MachineMemOperand::MOStore);
33027 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33028 SDValue LdOps[] = {Chain, StackPtr};
33029 SDValue Value = DAG.getMemIntrinsicNode(
33030 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33031 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33032 Chain = Value.getValue(1);
33033
33034 // Now use an FIST to do the atomic store.
33035 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33036 Chain =
33037 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33038 StoreOps, MVT::i64, Node->getMemOperand());
33039 }
33040 }
33041
33042 if (Chain) {
33043 // If this is a sequentially consistent store, also emit an appropriate
33044 // barrier.
33045 if (IsSeqCst)
33046 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33047
33048 return Chain;
33049 }
33050 }
33051
33052 // Convert seq_cst store -> xchg
33053 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33054 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
33055 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
33056 Node->getOperand(0), Node->getOperand(2),
33057 Node->getOperand(1), Node->getMemOperand());
33058 return Swap.getValue(1);
33059}
33060
33061 static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
33062 SDNode *N = Op.getNode();
33063 MVT VT = N->getSimpleValueType(0);
33064 unsigned Opc = Op.getOpcode();
33065
33066 // Let legalize expand this if it isn't a legal type yet.
33067 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33068 return SDValue();
33069
33070 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33071 SDLoc DL(N);
33072
33073 // Set the carry flag.
33074 SDValue Carry = Op.getOperand(2);
33075 EVT CarryVT = Carry.getValueType();
33076 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33077 Carry, DAG.getAllOnesConstant(DL, CarryVT));
33078
33079 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
33080 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33081 Op.getOperand(0), Op.getOperand(1),
33082 Carry.getValue(1));
33083
33084 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33085 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33086 Sum.getValue(1), DL, DAG);
33087 if (N->getValueType(1) == MVT::i1)
33088 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33089
33090 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33091}
33092
33093static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33094 SelectionDAG &DAG) {
33095 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33096
33097 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33098 // which returns the values as { float, float } (in XMM0) or
33099 // { double, double } (which is returned in XMM0, XMM1).
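// E.g. an ISD::FSINCOS of an f32 becomes one call to __sincos_stret; the f32
// variant is modelled below as returning <4 x float> so that the sin and cos
// results can be pulled out of XMM0 with EXTRACT_VECTOR_ELT indices 0 and 1.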
33100 SDLoc dl(Op);
33101 SDValue Arg = Op.getOperand(0);
33102 EVT ArgVT = Arg.getValueType();
33103 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33104
33105 TargetLowering::ArgListTy Args;
33106 Args.emplace_back(Arg, ArgTy);
33107
33108 bool isF64 = ArgVT == MVT::f64;
33109 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33110 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33111 // the results are returned via SRet in memory.
33112 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33113 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33114 const char *LibcallName = TLI.getLibcallName(LC);
33115 SDValue Callee =
33116 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33117
33118 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33119 : (Type *)FixedVectorType::get(ArgTy, 4);
33120
33121 TargetLowering::CallLoweringInfo CLI(DAG);
33122 CLI.setDebugLoc(dl)
33123 .setChain(DAG.getEntryNode())
33124 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33125
33126 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33127
33128 if (isF64)
33129 // Returned in xmm0 and xmm1.
33130 return CallResult.first;
33131
33132 // Returned in bits 0:31 and 32:64 xmm0.
33133 SDValue SinVal =
33134 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33135 DAG.getVectorIdxConstant(0, dl));
33136 SDValue CosVal =
33137 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33138 DAG.getVectorIdxConstant(1, dl));
33139 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33140 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33141}
33142
33143/// Widen a vector input to a vector of NVT. The
33144/// input vector must have the same element type as NVT.
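/// For example, widening a v2i1 mask to v8i1 with FillWithZeroes=true keeps
/// the two original lanes and pads the remaining six lanes with zeros, while
/// FillWithZeroes=false pads them with undef.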
33145 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
33146 bool FillWithZeroes = false) {
33147 // Check if InOp already has the right width.
33148 MVT InVT = InOp.getSimpleValueType();
33149 if (InVT == NVT)
33150 return InOp;
33151
33152 if (InOp.isUndef())
33153 return DAG.getUNDEF(NVT);
33154
33156 "input and widen element type must match");
33157
33158 unsigned InNumElts = InVT.getVectorNumElements();
33159 unsigned WidenNumElts = NVT.getVectorNumElements();
33160 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33161 "Unexpected request for vector widening");
33162
33163 SDLoc dl(InOp);
33164 if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
33165 SDValue N1 = InOp.getOperand(1);
33166 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33167 N1.isUndef()) {
33168 InOp = InOp.getOperand(0);
33169 InVT = InOp.getSimpleValueType();
33170 InNumElts = InVT.getVectorNumElements();
33171 }
33172 }
33173 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
33174 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
33175 EVT EltVT = InOp.getOperand(0).getValueType();
33176 SDValue FillVal =
33177 FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT);
33178 SmallVector<SDValue, 16> Ops(InOp->op_begin(), InOp->op_end());
33179 Ops.append(WidenNumElts - InNumElts, FillVal);
33180 return DAG.getBuildVector(NVT, dl, Ops);
33181 }
33182 SDValue FillVal =
33183 FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT);
33184 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp,
33185 DAG.getVectorIdxConstant(0, dl));
33186}
33187
33188 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
33189 SelectionDAG &DAG) {
33190 assert(Subtarget.hasAVX512() &&
33191 "MGATHER/MSCATTER are supported on AVX-512 arch only");
33192
33193 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
33194 SDValue Src = N->getValue();
33195 MVT VT = Src.getSimpleValueType();
33196 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33197 SDLoc dl(Op);
33198
33199 SDValue Scale = N->getScale();
33200 SDValue Index = N->getIndex();
33201 SDValue Mask = N->getMask();
33202 SDValue Chain = N->getChain();
33203 SDValue BasePtr = N->getBasePtr();
33204
33205 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33206 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33207 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33208 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33209 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33210 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33211 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33212 SDVTList VTs = DAG.getVTList(MVT::Other);
33213 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33214 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33215 N->getMemoryVT(), N->getMemOperand());
33216 }
33217 return SDValue();
33218 }
33219
33220 MVT IndexVT = Index.getSimpleValueType();
33221
33222 // If the index is v2i32, we're being called by type legalization and we
33223 // should just let the default handling take care of it.
33224 if (IndexVT == MVT::v2i32)
33225 return SDValue();
33226
33227 // If we don't have VLX and neither the data nor the index is 512 bits, we
33228 // need to widen until one of them is.
33229 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33230 !Index.getSimpleValueType().is512BitVector()) {
33231 // Determine how much we need to widen by to get a 512-bit type.
33232 unsigned Factor = std::min(512/VT.getSizeInBits(),
33233 512/IndexVT.getSizeInBits());
33234 unsigned NumElts = VT.getVectorNumElements() * Factor;
33235
33236 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33237 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33238 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33239
33240 Src = ExtendToType(Src, VT, DAG);
33241 Index = ExtendToType(Index, IndexVT, DAG);
33242 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33243 }
33244
33245 SDVTList VTs = DAG.getVTList(MVT::Other);
33246 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33247 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33248 N->getMemoryVT(), N->getMemOperand());
33249}
33250
33251static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33252 SelectionDAG &DAG) {
33253
33254 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33255 MVT VT = Op.getSimpleValueType();
33256 MVT ScalarVT = VT.getScalarType();
33257 SDValue Mask = N->getMask();
33258 MVT MaskVT = Mask.getSimpleValueType();
33259 SDValue PassThru = N->getPassThru();
33260 SDLoc dl(Op);
33261
33262 // Handle AVX masked loads which don't support passthru other than 0.
33263 if (MaskVT.getVectorElementType() != MVT::i1) {
33264 // We also allow undef in the isel pattern.
33265 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33266 return Op;
33267
33268 SDValue NewLoad = DAG.getMaskedLoad(
33269 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33270 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33271 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33272 N->isExpandingLoad());
33273 // Emit a blend.
33274 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33275 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33276 }
33277
33278 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33279 "Expanding masked load is supported on AVX-512 target only!");
33280
33281 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33282 "Expanding masked load is supported for 32 and 64-bit types only!");
33283
33284 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33285 "Cannot lower masked load op.");
33286
33287 assert((ScalarVT.getSizeInBits() >= 32 ||
33288 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33289 ScalarVT == MVT::f16))) &&
33290 "Unsupported masked load op.");
33291
33292 // This operation is legal for targets with VLX, but without
33293 // VLX the vector should be widened to 512 bits.
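// E.g. a masked load of v8i32 on AVX-512F without VLX is widened to a v16i32
// load with a v16i1 mask, and the low v8i32 half is extracted afterwards.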
33294 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33295 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33296 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33297
33298 // Mask element has to be i1.
33299 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33300 "Unexpected mask type");
33301
33302 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33303
33304 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33305 SDValue NewLoad = DAG.getMaskedLoad(
33306 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33307 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33308 N->getExtensionType(), N->isExpandingLoad());
33309
33310 SDValue Extract =
33311 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33312 DAG.getVectorIdxConstant(0, dl));
33313 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33314 return DAG.getMergeValues(RetOps, dl);
33315}
33316
33317static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33318 SelectionDAG &DAG) {
33319 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33320 SDValue DataToStore = N->getValue();
33321 MVT VT = DataToStore.getSimpleValueType();
33322 MVT ScalarVT = VT.getScalarType();
33323 SDValue Mask = N->getMask();
33324 SDLoc dl(Op);
33325
33326 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33327 "Compressing masked store is supported on AVX-512 targets only!");
33328
33329 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33330 "Compressing masked store is supported for 32 and 64-bit types only!");
33331
33332 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33333 "Cannot lower masked store op.");
33334
33335 assert((ScalarVT.getSizeInBits() >= 32 ||
33336 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33337 ScalarVT == MVT::f16))) &&
33338 "Unsupported masked store op.");
33339
33340 // This operation is legal for targets with VLX, but without
33341 // VLX the vector should be widened to 512 bits.
33342 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33343 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33344
33345 // Mask element has to be i1.
33346 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33347 "Unexpected mask type");
33348
33349 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33350
33351 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33352 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33353 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33354 N->getOffset(), Mask, N->getMemoryVT(),
33355 N->getMemOperand(), N->getAddressingMode(),
33356 N->isTruncatingStore(), N->isCompressingStore());
33357}
33358
33359static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33360 SelectionDAG &DAG) {
33361 assert(Subtarget.hasAVX2() &&
33362 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33363
33364 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33365 SDLoc dl(Op);
33366 MVT VT = Op.getSimpleValueType();
33367 SDValue Index = N->getIndex();
33368 SDValue Mask = N->getMask();
33369 SDValue PassThru = N->getPassThru();
33370 MVT IndexVT = Index.getSimpleValueType();
33371
33372 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33373
33374 // If the index is v2i32, we're being called by type legalization.
33375 if (IndexVT == MVT::v2i32)
33376 return SDValue();
33377
33378 // If we don't have VLX and neither the passthru nor the index is 512 bits,
33379 // we need to widen until one of them is.
33380 MVT OrigVT = VT;
33381 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33382 !IndexVT.is512BitVector()) {
33383 // Determine how much we need to widen by to get a 512-bit type.
33384 unsigned Factor = std::min(512/VT.getSizeInBits(),
33385 512/IndexVT.getSizeInBits());
33386
33387 unsigned NumElts = VT.getVectorNumElements() * Factor;
33388
33389 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33390 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33391 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33392
33393 PassThru = ExtendToType(PassThru, VT, DAG);
33394 Index = ExtendToType(Index, IndexVT, DAG);
33395 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33396 }
33397
33398 // Break dependency on the data register.
33399 if (PassThru.isUndef())
33400 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33401
33402 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33403 N->getScale() };
33404 SDValue NewGather = DAG.getMemIntrinsicNode(
33405 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33406 N->getMemOperand());
33407 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather,
33408 DAG.getVectorIdxConstant(0, dl));
33409 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33410}
33411
33412 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33413 SDLoc dl(Op);
33414 SDValue Src = Op.getOperand(0);
33415 MVT DstVT = Op.getSimpleValueType();
33416
33417 const AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33418 unsigned SrcAS = N->getSrcAddressSpace();
33419
33420 assert(SrcAS != N->getDestAddressSpace() &&
33421 "addrspacecast must be between different address spaces");
33422
33423 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33424 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33425 } else if (DstVT == MVT::i64) {
33426 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33427 } else if (DstVT == MVT::i32) {
33428 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33429 } else {
33430 report_fatal_error("Bad address space in addrspacecast");
33431 }
33432 return Op;
33433}
33434
33435SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33436 SelectionDAG &DAG) const {
33437 // TODO: Eventually, the lowering of these nodes should be informed by or
33438 // deferred to the GC strategy for the function in which they appear. For
33439 // now, however, they must be lowered to something. Since they are logically
33440 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33441 // require special handling for these nodes), lower them as literal NOOPs for
33442 // the time being.
33443 SmallVector<SDValue, 2> Ops;
33444 Ops.push_back(Op.getOperand(0));
33445 if (Op->getGluedNode())
33446 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33447
33448 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33449 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33450}
33451
33452// Custom split CVTPS2PH with wide types.
33453 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33454 SDLoc dl(Op);
33455 EVT VT = Op.getValueType();
33456 SDValue Lo, Hi;
33457 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33458 EVT LoVT, HiVT;
33459 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33460 SDValue RC = Op.getOperand(1);
33461 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33462 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33463 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33464}
33465
33466 static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
33467 SelectionDAG &DAG) {
33468 unsigned IsData = Op.getConstantOperandVal(4);
33469
33470 // We don't support non-data prefetch without PREFETCHI.
33471 // Just preserve the chain.
33472 if (!IsData && !Subtarget.hasPREFETCHI())
33473 return Op.getOperand(0);
33474
33475 return Op;
33476}
33477
33478 static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG) {
33479 SDNode *N = Op.getNode();
33480 SDValue Operand = N->getOperand(0);
33481 EVT VT = Operand.getValueType();
33482 SDLoc dl(N);
33483
33484 SDValue One = DAG.getConstantFP(1.0, dl, VT);
33485
33486 // TODO: Fix the crash for bf16 when generating strict_fmul, as it
33487 // leads to an error: SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
33488 // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
33489 // promote this operator's result!
33490 SDValue Chain = DAG.getEntryNode();
33491 SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
33492 {Chain, Operand, One});
33493 return StrictFmul;
33494}
33495
33496 static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33497 unsigned OpNo) {
33498 const APInt Operand(32, OpNo);
33499 std::string OpNoStr = llvm::toString(Operand, 10, false);
33500 std::string Str(" $");
33501
33502 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33503 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33504
33505 auto I = StringRef::npos;
33506 for (auto &AsmStr : AsmStrs) {
33507 // Match the OpNo string exactly, so that we do not accidentally match a
33508 // substring, e.g. "$12" contains "$1".
33509 if (AsmStr.ends_with(OpNoStr1))
33510 I = AsmStr.size() - OpNoStr1.size();
33511
33512 // Get the index of operand in AsmStr.
33513 if (I == StringRef::npos)
33514 I = AsmStr.find(OpNoStr1 + ",");
33515 if (I == StringRef::npos)
33516 I = AsmStr.find(OpNoStr2);
33517
33518 if (I == StringRef::npos)
33519 continue;
33520
33521 assert(I > 0 && "Unexpected inline asm string!");
33522 // Remove the operand string and label (if it exists).
33523 // For example:
33524 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33525 // ==>
33526 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33527 // ==>
33528 // "call dword ptr "
33529 auto TmpStr = AsmStr.substr(0, I);
33530 I = TmpStr.rfind(':');
33531 if (I != StringRef::npos)
33532 TmpStr = TmpStr.substr(I + 1);
33533 return TmpStr.take_while(llvm::isAlpha);
33534 }
33535
33536 return StringRef();
33537}
33538
33539 bool X86TargetLowering::isInlineAsmTargetBranch(
33540 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33541 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
33542 // changed from indirect TargetLowering::C_Memory to direct
33543 // TargetLowering::C_Address.
33544 // We don't need to special case LOOP* and Jcc, which cannot target a memory
33545 // location.
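// E.g. for the "call dword ptr ${0:P}" form shown in getInstrStrFromOpNo
// above, operand 0 is a branch target, so it is reported as C_Address instead
// of being treated as an ordinary C_Memory operand.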
33546 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
33547 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
33548}
33549
33550 static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL,
33551 SDValue Mask) {
33552 EVT Ty = MVT::i8;
33553 auto V = DAG.getBitcast(MVT::i1, Mask);
33554 auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
33555 auto Zero = DAG.getConstant(0, DL, Ty);
33556 SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
33557 auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
33558 return SDValue(CmpZero.getNode(), 1);
33559}
33560
33561 SDValue X86TargetLowering::visitMaskedLoad(
33562 SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
33563 SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
33564 // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
33565 // ->
33566 // _, flags = SUB 0, mask
33567 // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
33568 // bit_cast_to_vector<res>
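// E.g. a @llvm.masked.load.v1i64 whose mask is a single i1 becomes a CLOAD
// (a conditionally-faulting CFCMOV-style load) predicated on the COND_NE
// flags produced by the SUB above, with the scalarized passthru as fallback.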
33569 EVT VTy = PassThru.getValueType();
33570 EVT Ty = VTy.getVectorElementType();
33571 SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
33572 auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
33573 : DAG.getBitcast(Ty, PassThru);
33574 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33575 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33576 SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
33577 NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
33578 return DAG.getBitcast(VTy, NewLoad);
33579}
33580
33581 SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
33582 SDValue Chain,
33583 MachineMemOperand *MMO, SDValue Ptr,
33584 SDValue Val, SDValue Mask) const {
33585 // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
33586 // ->
33587 // _, flags = SUB 0, mask
33588 // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
33589 EVT Ty = Val.getValueType().getVectorElementType();
33590 SDVTList Tys = DAG.getVTList(MVT::Other);
33591 auto ScalarVal = DAG.getBitcast(Ty, Val);
33592 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33593 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33594 SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
33595 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
33596}
33597
33598/// Provide custom lowering hooks for some operations.
33599 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
33600 switch (Op.getOpcode()) {
33601 // clang-format off
33602 default: llvm_unreachable("Should not custom lower this!");
33603 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33604 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
33605 return LowerCMP_SWAP(Op, Subtarget, DAG);
33606 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33607 case ISD::ATOMIC_LOAD_ADD:
33608 case ISD::ATOMIC_LOAD_SUB:
33609 case ISD::ATOMIC_LOAD_OR:
33610 case ISD::ATOMIC_LOAD_XOR:
33611 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33612 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33613 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33614 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33615 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33616 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33617 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33618 case ISD::VECTOR_COMPRESS: return lowerVECTOR_COMPRESS(Op, Subtarget, DAG);
33619 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33620 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33621 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33622 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33623 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33624 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33625 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33626 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33627 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33628 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33629 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33630 case ISD::SHL_PARTS:
33631 case ISD::SRA_PARTS:
33632 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33633 case ISD::FSHL:
33634 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33635 case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG);
33636 case ISD::STRICT_SINT_TO_FP:
33637 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33638 case ISD::STRICT_UINT_TO_FP:
33639 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33640 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33641 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33642 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33643 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33644 case ISD::SIGN_EXTEND_VECTOR_INREG:
33645 case ISD::ZERO_EXTEND_VECTOR_INREG:
33646 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33647 case ISD::FP_TO_SINT:
33648 case ISD::STRICT_FP_TO_SINT:
33649 case ISD::FP_TO_UINT:
33650 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33651 case ISD::FP_TO_SINT_SAT:
33652 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33653 case ISD::FP_EXTEND:
33654 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33655 case ISD::FP_ROUND:
33656 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33657 case ISD::FP16_TO_FP:
33658 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33659 case ISD::FP_TO_FP16:
33660 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33661 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33662 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33663 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33664 case ISD::FADD:
33665 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33666 case ISD::FROUND: return LowerFROUND(Op, DAG);
33667 case ISD::FABS:
33668 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33669 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33670 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33671 case ISD::LRINT:
33672 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33673 case ISD::SETCC:
33674 case ISD::STRICT_FSETCC:
33675 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33676 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33677 case ISD::SELECT: return LowerSELECT(Op, DAG);
33678 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33679 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33680 case ISD::VASTART: return LowerVASTART(Op, DAG);
33681 case ISD::VAARG: return LowerVAARG(Op, DAG);
33682 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33683 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33684 case ISD::INTRINSIC_VOID:
33685 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33686 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33687 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33688 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33689 case ISD::FRAME_TO_ARGS_OFFSET:
33690 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33691 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33692 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33693 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33694 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33695 case ISD::EH_SJLJ_SETUP_DISPATCH:
33696 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33697 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33698 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33699 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33700 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33701 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
33702 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
33703 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
33704 case ISD::CTLZ:
33705 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33706 case ISD::CTTZ:
33707 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33708 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33709 case ISD::MULHS:
33710 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33711 case ISD::ROTL:
33712 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33713 case ISD::SRA:
33714 case ISD::SRL:
33715 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33716 case ISD::SADDO:
33717 case ISD::UADDO:
33718 case ISD::SSUBO:
33719 case ISD::USUBO: return LowerXALUO(Op, DAG);
33720 case ISD::SMULO:
33721 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33722 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33723 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33724 case ISD::SADDO_CARRY:
33725 case ISD::SSUBO_CARRY:
33726 case ISD::UADDO_CARRY:
33727 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
33728 case ISD::ADD:
33729 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33730 case ISD::UADDSAT:
33731 case ISD::SADDSAT:
33732 case ISD::USUBSAT:
33733 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33734 case ISD::SMAX:
33735 case ISD::SMIN:
33736 case ISD::UMAX:
33737 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33738 case ISD::FMINIMUM:
33739 case ISD::FMAXIMUM:
33740 case ISD::FMINIMUMNUM:
33741 case ISD::FMAXIMUMNUM:
33742 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
33743 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33744 case ISD::ABDS:
33745 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33746 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33747 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33748 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33749 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33750 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33751 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33752 case ISD::GC_TRANSITION_START:
33753 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33754 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33755 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33756 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
33757 // clang-format on
33758 }
33759}
33760
33761/// Replace a node with an illegal result type with a new node built out of
33762/// custom code.
33763 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
33764 SmallVectorImpl<SDValue> &Results,
33765 SelectionDAG &DAG) const {
33766 SDLoc dl(N);
33767 unsigned Opc = N->getOpcode();
33768 switch (Opc) {
33769 default:
33770#ifndef NDEBUG
33771 dbgs() << "ReplaceNodeResults: ";
33772 N->dump(&DAG);
33773#endif
33774 llvm_unreachable("Do not know how to custom type legalize this operation!");
33775 case X86ISD::CVTPH2PS: {
33776 EVT VT = N->getValueType(0);
33777 SDValue Lo, Hi;
33778 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33779 EVT LoVT, HiVT;
33780 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33781 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33782 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33783 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33784 Results.push_back(Res);
33785 return;
33786 }
33787 case X86ISD::STRICT_CVTPH2PS: {
33788 EVT VT = N->getValueType(0);
33789 SDValue Lo, Hi;
33790 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33791 EVT LoVT, HiVT;
33792 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33793 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33794 {N->getOperand(0), Lo});
33795 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33796 {N->getOperand(0), Hi});
33797 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33798 Lo.getValue(1), Hi.getValue(1));
33799 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33800 Results.push_back(Res);
33801 Results.push_back(Chain);
33802 return;
33803 }
33804 case X86ISD::CVTPS2PH:
33805 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33806 return;
33807 case ISD::CTPOP: {
33808 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33809 // If we have at most 32 active bits, then perform as i32 CTPOP.
33810 // TODO: Perform this in generic legalizer?
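// E.g. an i64 known to have at least 32 leading zeros and 8 trailing zeros
// (so at most 24 unknown bits) is shifted right by 8, truncated to i32,
// popcounted as i32, and zero-extended back to i64.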
33811 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
33812 unsigned LZ = Known.countMinLeadingZeros();
33813 unsigned TZ = Known.countMinTrailingZeros();
33814 if ((LZ + TZ) >= 32) {
33815 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
33816 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
33817 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
33818 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
33819 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
33820 Results.push_back(Op);
33821 return;
33822 }
33823 // Use a v2i64 if possible.
33824 bool NoImplicitFloatOps =
33825 DAG.getMachineFunction().getFunction().hasFnAttribute(
33826 Attribute::NoImplicitFloat);
33827 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33828 SDValue Wide =
33829 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33830 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33831 // Bit count should fit in 32-bits, extract it as that and then zero
33832 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33833 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33834 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33835 DAG.getVectorIdxConstant(0, dl));
33836 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33837 Results.push_back(Wide);
33838 }
33839 return;
33840 }
33841 case ISD::MUL: {
33842 EVT VT = N->getValueType(0);
33843 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33844 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33845 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33846 // elements are needed.
33847 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33848 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33849 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33850 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33851 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33852 unsigned NumConcats = 16 / VT.getVectorNumElements();
33853 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33854 ConcatOps[0] = Res;
33855 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33856 Results.push_back(Res);
33857 return;
33858 }
33859 case ISD::SMULO:
33860 case ISD::UMULO: {
33861 EVT VT = N->getValueType(0);
33862 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33863 VT == MVT::v2i32 && "Unexpected VT!");
33864 bool IsSigned = Opc == ISD::SMULO;
33865 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
33866 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33867 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33868 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33869 // Extract the high 32 bits from each result using PSHUFD.
33870 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
33871 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33872 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33873 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33874 DAG.getVectorIdxConstant(0, dl));
33875
33876 // Truncate the low bits of the result. This will become PSHUFD.
33877 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33878
33879 SDValue HiCmp;
33880 if (IsSigned) {
33881 // SMULO overflows if the high bits don't match the sign of the low.
33882 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33883 } else {
33884 // UMULO overflows if the high bits are non-zero.
33885 HiCmp = DAG.getConstant(0, dl, VT);
33886 }
33887 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33888
33889 // Widen the result by padding with undef.
33890 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33891 DAG.getUNDEF(VT));
33892 Results.push_back(Res);
33893 Results.push_back(Ovf);
33894 return;
33895 }
33896 case X86ISD::VPMADDWD: {
33897 // Legalize types for X86ISD::VPMADDWD by widening.
33898 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33899
33900 EVT VT = N->getValueType(0);
33901 EVT InVT = N->getOperand(0).getValueType();
33902 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33903 "Expected a VT that divides into 128 bits.");
33905 "Unexpected type action!");
33906 unsigned NumConcat = 128 / InVT.getSizeInBits();
33907
33908 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33909 InVT.getVectorElementType(),
33910 NumConcat * InVT.getVectorNumElements());
33911 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33912 VT.getVectorElementType(),
33913 NumConcat * VT.getVectorNumElements());
33914
33915 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33916 Ops[0] = N->getOperand(0);
33917 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33918 Ops[0] = N->getOperand(1);
33919 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33920
33921 SDValue Res = DAG.getNode(Opc, dl, WideVT, InVec0, InVec1);
33922 Results.push_back(Res);
33923 return;
33924 }
33925 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33926 case X86ISD::FMINC:
33927 case X86ISD::FMIN:
33928 case X86ISD::FMAXC:
33929 case X86ISD::FMAX:
33930 case X86ISD::STRICT_FMIN:
33931 case X86ISD::STRICT_FMAX: {
33932 EVT VT = N->getValueType(0);
33933 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33934 bool IsStrict = Opc == X86ISD::STRICT_FMIN || Opc == X86ISD::STRICT_FMAX;
33935 SDValue UNDEF = DAG.getUNDEF(VT);
33936 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33937 N->getOperand(IsStrict ? 1 : 0), UNDEF);
33938 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33939 N->getOperand(IsStrict ? 2 : 1), UNDEF);
33940 SDValue Res;
33941 if (IsStrict)
33942 Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33943 {N->getOperand(0), LHS, RHS});
33944 else
33945 Res = DAG.getNode(Opc, dl, MVT::v4f32, LHS, RHS);
33946 Results.push_back(Res);
33947 if (IsStrict)
33948 Results.push_back(Res.getValue(1));
33949 return;
33950 }
33951 case ISD::SDIV:
33952 case ISD::UDIV:
33953 case ISD::SREM:
33954 case ISD::UREM: {
33955 EVT VT = N->getValueType(0);
33956 if (VT.isVector()) {
33958 "Unexpected type action!");
33959 // If this RHS is a constant splat vector we can widen this and let
33960 // division/remainder by constant optimize it.
33961 // TODO: Can we do something for non-splat?
33962 APInt SplatVal;
33963 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33964 unsigned NumConcats = 128 / VT.getSizeInBits();
33965 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33966 Ops0[0] = N->getOperand(0);
33967 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33968 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33969 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33970 SDValue Res = DAG.getNode(Opc, dl, ResVT, N0, N1);
33971 Results.push_back(Res);
33972 }
33973 return;
33974 }
33975
33976 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33977 Results.push_back(V);
33978 return;
33979 }
33980 case ISD::TRUNCATE: {
33981 MVT VT = N->getSimpleValueType(0);
33982 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33983 return;
33984
33985 // The generic legalizer will try to widen the input type to the same
33986 // number of elements as the widened result type. But this isn't always
33987 // the best choice, so do some custom legalization to avoid certain cases.
33988 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
33989 SDValue In = N->getOperand(0);
33990 EVT InVT = In.getValueType();
33991 EVT InEltVT = InVT.getVectorElementType();
33992 EVT EltVT = VT.getVectorElementType();
33993 unsigned MinElts = VT.getVectorNumElements();
33994 unsigned WidenNumElts = WidenVT.getVectorNumElements();
33995 unsigned InBits = InVT.getSizeInBits();
33996
33997 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
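// E.g. a v8i32 -> v8i16 truncate whose source bits are known to already be
// sign- or zero-extended from 16 bits can be lowered to a single
// PACKSSDW/PACKUSDW rather than a generic shuffle-based truncate.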
33998 unsigned PackOpcode;
33999 if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG,
34000 Subtarget, N->getFlags())) {
34001 if (SDValue Res =
34002 truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) {
34003 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
34004 Results.push_back(Res);
34005 return;
34006 }
34007 }
34008
34009 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
34010 // 128-bit and smaller inputs should avoid the truncate altogether and
34011 // use a shuffle instead.
34012 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
34013 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
34014 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
34015 for (unsigned I = 0; I < MinElts; ++I)
34016 TruncMask[I] = Scale * I;
34017 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
34018 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
34019 "Illegal vector type in truncation");
34020 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
34021 Results.push_back(
34022 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
34023 return;
34024 }
34025 }
34026
34027 // With AVX512 there are some cases that can use a target specific
34028 // truncate node to go from 256/512 to less than 128 with zeros in the
34029 // upper elements of the 128 bit result.
34030 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34031 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
34032 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34033 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34034 return;
34035 }
34036 // There's one case we can widen to 512 bits and use VTRUNC.
34037 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34038 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34039 DAG.getUNDEF(MVT::v4i64));
34040 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34041 return;
34042 }
34043 }
34044 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34045 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34046 isTypeLegal(MVT::v4i64)) {
34047 // Input needs to be split and output needs to widened. Let's use two
34048 // VTRUNCs, and shuffle their results together into the wider type.
34049 SDValue Lo, Hi;
34050 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34051
34052 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34053 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34054 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34055 { 0, 1, 2, 3, 16, 17, 18, 19,
34056 -1, -1, -1, -1, -1, -1, -1, -1 });
34057 Results.push_back(Res);
34058 return;
34059 }
34060
34061 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
34062 // this via type legalization.
34063 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
34064 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
34065 (!Subtarget.hasSSSE3() ||
34066 (!isTypeLegal(InVT) &&
34067 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
34068 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
34069 InEltVT.getSizeInBits() * WidenNumElts);
34070 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
34071 return;
34072 }
34073
34074 return;
34075 }
34076 case ISD::ANY_EXTEND:
34077 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34078 // It's intended to custom handle the input type.
34079 assert(N->getValueType(0) == MVT::v8i8 &&
34080 "Do not know how to legalize this Node");
34081 return;
34082 case ISD::SIGN_EXTEND:
34083 case ISD::ZERO_EXTEND: {
34084 EVT VT = N->getValueType(0);
34085 SDValue In = N->getOperand(0);
34086 EVT InVT = In.getValueType();
34087 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34088 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34090 "Unexpected type action!");
34091 assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
34092 // Custom split this so we can extend i8/i16->i32 invec. This is better
34093 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
34094 // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
34095 // we allow the sra from the extend to i32 to be shared by the split.
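// E.g. for v4i8 -> v4i64: the input is sign-extended once to v4i32, the
// per-element sign mask is computed as (0 > x), and interleaving
// {x0,s0,x1,s1} and {x2,s2,x3,s3} yields the two v2i64 halves directly.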
34096 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34097
34098 // Fill a vector with sign bits for each element.
34099 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34100 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34101
34102 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34103 // to v2i64.
34104 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34105 {0, 4, 1, 5});
34106 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34107 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34108 {2, 6, 3, 7});
34109 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34110
34111 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34112 Results.push_back(Res);
34113 return;
34114 }
34115
34116 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34117 if (!InVT.is128BitVector()) {
34118 // Not a 128 bit vector, but maybe type legalization will promote
34119 // it to 128 bits.
34120 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34121 return;
34122 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34123 if (!InVT.is128BitVector())
34124 return;
34125
34126 // Promote the input to 128 bits. Type legalization will turn this into
34127 // zext_inreg/sext_inreg.
34128 In = DAG.getNode(Opc, dl, InVT, In);
34129 }
34130
34131 // Perform custom splitting instead of the two stage extend we would get
34132 // by default.
34133 EVT LoVT, HiVT;
34134 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34135 assert(isTypeLegal(LoVT) && "Split VT not legal?");
34136
34137 SDValue Lo = getEXTEND_VECTOR_INREG(Opc, dl, LoVT, In, DAG);
34138
34139 // We need to shift the input over by half the number of elements.
34140 unsigned NumElts = InVT.getVectorNumElements();
34141 unsigned HalfNumElts = NumElts / 2;
34142 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34143 for (unsigned i = 0; i != HalfNumElts; ++i)
34144 ShufMask[i] = i + HalfNumElts;
34145
34146 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34147 Hi = getEXTEND_VECTOR_INREG(Opc, dl, HiVT, Hi, DAG);
34148
34149 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34150 Results.push_back(Res);
34151 }
34152 return;
34153 }
34154 case ISD::FP_TO_SINT_SAT:
34155 case ISD::FP_TO_UINT_SAT: {
34156 if (!Subtarget.hasAVX10_2())
34157 return;
34158
34159 bool IsSigned = Opc == ISD::FP_TO_SINT_SAT;
34160 EVT VT = N->getValueType(0);
34161 SDValue Op = N->getOperand(0);
34162 EVT OpVT = Op.getValueType();
34163 SDValue Res;
34164
34165 if (VT == MVT::v2i32 && OpVT == MVT::v2f64) {
34166 if (IsSigned)
34167 Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op);
34168 else
34169 Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op);
34170 Results.push_back(Res);
34171 }
34172 return;
34173 }
34174 case ISD::FP_TO_SINT:
34175 case ISD::STRICT_FP_TO_SINT:
34176 case ISD::FP_TO_UINT:
34177 case ISD::STRICT_FP_TO_UINT: {
34178 bool IsStrict = N->isStrictFPOpcode();
34179 bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
34180 EVT VT = N->getValueType(0);
34181 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34182 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34183 EVT SrcVT = Src.getValueType();
34184
34185 SDValue Res;
34186 if (isSoftF16(SrcVT, Subtarget)) {
34187 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34188 if (IsStrict) {
34189 Res =
34190 DAG.getNode(Opc, dl, {VT, MVT::Other},
34191 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34192 {NVT, MVT::Other}, {Chain, Src})});
34193 Chain = Res.getValue(1);
34194 } else {
34195 Res =
34196 DAG.getNode(Opc, dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34197 }
34198 Results.push_back(Res);
34199 if (IsStrict)
34200 Results.push_back(Chain);
34201
34202 return;
34203 }
34204
34205 if (VT.isVector() && Subtarget.hasFP16() && Subtarget.hasVLX() &&
34206 SrcVT.getVectorElementType() == MVT::f16) {
34207 EVT EleVT = VT.getVectorElementType();
34208 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34209
34210 if (SrcVT != MVT::v8f16) {
34211 SDValue Tmp =
34212 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34213 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34214 Ops[0] = Src;
34215 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34216 }
34217
34218 if (IsStrict) {
34219 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34220 Res =
34221 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34222 Chain = Res.getValue(1);
34223 } else {
34224 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34225 Res = DAG.getNode(Opc, dl, ResVT, Src);
34226 }
34227
34228 // TODO: Need to add exception check code for strict FP.
34229 if (EleVT.getSizeInBits() < 16) {
34230 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34231 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34232
34233 // Now widen to 128 bits.
34234 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34235 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34236 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34237 ConcatOps[0] = Res;
34238 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34239 }
34240
34241 Results.push_back(Res);
34242 if (IsStrict)
34243 Results.push_back(Chain);
34244
34245 return;
34246 }
34247
34248 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34249 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34250 "Unexpected type action!");
34251
34252 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34253 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34254 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34255 VT.getVectorNumElements());
34256 SDValue Res;
34257 SDValue Chain;
34258 if (IsStrict) {
34259 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34260 {N->getOperand(0), Src});
34261 Chain = Res.getValue(1);
34262 } else
34263 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34264
34265 // Preserve what we know about the size of the original result. If the
34266 // result is v2i32, we have to manually widen the assert.
34267 if (PromoteVT == MVT::v2i32)
34268 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34269 DAG.getUNDEF(MVT::v2i32));
34270
34271 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34272 Res.getValueType(), Res,
34273 DAG.getValueType(VT.getVectorElementType()));
34274
34275 if (PromoteVT == MVT::v2i32)
34276 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34277 DAG.getVectorIdxConstant(0, dl));
34278
34279 // Truncate back to the original width.
34280 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34281
34282 // Now widen to 128 bits.
34283 unsigned NumConcats = 128 / VT.getSizeInBits();
34284 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
34285 VT.getVectorNumElements() * NumConcats);
34286 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34287 ConcatOps[0] = Res;
34288 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34289 Results.push_back(Res);
34290 if (IsStrict)
34291 Results.push_back(Chain);
34292 return;
34293 }
34294
34295
34296 if (VT == MVT::v2i32) {
34297 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34298 "Strict unsigned conversion requires AVX512");
34299 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34300 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34301 "Unexpected type action!");
34302 if (Src.getValueType() == MVT::v2f64) {
34303 if (!IsSigned && !Subtarget.hasAVX512()) {
34304 SDValue Res =
34305 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34306 Results.push_back(Res);
34307 return;
34308 }
34309
34310 if (IsStrict)
34311 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34312 else
34313 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34314
34315 // If we have VLX we can emit a target specific FP_TO_UINT node.
34316 if (!IsSigned && !Subtarget.hasVLX()) {
34317 // Otherwise we can defer to the generic legalizer which will widen
34318 // the input as well. This will be further widened during op
34319 // legalization to v8i32<-v8f64.
34320 // For strict nodes we'll need to widen ourselves.
34321 // FIXME: Fix the type legalizer to safely widen strict nodes?
34322 if (!IsStrict)
34323 return;
34324 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34325 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34326 Opc = N->getOpcode();
34327 }
34328 SDValue Res;
34329 SDValue Chain;
34330 if (IsStrict) {
34331 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34332 {N->getOperand(0), Src});
34333 Chain = Res.getValue(1);
34334 } else {
34335 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34336 }
34337 Results.push_back(Res);
34338 if (IsStrict)
34339 Results.push_back(Chain);
34340 return;
34341 }
34342
34343 // Custom widen strict v2f32->v2i32 by padding with zeros.
34344 // FIXME: Should generic type legalizer do this?
34345 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34346 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34347 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34348 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34349 {N->getOperand(0), Src});
34350 Results.push_back(Res);
34351 Results.push_back(Res.getValue(1));
34352 return;
34353 }
34354
34355 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34356 // so early out here.
34357 return;
34358 }
34359
34360 assert(!VT.isVector() && "Vectors should have been handled above!");
34361
34362 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34363 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34364 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34365 assert(!Subtarget.is64Bit() && "i64 should be legal");
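// i64 is not a legal scalar type here, but the AVX512DQ/FP16 packed
// conversions can still produce a 64-bit integer lane, so wrap the scalar in
// a vector, convert, and extract element 0 below.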
34366 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34367 // If we use a 128-bit result we might need to use a target specific node.
34368 unsigned SrcElts =
34369 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34370 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34371 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34372 if (NumElts != SrcElts) {
34373 if (IsStrict)
34374 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34375 else
34376 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34377 }
34378
34379 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
34380 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34381 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34382 ZeroIdx);
34383 SDValue Chain;
34384 if (IsStrict) {
34385 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34386 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34387 Chain = Res.getValue(1);
34388 } else
34389 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34390 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34391 Results.push_back(Res);
34392 if (IsStrict)
34393 Results.push_back(Chain);
34394 return;
34395 }
34396
34397 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34398 SDValue Chain;
34399 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34400 Results.push_back(V);
34401 if (IsStrict)
34402 Results.push_back(Chain);
34403 return;
34404 }
34405
34406 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34407 Results.push_back(V);
34408 if (IsStrict)
34409 Results.push_back(Chain);
34410 }
34411 return;
34412 }
34413 case ISD::LRINT:
34414 if (N->getValueType(0) == MVT::v2i32) {
34415 SDValue Src = N->getOperand(0);
34416 if (Subtarget.hasFP16() && Src.getValueType() == MVT::v2f16) {
34417 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src,
34418 DAG.getUNDEF(MVT::v2f16));
34419 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Src,
34420 DAG.getUNDEF(MVT::v4f16));
34421 } else if (Src.getValueType() != MVT::v2f64) {
34422 return;
34423 }
34424 Results.push_back(DAG.getNode(X86ISD::CVTP2SI, dl, MVT::v4i32, Src));
34425 return;
34426 }
34427 [[fallthrough]];
34428 case ISD::LLRINT: {
34429 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34430 Results.push_back(V);
34431 return;
34432 }
34433
34434 case ISD::SINT_TO_FP:
34435 case ISD::STRICT_SINT_TO_FP:
34436 case ISD::UINT_TO_FP:
34437 case ISD::STRICT_UINT_TO_FP: {
34438 bool IsStrict = N->isStrictFPOpcode();
34439 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
34440 EVT VT = N->getValueType(0);
34441 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34442 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34443 Subtarget.hasVLX()) {
34444 if (Src.getValueType().getVectorElementType() == MVT::i16)
34445 return;
34446
34447 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34448 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34449 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34450 : DAG.getUNDEF(MVT::v2i32));
34451 if (IsStrict) {
34452 unsigned Opc =
34453 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34454 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34455 {N->getOperand(0), Src});
34456 Results.push_back(Res);
34457 Results.push_back(Res.getValue(1));
34458 } else {
34459 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34460 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34461 }
34462 return;
34463 }
34464 if (VT != MVT::v2f32)
34465 return;
34466 EVT SrcVT = Src.getValueType();
34467 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34468 if (IsStrict) {
34469 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34470 : X86ISD::STRICT_CVTUI2P;
34471 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34472 {N->getOperand(0), Src});
34473 Results.push_back(Res);
34474 Results.push_back(Res.getValue(1));
34475 } else {
34476 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34477 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34478 }
34479 return;
34480 }
34481 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34482 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
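// There is no unsigned i64 -> f32 conversion without AVX512, so emulate it:
// inputs with the sign bit set are halved (Src >> 1, with the dropped bit
// OR'ed back into the LSB so rounding stays correct), converted as signed,
// and then doubled with an FADD; the final select keeps the direct signed
// conversion for small values and the halve-and-double result for large ones.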
34483 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34484 SDValue One = DAG.getConstant(1, dl, SrcVT);
34485 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34486 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34487 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34488 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34489 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34490 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34491 for (int i = 0; i != 2; ++i) {
34492 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34493 SignSrc, DAG.getVectorIdxConstant(i, dl));
34494 if (IsStrict)
34495 SignCvts[i] =
34496 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34497 {N->getOperand(0), Elt});
34498 else
34499 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34500 }
34501 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34502 SDValue Slow, Chain;
34503 if (IsStrict) {
34504 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34505 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34506 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34507 {Chain, SignCvt, SignCvt});
34508 Chain = Slow.getValue(1);
34509 } else {
34510 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34511 }
34512 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34513 IsNeg =
34514 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34515 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34516 Results.push_back(Cvt);
34517 if (IsStrict)
34518 Results.push_back(Chain);
34519 return;
34520 }
34521
34522 if (SrcVT != MVT::v2i32)
34523 return;
34524
34525 if (IsSigned || Subtarget.hasAVX512()) {
34526 if (!IsStrict)
34527 return;
34528
34529 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34530 // FIXME: Should generic type legalizer do this?
34531 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34532 DAG.getConstant(0, dl, MVT::v2i32));
34533 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34534 {N->getOperand(0), Src});
34535 Results.push_back(Res);
34536 Results.push_back(Res.getValue(1));
34537 return;
34538 }
34539
34540 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
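// Emulate unsigned v2i32 -> v2f32 via f64 with the classic 2^52 trick:
// 0x4330000000000000 is the bit pattern of the double 2^52, so OR'ing the
// zero-extended 32-bit value into its mantissa yields exactly 2^52 + x, and
// subtracting 2^52 recovers x as an exact f64 before rounding down to f32.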
34541 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34542 SDValue VBias = DAG.getConstantFP(
34543 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34544 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34545 DAG.getBitcast(MVT::v2i64, VBias));
34546 Or = DAG.getBitcast(MVT::v2f64, Or);
34547 if (IsStrict) {
34548 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34549 {N->getOperand(0), Or, VBias});
34550 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34551 {MVT::v4f32, MVT::Other},
34552 {Sub.getValue(1), Sub});
34553 Results.push_back(Res);
34554 Results.push_back(Res.getValue(1));
34555 } else {
34556 // TODO: Are there any fast-math-flags to propagate here?
34557 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34558 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34559 }
34560 return;
34561 }
34562 case ISD::STRICT_FP_ROUND:
34563 case ISD::FP_ROUND: {
34564 bool IsStrict = N->isStrictFPOpcode();
34565 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34566 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34567 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34568 EVT SrcVT = Src.getValueType();
34569 EVT VT = N->getValueType(0);
34570 SDValue V;
34571 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34572 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34573 : DAG.getUNDEF(MVT::v2f32);
34574 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34575 }
34576 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34577 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34578 if (SrcVT.getVectorElementType() != MVT::f32)
34579 return;
34580
34581 if (IsStrict)
34582 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34583 {Chain, Src, Rnd});
34584 else
34585 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34586
34587 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34588 if (IsStrict)
34589 Results.push_back(V.getValue(1));
34590 return;
34591 }
34592 if (!isTypeLegal(Src.getValueType()))
34593 return;
34594 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34595 if (IsStrict)
34596 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34597 {Chain, Src});
34598 else
34599 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34600 Results.push_back(V);
34601 if (IsStrict)
34602 Results.push_back(V.getValue(1));
34603 return;
34604 }
34605 case ISD::FP_EXTEND:
34606 case ISD::STRICT_FP_EXTEND: {
34607 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34608 // No other ValueType for FP_EXTEND should reach this point.
34609 assert(N->getValueType(0) == MVT::v2f32 &&
34610 "Do not know how to legalize this Node");
34611 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34612 return;
34613 bool IsStrict = N->isStrictFPOpcode();
34614 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34615 if (Src.getValueType().getVectorElementType() != MVT::f16)
34616 return;
34617 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34618 : DAG.getUNDEF(MVT::v2f16);
34619 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34620 if (IsStrict)
34621 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34622 {N->getOperand(0), V});
34623 else
34624 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34625 Results.push_back(V);
34626 if (IsStrict)
34627 Results.push_back(V.getValue(1));
34628 return;
34629 }
34630 case ISD::INTRINSIC_W_CHAIN: {
34631 unsigned IntNo = N->getConstantOperandVal(1);
34632 switch (IntNo) {
34633 default : llvm_unreachable("Do not know how to custom type "
34634 "legalize this intrinsic operation!");
34635 case Intrinsic::x86_rdtsc:
34636 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34637 Results);
34638 case Intrinsic::x86_rdtscp:
34639 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34640 Results);
34641 case Intrinsic::x86_rdpmc:
34642 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34643 Results);
34644 return;
34645 case Intrinsic::x86_rdpru:
34646 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34647 Results);
34648 return;
34649 case Intrinsic::x86_xgetbv:
34650 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34651 Results);
34652 return;
34653 }
34654 }
34655 case ISD::READCYCLECOUNTER: {
34656 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34657 }
34658 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34659 EVT T = N->getValueType(0);
34660 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34661 bool Regs64bit = T == MVT::i128;
34662 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34663 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34664 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
34665 SDValue cpInL, cpInH;
34666 std::tie(cpInL, cpInH) =
34667 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34668 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34669 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34670 cpInH =
34671 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34672 cpInH, cpInL.getValue(1));
34673 SDValue swapInL, swapInH;
34674 std::tie(swapInL, swapInH) =
34675 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34676 swapInH =
34677 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34678 swapInH, cpInH.getValue(1));
34679
34680 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34681 // until later. So we keep the RBX input in a vreg and use a custom
34682 // inserter.
34683 // Since RBX will be a reserved register, the register allocator will not
34684 // make sure its value is properly saved and restored around this
34685 // live-range.
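// CMPXCHG8B/CMPXCHG16B compare the memory operand against EDX:EAX (RDX:RAX)
// and, if equal, store ECX:EBX (RCX:RBX) and set ZF; the previous value is
// returned in EDX:EAX (RDX:RAX). That register protocol is why the expected
// and desired values are copied into those fixed registers here and the
// result and success flag are read back from them below.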
34686 SDValue Result;
34687 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34688 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34689 if (Regs64bit) {
34690 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34691 swapInH.getValue(1)};
34692 Result =
34693 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34694 } else {
34695 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34696 swapInH.getValue(1));
34697 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34698 swapInL.getValue(1)};
34699 Result =
34700 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34701 }
34702
34703 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34704 Regs64bit ? X86::RAX : X86::EAX,
34705 HalfT, Result.getValue(1));
34706 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34707 Regs64bit ? X86::RDX : X86::EDX,
34708 HalfT, cpOutL.getValue(2));
34709 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34710
34711 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34712 MVT::i32, cpOutH.getValue(2));
34713 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34714 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34715
34716 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34717 Results.push_back(Success);
34718 Results.push_back(EFLAGS.getValue(1));
34719 return;
34720 }
34721 case ISD::ATOMIC_LOAD: {
34722 assert(
34723 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
34724 "Unexpected VT!");
34725 bool NoImplicitFloatOps =
34726 DAG.getMachineFunction().getFunction().hasFnAttribute(
34727 Attribute::NoImplicitFloat);
34728 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34729 auto *Node = cast<AtomicSDNode>(N);
34730
34731 if (N->getValueType(0) == MVT::i128) {
34732 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
34733 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
34734 Node->getBasePtr(), Node->getMemOperand());
34735 SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34736 DAG.getVectorIdxConstant(0, dl));
34737 SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34738 DAG.getVectorIdxConstant(1, dl));
34739 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
34740 {ResL, ResH}));
34741 Results.push_back(Ld.getValue(1));
34742 return;
34743 }
34744 break;
34745 }
34746 if (Subtarget.hasSSE1()) {
34747 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34748 // Then extract the lower 64-bits.
34749 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34750 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34751 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34752 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34753 MVT::i64, Node->getMemOperand());
34754 if (Subtarget.hasSSE2()) {
34755 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34756 DAG.getVectorIdxConstant(0, dl));
34757 Results.push_back(Res);
34758 Results.push_back(Ld.getValue(1));
34759 return;
34760 }
34761 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34762 // then casts to i64. This avoids a 128-bit stack temporary being
34763 // created by type legalization if we were to cast v4f32->v2i64.
34764 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34765 DAG.getVectorIdxConstant(0, dl));
34766 Res = DAG.getBitcast(MVT::i64, Res);
34767 Results.push_back(Res);
34768 Results.push_back(Ld.getValue(1));
34769 return;
34770 }
34771 if (Subtarget.hasX87()) {
34772 // First load this into an 80-bit X87 register. This will put the whole
34773 // integer into the significand.
34774 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34775 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34776 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34777 dl, Tys, Ops, MVT::i64,
34778 Node->getMemOperand());
34779 SDValue Chain = Result.getValue(1);
34780
34781 // Now store the X87 register to a stack temporary and convert to i64.
34782 // This store is not atomic and doesn't need to be.
34783 // FIXME: We don't need a stack temporary if the result of the load
34784 // is already being stored. We could just directly store there.
34785 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34786 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34787 MachinePointerInfo MPI =
34788 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34789 SDValue StoreOps[] = { Chain, Result, StackPtr };
34790 Chain = DAG.getMemIntrinsicNode(
34791 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34792 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34793
34794 // Finally load the value back from the stack temporary and return it.
34795 // This load is not atomic and doesn't need to be.
34796 // This load will be further type legalized.
34797 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34798 Results.push_back(Result);
34799 Results.push_back(Result.getValue(1));
34800 return;
34801 }
34802 }
34803 // TODO: Use MOVLPS when SSE1 is available?
34804 // Delegate to generic TypeLegalization. Situations we can really handle
34805 // should have already been dealt with by AtomicExpandPass.cpp.
34806 break;
34807 }
34808 case ISD::ATOMIC_SWAP:
34809 case ISD::ATOMIC_LOAD_ADD:
34810 case ISD::ATOMIC_LOAD_SUB:
34811 case ISD::ATOMIC_LOAD_AND:
34812 case ISD::ATOMIC_LOAD_OR:
34813 case ISD::ATOMIC_LOAD_XOR:
34814 case ISD::ATOMIC_LOAD_NAND:
34815 case ISD::ATOMIC_LOAD_MIN:
34816 case ISD::ATOMIC_LOAD_MAX:
34817 case ISD::ATOMIC_LOAD_UMIN:
34818 case ISD::ATOMIC_LOAD_UMAX:
34819 // Delegate to generic TypeLegalization. Situations we can really handle
34820 // should have already been dealt with by AtomicExpandPass.cpp.
34821 break;
34822
34823 case ISD::BITCAST: {
34824 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34825 EVT DstVT = N->getValueType(0);
34826 EVT SrcVT = N->getOperand(0).getValueType();
34827
34828 // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target
34829 // we can split using the k-register rather than memory.
34830 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34831 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34832 SDValue Lo, Hi;
34833 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34834 Lo = DAG.getBitcast(MVT::i32, Lo);
34835 Hi = DAG.getBitcast(MVT::i32, Hi);
34836 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34837 Results.push_back(Res);
34838 return;
34839 }
34840
34841 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34842 // FIXME: Use v4f32 for SSE1?
34843 assert(Subtarget.hasSSE2() && "Requires SSE2");
34844 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34845 "Unexpected type action!");
34846 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34847 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34848 N->getOperand(0));
34849 Res = DAG.getBitcast(WideVT, Res);
34850 Results.push_back(Res);
34851 return;
34852 }
34853
34854 return;
34855 }
34856 case ISD::MGATHER: {
34857 EVT VT = N->getValueType(0);
34858 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34859 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34860 auto *Gather = cast<MaskedGatherSDNode>(N);
34861 SDValue Index = Gather->getIndex();
34862 if (Index.getValueType() != MVT::v2i64)
34863 return;
34864 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34865 "Unexpected type action!");
34866 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34867 SDValue Mask = Gather->getMask();
34868 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34869 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34870 Gather->getPassThru(),
34871 DAG.getUNDEF(VT));
34872 if (!Subtarget.hasVLX()) {
34873 // We need to widen the mask, but the instruction will only use 2
34874 // of its elements. So we can use undef.
34875 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34876 DAG.getUNDEF(MVT::v2i1));
34877 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34878 }
34879 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34880 Gather->getBasePtr(), Index, Gather->getScale() };
34881 SDValue Res = DAG.getMemIntrinsicNode(
34882 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34883 Gather->getMemoryVT(), Gather->getMemOperand());
34884 Results.push_back(Res);
34885 Results.push_back(Res.getValue(1));
34886 return;
34887 }
34888 return;
34889 }
34890 case ISD::LOAD: {
34891 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34892 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
34893 // cast since type legalization will try to use an i64 load.
34894 MVT VT = N->getSimpleValueType(0);
34895 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34896 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34897 "Unexpected type action!");
34898 if (!ISD::isNON_EXTLoad(N))
34899 return;
34900 auto *Ld = cast<LoadSDNode>(N);
34901 if (Subtarget.hasSSE2()) {
34902 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34903 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34904 Ld->getPointerInfo(), Ld->getBaseAlign(),
34905 Ld->getMemOperand()->getFlags());
34906 SDValue Chain = Res.getValue(1);
34907 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34908 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34909 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34910 Res = DAG.getBitcast(WideVT, Res);
34911 Results.push_back(Res);
34912 Results.push_back(Chain);
34913 return;
34914 }
34915 assert(Subtarget.hasSSE1() && "Expected SSE");
34916 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34917 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34918 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34919 MVT::i64, Ld->getMemOperand());
34920 Results.push_back(Res);
34921 Results.push_back(Res.getValue(1));
34922 return;
34923 }
34924 case ISD::ADDRSPACECAST: {
34925 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34926 Results.push_back(V);
34927 return;
34928 }
34929 case ISD::BITREVERSE: {
34930 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34931 assert((Subtarget.hasXOP() || Subtarget.hasGFNI()) && "Expected XOP/GFNI");
34932 // We can use VPPERM/GF2P8AFFINEQB by copying to a vector register and back.
34933 // We'll need to move the scalar in two i32 pieces.
34934 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34935 return;
34936 }
34937 case ISD::EXTRACT_VECTOR_ELT: {
34938 // f16 = extract vXf16 %vec, i64 %idx
34939 assert(N->getSimpleValueType(0) == MVT::f16 &&
34940 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34941 assert(Subtarget.hasFP16() && "Expected FP16");
34942 SDValue VecOp = N->getOperand(0);
34943 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
34944 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34945 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34946 N->getOperand(1));
34947 Split = DAG.getBitcast(MVT::f16, Split);
34948 Results.push_back(Split);
34949 return;
34950 }
34951 }
34952}
34953
34954const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
34955 switch ((X86ISD::NodeType)Opcode) {
34956 case X86ISD::FIRST_NUMBER: break;
34957#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
34958 NODE_NAME_CASE(BSF)
34959 NODE_NAME_CASE(BSR)
34960 NODE_NAME_CASE(FSHL)
34961 NODE_NAME_CASE(FSHR)
34962 NODE_NAME_CASE(FAND)
34963 NODE_NAME_CASE(FANDN)
34964 NODE_NAME_CASE(FOR)
34965 NODE_NAME_CASE(FXOR)
34966 NODE_NAME_CASE(FILD)
34967 NODE_NAME_CASE(FIST)
34968 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
34969 NODE_NAME_CASE(FLD)
34970 NODE_NAME_CASE(FST)
34971 NODE_NAME_CASE(CALL)
34972 NODE_NAME_CASE(CALL_RVMARKER)
34973 NODE_NAME_CASE(IMP_CALL)
34975 NODE_NAME_CASE(CMP)
34976 NODE_NAME_CASE(FCMP)
34977 NODE_NAME_CASE(STRICT_FCMP)
34978 NODE_NAME_CASE(STRICT_FCMPS)
34979 NODE_NAME_CASE(COMI)
34980 NODE_NAME_CASE(UCOMI)
34981 NODE_NAME_CASE(COMX)
34982 NODE_NAME_CASE(UCOMX)
34983 NODE_NAME_CASE(CMPM)
34984 NODE_NAME_CASE(CMPMM)
34985 NODE_NAME_CASE(STRICT_CMPM)
34986 NODE_NAME_CASE(CMPMM_SAE)
34987 NODE_NAME_CASE(SETCC)
34988 NODE_NAME_CASE(SETCC_CARRY)
34989 NODE_NAME_CASE(FSETCC)
34990 NODE_NAME_CASE(FSETCCM)
34991 NODE_NAME_CASE(FSETCCM_SAE)
34992 NODE_NAME_CASE(CMOV)
34993 NODE_NAME_CASE(BRCOND)
34994 NODE_NAME_CASE(RET_GLUE)
34995 NODE_NAME_CASE(IRET)
34996 NODE_NAME_CASE(REP_STOS)
34997 NODE_NAME_CASE(REP_MOVS)
34998 NODE_NAME_CASE(GlobalBaseReg)
34999 NODE_NAME_CASE(Wrapper)
35000 NODE_NAME_CASE(WrapperRIP)
35001 NODE_NAME_CASE(MOVQ2DQ)
35002 NODE_NAME_CASE(MOVDQ2Q)
35003 NODE_NAME_CASE(MMX_MOVD2W)
35004 NODE_NAME_CASE(MMX_MOVW2D)
35005 NODE_NAME_CASE(PEXTRB)
35006 NODE_NAME_CASE(PEXTRW)
35007 NODE_NAME_CASE(INSERTPS)
35008 NODE_NAME_CASE(PINSRB)
35009 NODE_NAME_CASE(PINSRW)
35010 NODE_NAME_CASE(PSHUFB)
35011 NODE_NAME_CASE(ANDNP)
35012 NODE_NAME_CASE(BLENDI)
35014 NODE_NAME_CASE(HADD)
35015 NODE_NAME_CASE(HSUB)
35016 NODE_NAME_CASE(FHADD)
35017 NODE_NAME_CASE(FHSUB)
35018 NODE_NAME_CASE(CONFLICT)
35019 NODE_NAME_CASE(FMAX)
35020 NODE_NAME_CASE(FMAXS)
35021 NODE_NAME_CASE(FMAX_SAE)
35022 NODE_NAME_CASE(FMAXS_SAE)
35023 NODE_NAME_CASE(STRICT_FMAX)
35024 NODE_NAME_CASE(FMIN)
35025 NODE_NAME_CASE(FMINS)
35026 NODE_NAME_CASE(FMIN_SAE)
35027 NODE_NAME_CASE(FMINS_SAE)
35028 NODE_NAME_CASE(STRICT_FMIN)
35029 NODE_NAME_CASE(FMAXC)
35030 NODE_NAME_CASE(FMINC)
35031 NODE_NAME_CASE(FRSQRT)
35032 NODE_NAME_CASE(FRCP)
35033 NODE_NAME_CASE(EXTRQI)
35034 NODE_NAME_CASE(INSERTQI)
35035 NODE_NAME_CASE(TLSADDR)
35036 NODE_NAME_CASE(TLSBASEADDR)
35037 NODE_NAME_CASE(TLSCALL)
35038 NODE_NAME_CASE(TLSDESC)
35039 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35040 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35041 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35042 NODE_NAME_CASE(EH_RETURN)
35043 NODE_NAME_CASE(TC_RETURN)
35044 NODE_NAME_CASE(FNSTCW16m)
35045 NODE_NAME_CASE(FLDCW16m)
35046 NODE_NAME_CASE(FNSTENVm)
35047 NODE_NAME_CASE(FLDENVm)
35048 NODE_NAME_CASE(LCMPXCHG_DAG)
35049 NODE_NAME_CASE(LCMPXCHG8_DAG)
35050 NODE_NAME_CASE(LCMPXCHG16_DAG)
35051 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35052 NODE_NAME_CASE(LADD)
35053 NODE_NAME_CASE(LSUB)
35054 NODE_NAME_CASE(LOR)
35055 NODE_NAME_CASE(LXOR)
35056 NODE_NAME_CASE(LAND)
35057 NODE_NAME_CASE(LBTS)
35058 NODE_NAME_CASE(LBTC)
35059 NODE_NAME_CASE(LBTR)
35060 NODE_NAME_CASE(LBTS_RM)
35061 NODE_NAME_CASE(LBTC_RM)
35062 NODE_NAME_CASE(LBTR_RM)
35063 NODE_NAME_CASE(AADD)
35064 NODE_NAME_CASE(AOR)
35065 NODE_NAME_CASE(AXOR)
35066 NODE_NAME_CASE(AAND)
35067 NODE_NAME_CASE(VZEXT_MOVL)
35068 NODE_NAME_CASE(VZEXT_LOAD)
35069 NODE_NAME_CASE(VEXTRACT_STORE)
35070 NODE_NAME_CASE(VTRUNC)
35071 NODE_NAME_CASE(VTRUNCS)
35072 NODE_NAME_CASE(VTRUNCUS)
35073 NODE_NAME_CASE(VMTRUNC)
35074 NODE_NAME_CASE(VMTRUNCS)
35075 NODE_NAME_CASE(VMTRUNCUS)
35076 NODE_NAME_CASE(VTRUNCSTORES)
35077 NODE_NAME_CASE(VTRUNCSTOREUS)
35078 NODE_NAME_CASE(VMTRUNCSTORES)
35079 NODE_NAME_CASE(VMTRUNCSTOREUS)
35080 NODE_NAME_CASE(VFPEXT)
35081 NODE_NAME_CASE(STRICT_VFPEXT)
35082 NODE_NAME_CASE(VFPEXT_SAE)
35083 NODE_NAME_CASE(VFPEXTS)
35084 NODE_NAME_CASE(VFPEXTS_SAE)
35085 NODE_NAME_CASE(VFPROUND)
35086 NODE_NAME_CASE(VFPROUND2)
35087 NODE_NAME_CASE(VFPROUND2_RND)
35088 NODE_NAME_CASE(STRICT_VFPROUND)
35089 NODE_NAME_CASE(VMFPROUND)
35090 NODE_NAME_CASE(VFPROUND_RND)
35091 NODE_NAME_CASE(VFPROUNDS)
35092 NODE_NAME_CASE(VFPROUNDS_RND)
35093 NODE_NAME_CASE(VSHLDQ)
35094 NODE_NAME_CASE(VSRLDQ)
35095 NODE_NAME_CASE(VSHL)
35096 NODE_NAME_CASE(VSRL)
35097 NODE_NAME_CASE(VSRA)
35098 NODE_NAME_CASE(VSHLI)
35099 NODE_NAME_CASE(VSRLI)
35100 NODE_NAME_CASE(VSRAI)
35101 NODE_NAME_CASE(VSHLV)
35102 NODE_NAME_CASE(VSRLV)
35103 NODE_NAME_CASE(VSRAV)
35104 NODE_NAME_CASE(VROTLI)
35105 NODE_NAME_CASE(VROTRI)
35106 NODE_NAME_CASE(VPPERM)
35107 NODE_NAME_CASE(CMPP)
35108 NODE_NAME_CASE(STRICT_CMPP)
35109 NODE_NAME_CASE(PCMPEQ)
35110 NODE_NAME_CASE(PCMPGT)
35111 NODE_NAME_CASE(PHMINPOS)
35112 NODE_NAME_CASE(ADD)
35113 NODE_NAME_CASE(SUB)
35114 NODE_NAME_CASE(ADC)
35115 NODE_NAME_CASE(SBB)
35116 NODE_NAME_CASE(SMUL)
35117 NODE_NAME_CASE(UMUL)
35118 NODE_NAME_CASE(OR)
35119 NODE_NAME_CASE(XOR)
35120 NODE_NAME_CASE(AND)
35121 NODE_NAME_CASE(BEXTR)
35123 NODE_NAME_CASE(BZHI)
35124 NODE_NAME_CASE(PDEP)
35125 NODE_NAME_CASE(PEXT)
35126 NODE_NAME_CASE(MUL_IMM)
35127 NODE_NAME_CASE(MOVMSK)
35128 NODE_NAME_CASE(PTEST)
35129 NODE_NAME_CASE(TESTP)
35130 NODE_NAME_CASE(KORTEST)
35131 NODE_NAME_CASE(KTEST)
35132 NODE_NAME_CASE(KADD)
35133 NODE_NAME_CASE(KSHIFTL)
35134 NODE_NAME_CASE(KSHIFTR)
35135 NODE_NAME_CASE(PACKSS)
35136 NODE_NAME_CASE(PACKUS)
35137 NODE_NAME_CASE(PALIGNR)
35138 NODE_NAME_CASE(VALIGN)
35139 NODE_NAME_CASE(VSHLD)
35140 NODE_NAME_CASE(VSHRD)
35141 NODE_NAME_CASE(PSHUFD)
35142 NODE_NAME_CASE(PSHUFHW)
35143 NODE_NAME_CASE(PSHUFLW)
35144 NODE_NAME_CASE(SHUFP)
35145 NODE_NAME_CASE(SHUF128)
35146 NODE_NAME_CASE(MOVLHPS)
35147 NODE_NAME_CASE(MOVHLPS)
35148 NODE_NAME_CASE(MOVDDUP)
35149 NODE_NAME_CASE(MOVSHDUP)
35150 NODE_NAME_CASE(MOVSLDUP)
35151 NODE_NAME_CASE(MOVSD)
35152 NODE_NAME_CASE(MOVSS)
35153 NODE_NAME_CASE(MOVSH)
35154 NODE_NAME_CASE(UNPCKL)
35155 NODE_NAME_CASE(UNPCKH)
35156 NODE_NAME_CASE(VBROADCAST)
35157 NODE_NAME_CASE(VBROADCAST_LOAD)
35158 NODE_NAME_CASE(VBROADCASTM)
35159 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35160 NODE_NAME_CASE(VPERMILPV)
35161 NODE_NAME_CASE(VPERMILPI)
35162 NODE_NAME_CASE(VPERM2X128)
35163 NODE_NAME_CASE(VPERMV)
35164 NODE_NAME_CASE(VPERMV3)
35165 NODE_NAME_CASE(VPERMI)
35166 NODE_NAME_CASE(VPTERNLOG)
35167 NODE_NAME_CASE(FP_TO_SINT_SAT)
35168 NODE_NAME_CASE(FP_TO_UINT_SAT)
35169 NODE_NAME_CASE(VFIXUPIMM)
35170 NODE_NAME_CASE(VFIXUPIMM_SAE)
35171 NODE_NAME_CASE(VFIXUPIMMS)
35172 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35173 NODE_NAME_CASE(VRANGE)
35174 NODE_NAME_CASE(VRANGE_SAE)
35175 NODE_NAME_CASE(VRANGES)
35176 NODE_NAME_CASE(VRANGES_SAE)
35177 NODE_NAME_CASE(PMULUDQ)
35178 NODE_NAME_CASE(PMULDQ)
35179 NODE_NAME_CASE(PSADBW)
35180 NODE_NAME_CASE(DBPSADBW)
35181 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35182 NODE_NAME_CASE(VAARG_64)
35183 NODE_NAME_CASE(VAARG_X32)
35184 NODE_NAME_CASE(DYN_ALLOCA)
35185 NODE_NAME_CASE(MFENCE)
35186 NODE_NAME_CASE(SEG_ALLOCA)
35187 NODE_NAME_CASE(PROBED_ALLOCA)
35190 NODE_NAME_CASE(RDPKRU)
35191 NODE_NAME_CASE(WRPKRU)
35192 NODE_NAME_CASE(VPMADDUBSW)
35193 NODE_NAME_CASE(VPMADDWD)
35194 NODE_NAME_CASE(VPSHA)
35195 NODE_NAME_CASE(VPSHL)
35196 NODE_NAME_CASE(VPCOM)
35197 NODE_NAME_CASE(VPCOMU)
35198 NODE_NAME_CASE(VPERMIL2)
35199 NODE_NAME_CASE(FMSUB)
35200 NODE_NAME_CASE(STRICT_FMSUB)
35201 NODE_NAME_CASE(FNMADD)
35202 NODE_NAME_CASE(STRICT_FNMADD)
35203 NODE_NAME_CASE(FNMSUB)
35204 NODE_NAME_CASE(STRICT_FNMSUB)
35205 NODE_NAME_CASE(FMADDSUB)
35206 NODE_NAME_CASE(FMSUBADD)
35207 NODE_NAME_CASE(FMADD_RND)
35208 NODE_NAME_CASE(FNMADD_RND)
35209 NODE_NAME_CASE(FMSUB_RND)
35210 NODE_NAME_CASE(FNMSUB_RND)
35211 NODE_NAME_CASE(FMADDSUB_RND)
35212 NODE_NAME_CASE(FMSUBADD_RND)
35213 NODE_NAME_CASE(VFMADDC)
35214 NODE_NAME_CASE(VFMADDC_RND)
35215 NODE_NAME_CASE(VFCMADDC)
35216 NODE_NAME_CASE(VFCMADDC_RND)
35217 NODE_NAME_CASE(VFMULC)
35218 NODE_NAME_CASE(VFMULC_RND)
35219 NODE_NAME_CASE(VFCMULC)
35220 NODE_NAME_CASE(VFCMULC_RND)
35221 NODE_NAME_CASE(VFMULCSH)
35222 NODE_NAME_CASE(VFMULCSH_RND)
35223 NODE_NAME_CASE(VFCMULCSH)
35224 NODE_NAME_CASE(VFCMULCSH_RND)
35225 NODE_NAME_CASE(VFMADDCSH)
35226 NODE_NAME_CASE(VFMADDCSH_RND)
35227 NODE_NAME_CASE(VFCMADDCSH)
35228 NODE_NAME_CASE(VFCMADDCSH_RND)
35229 NODE_NAME_CASE(VPMADD52H)
35230 NODE_NAME_CASE(VPMADD52L)
35231 NODE_NAME_CASE(VRNDSCALE)
35232 NODE_NAME_CASE(STRICT_VRNDSCALE)
35233 NODE_NAME_CASE(VRNDSCALE_SAE)
35234 NODE_NAME_CASE(VRNDSCALES)
35235 NODE_NAME_CASE(VRNDSCALES_SAE)
35236 NODE_NAME_CASE(VREDUCE)
35237 NODE_NAME_CASE(VREDUCE_SAE)
35238 NODE_NAME_CASE(VREDUCES)
35239 NODE_NAME_CASE(VREDUCES_SAE)
35240 NODE_NAME_CASE(VGETMANT)
35241 NODE_NAME_CASE(VGETMANT_SAE)
35242 NODE_NAME_CASE(VGETMANTS)
35243 NODE_NAME_CASE(VGETMANTS_SAE)
35244 NODE_NAME_CASE(PCMPESTR)
35245 NODE_NAME_CASE(PCMPISTR)
35247 NODE_NAME_CASE(COMPRESS)
35248 NODE_NAME_CASE(EXPAND)
35249 NODE_NAME_CASE(SELECTS)
35250 NODE_NAME_CASE(ADDSUB)
35251 NODE_NAME_CASE(RCP14)
35252 NODE_NAME_CASE(RCP14S)
35253 NODE_NAME_CASE(RSQRT14)
35254 NODE_NAME_CASE(RSQRT14S)
35255 NODE_NAME_CASE(FADD_RND)
35256 NODE_NAME_CASE(FADDS)
35257 NODE_NAME_CASE(FADDS_RND)
35258 NODE_NAME_CASE(FSUB_RND)
35259 NODE_NAME_CASE(FSUBS)
35260 NODE_NAME_CASE(FSUBS_RND)
35261 NODE_NAME_CASE(FMUL_RND)
35262 NODE_NAME_CASE(FMULS)
35263 NODE_NAME_CASE(FMULS_RND)
35264 NODE_NAME_CASE(FDIV_RND)
35265 NODE_NAME_CASE(FDIVS)
35266 NODE_NAME_CASE(FDIVS_RND)
35267 NODE_NAME_CASE(FSQRT_RND)
35268 NODE_NAME_CASE(FSQRTS)
35269 NODE_NAME_CASE(FSQRTS_RND)
35270 NODE_NAME_CASE(FGETEXP)
35271 NODE_NAME_CASE(FGETEXP_SAE)
35272 NODE_NAME_CASE(FGETEXPS)
35273 NODE_NAME_CASE(FGETEXPS_SAE)
35274 NODE_NAME_CASE(SCALEF)
35275 NODE_NAME_CASE(SCALEF_RND)
35276 NODE_NAME_CASE(SCALEFS)
35277 NODE_NAME_CASE(SCALEFS_RND)
35278 NODE_NAME_CASE(MULHRS)
35279 NODE_NAME_CASE(SINT_TO_FP_RND)
35280 NODE_NAME_CASE(UINT_TO_FP_RND)
35281 NODE_NAME_CASE(CVTTP2SI)
35282 NODE_NAME_CASE(CVTTP2UI)
35283 NODE_NAME_CASE(STRICT_CVTTP2SI)
35284 NODE_NAME_CASE(STRICT_CVTTP2UI)
35285 NODE_NAME_CASE(MCVTTP2SI)
35286 NODE_NAME_CASE(MCVTTP2UI)
35287 NODE_NAME_CASE(CVTTP2SI_SAE)
35288 NODE_NAME_CASE(CVTTP2UI_SAE)
35289 NODE_NAME_CASE(CVTTS2SI)
35290 NODE_NAME_CASE(CVTTS2UI)
35291 NODE_NAME_CASE(CVTTS2SI_SAE)
35292 NODE_NAME_CASE(CVTTS2UI_SAE)
35293 NODE_NAME_CASE(CVTSI2P)
35294 NODE_NAME_CASE(CVTUI2P)
35295 NODE_NAME_CASE(STRICT_CVTSI2P)
35296 NODE_NAME_CASE(STRICT_CVTUI2P)
35297 NODE_NAME_CASE(MCVTSI2P)
35298 NODE_NAME_CASE(MCVTUI2P)
35299 NODE_NAME_CASE(VFPCLASS)
35300 NODE_NAME_CASE(VFPCLASSS)
35301 NODE_NAME_CASE(MULTISHIFT)
35302 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35303 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35304 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35305 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35306 NODE_NAME_CASE(CVTPS2PH)
35307 NODE_NAME_CASE(STRICT_CVTPS2PH)
35308 NODE_NAME_CASE(CVTPS2PH_SAE)
35309 NODE_NAME_CASE(MCVTPS2PH)
35310 NODE_NAME_CASE(MCVTPS2PH_SAE)
35311 NODE_NAME_CASE(CVTPH2PS)
35312 NODE_NAME_CASE(STRICT_CVTPH2PS)
35313 NODE_NAME_CASE(CVTPH2PS_SAE)
35314 NODE_NAME_CASE(CVTP2SI)
35315 NODE_NAME_CASE(CVTP2UI)
35316 NODE_NAME_CASE(MCVTP2SI)
35317 NODE_NAME_CASE(MCVTP2UI)
35318 NODE_NAME_CASE(CVTP2SI_RND)
35319 NODE_NAME_CASE(CVTP2UI_RND)
35320 NODE_NAME_CASE(CVTS2SI)
35321 NODE_NAME_CASE(CVTS2UI)
35322 NODE_NAME_CASE(CVTS2SI_RND)
35323 NODE_NAME_CASE(CVTS2UI_RND)
35324 NODE_NAME_CASE(CVTNEPS2BF16)
35325 NODE_NAME_CASE(MCVTNEPS2BF16)
35326 NODE_NAME_CASE(DPBF16PS)
35327 NODE_NAME_CASE(DPFP16PS)
35328 NODE_NAME_CASE(MPSADBW)
35329 NODE_NAME_CASE(LWPINS)
35330 NODE_NAME_CASE(MGATHER)
35331 NODE_NAME_CASE(MSCATTER)
35332 NODE_NAME_CASE(VPDPBUSD)
35333 NODE_NAME_CASE(VPDPBUSDS)
35334 NODE_NAME_CASE(VPDPWSSD)
35335 NODE_NAME_CASE(VPDPWSSDS)
35336 NODE_NAME_CASE(VPSHUFBITQMB)
35337 NODE_NAME_CASE(GF2P8MULB)
35338 NODE_NAME_CASE(GF2P8AFFINEQB)
35339 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35340 NODE_NAME_CASE(NT_CALL)
35341 NODE_NAME_CASE(NT_BRIND)
35342 NODE_NAME_CASE(UMWAIT)
35343 NODE_NAME_CASE(TPAUSE)
35344 NODE_NAME_CASE(ENQCMD)
35345 NODE_NAME_CASE(ENQCMDS)
35346 NODE_NAME_CASE(VP2INTERSECT)
35347 NODE_NAME_CASE(VPDPBSUD)
35348 NODE_NAME_CASE(VPDPBSUDS)
35349 NODE_NAME_CASE(VPDPBUUD)
35350 NODE_NAME_CASE(VPDPBUUDS)
35351 NODE_NAME_CASE(VPDPBSSD)
35352 NODE_NAME_CASE(VPDPBSSDS)
35353 NODE_NAME_CASE(VPDPWSUD)
35354 NODE_NAME_CASE(VPDPWSUDS)
35355 NODE_NAME_CASE(VPDPWUSD)
35356 NODE_NAME_CASE(VPDPWUSDS)
35357 NODE_NAME_CASE(VPDPWUUD)
35358 NODE_NAME_CASE(VPDPWUUDS)
35359 NODE_NAME_CASE(VMINMAX)
35360 NODE_NAME_CASE(VMINMAX_SAE)
35361 NODE_NAME_CASE(VMINMAXS)
35362 NODE_NAME_CASE(VMINMAXS_SAE)
35363 NODE_NAME_CASE(CVTP2IBS)
35364 NODE_NAME_CASE(CVTP2IUBS)
35365 NODE_NAME_CASE(CVTP2IBS_RND)
35366 NODE_NAME_CASE(CVTP2IUBS_RND)
35367 NODE_NAME_CASE(CVTTP2IBS)
35368 NODE_NAME_CASE(CVTTP2IUBS)
35369 NODE_NAME_CASE(CVTTP2IBS_SAE)
35370 NODE_NAME_CASE(CVTTP2IUBS_SAE)
35371 NODE_NAME_CASE(VCVT2PH2BF8)
35372 NODE_NAME_CASE(VCVT2PH2BF8S)
35373 NODE_NAME_CASE(VCVT2PH2HF8)
35374 NODE_NAME_CASE(VCVT2PH2HF8S)
35375 NODE_NAME_CASE(VCVTBIASPH2BF8)
35376 NODE_NAME_CASE(VCVTBIASPH2BF8S)
35377 NODE_NAME_CASE(VCVTBIASPH2HF8)
35378 NODE_NAME_CASE(VCVTBIASPH2HF8S)
35379 NODE_NAME_CASE(VCVTPH2BF8)
35380 NODE_NAME_CASE(VCVTPH2BF8S)
35381 NODE_NAME_CASE(VCVTPH2HF8)
35382 NODE_NAME_CASE(VCVTPH2HF8S)
35383 NODE_NAME_CASE(VMCVTBIASPH2BF8)
35384 NODE_NAME_CASE(VMCVTBIASPH2BF8S)
35385 NODE_NAME_CASE(VMCVTBIASPH2HF8)
35386 NODE_NAME_CASE(VMCVTBIASPH2HF8S)
35387 NODE_NAME_CASE(VMCVTPH2BF8)
35388 NODE_NAME_CASE(VMCVTPH2BF8S)
35389 NODE_NAME_CASE(VMCVTPH2HF8)
35390 NODE_NAME_CASE(VMCVTPH2HF8S)
35391 NODE_NAME_CASE(VCVTHF82PH)
35392 NODE_NAME_CASE(AESENC128KL)
35393 NODE_NAME_CASE(AESDEC128KL)
35394 NODE_NAME_CASE(AESENC256KL)
35395 NODE_NAME_CASE(AESDEC256KL)
35396 NODE_NAME_CASE(AESENCWIDE128KL)
35397 NODE_NAME_CASE(AESDECWIDE128KL)
35398 NODE_NAME_CASE(AESENCWIDE256KL)
35399 NODE_NAME_CASE(AESDECWIDE256KL)
35400 NODE_NAME_CASE(CMPCCXADD)
35401 NODE_NAME_CASE(TESTUI)
35402 NODE_NAME_CASE(FP80_ADD)
35403 NODE_NAME_CASE(STRICT_FP80_ADD)
35404 NODE_NAME_CASE(CCMP)
35405 NODE_NAME_CASE(CTEST)
35406 NODE_NAME_CASE(CLOAD)
35407 NODE_NAME_CASE(CSTORE)
35408 NODE_NAME_CASE(CVTTS2SIS)
35409 NODE_NAME_CASE(CVTTS2UIS)
35410 NODE_NAME_CASE(CVTTS2SIS_SAE)
35411 NODE_NAME_CASE(CVTTS2UIS_SAE)
35412 NODE_NAME_CASE(CVTTP2SIS)
35413 NODE_NAME_CASE(MCVTTP2SIS)
35414 NODE_NAME_CASE(CVTTP2UIS_SAE)
35415 NODE_NAME_CASE(CVTTP2SIS_SAE)
35416 NODE_NAME_CASE(CVTTP2UIS)
35417 NODE_NAME_CASE(MCVTTP2UIS)
35418 NODE_NAME_CASE(POP_FROM_X87_REG)
35419 }
35420 return nullptr;
35421#undef NODE_NAME_CASE
35422}
35423
35424/// Return true if the addressing mode represented by AM is legal for this
35425/// target, for a load/store of the specified type.
35426 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35427 const AddrMode &AM, Type *Ty,
35428 unsigned AS,
35429 Instruction *I) const {
35430 // X86 supports extremely general addressing modes.
35431 CodeModel::Model M = getTargetMachine().getCodeModel();
35432
35433 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35434 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35435 return false;
35436
35437 if (AM.BaseGV) {
35438 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35439
35440 // If a reference to this global requires an extra load, we can't fold it.
35441 if (isGlobalStubReference(GVFlags))
35442 return false;
35443
35444 // If BaseGV requires a register for the PIC base, we cannot also have a
35445 // BaseReg specified.
35446 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35447 return false;
35448
35449 // If lower 4G is not available, then we must use rip-relative addressing.
35450 if ((M != CodeModel::Small || isPositionIndependent()) &&
35451 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35452 return false;
35453 }
35454
35455 switch (AM.Scale) {
35456 case 0:
35457 case 1:
35458 case 2:
35459 case 4:
35460 case 8:
35461 // These scales always work.
35462 break;
35463 case 3:
35464 case 5:
35465 case 9:
35466 // These scales are formed with basereg+scalereg. Only accept if there is
35467 // no basereg yet.
35468 if (AM.HasBaseReg)
35469 return false;
35470 break;
35471 default: // Other stuff never works.
35472 return false;
35473 }
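// Scales of 3, 5 and 9 are encoded as reg + reg*{2,4,8} with the same
// register serving as both base and index (e.g. lea (%rax,%rax,2)), which is
// why they are rejected above whenever a separate base register is present.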
35474
35475 return true;
35476}
35477
35478bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35479 switch (Opcode) {
35480 // These are non-commutative binops.
35481 // TODO: Add more X86ISD opcodes once we have test coverage.
35482 case X86ISD::ANDNP:
35483 case X86ISD::PCMPGT:
35484 case X86ISD::FMAX:
35485 case X86ISD::FMIN:
35486 case X86ISD::FANDN:
35487 case X86ISD::VPSHA:
35488 case X86ISD::VPSHL:
35489 case X86ISD::VSHLV:
35490 case X86ISD::VSRLV:
35491 case X86ISD::VSRAV:
35492 return true;
35493 }
35494
35495 return TargetLoweringBase::isBinOp(Opcode);
35496}
35497
35498bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35499 switch (Opcode) {
35500 // TODO: Add more X86ISD opcodes once we have test coverage.
35501 case X86ISD::PCMPEQ:
35502 case X86ISD::PMULDQ:
35503 case X86ISD::PMULUDQ:
35504 case X86ISD::FMAXC:
35505 case X86ISD::FMINC:
35506 case X86ISD::FAND:
35507 case X86ISD::FOR:
35508 case X86ISD::FXOR:
35509 return true;
35510 }
35511
35512 return TargetLoweringBase::isCommutativeBinOp(Opcode);
35513}
35514
35515 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35516 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35517 return false;
35518 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35519 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35520 return NumBits1 > NumBits2;
35521}
35522
35523 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35524 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35525 return false;
35526
35527 if (!isTypeLegal(EVT::getEVT(Ty1)))
35528 return false;
35529
35530 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35531
35532 // Assuming the caller doesn't have a zeroext or signext return parameter,
35533 // truncation all the way down to i1 is valid.
35534 return true;
35535}
35536
35537 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35538 return isInt<32>(Imm);
35539}
35540
35541 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35542 // Can also use sub to handle negated immediates.
35543 return isInt<32>(Imm);
35544}
35545
35546 bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35547 return isInt<32>(Imm);
35548}
35549
35550 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35551 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35552 return false;
35553 unsigned NumBits1 = VT1.getSizeInBits();
35554 unsigned NumBits2 = VT2.getSizeInBits();
35555 return NumBits1 > NumBits2;
35556}
35557
35558 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35559 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35560 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35561}
35562
35563 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35564 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35565 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35566}
35567
35568 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35569 EVT VT1 = Val.getValueType();
35570 if (isZExtFree(VT1, VT2))
35571 return true;
35572
35573 if (Val.getOpcode() != ISD::LOAD)
35574 return false;
35575
35576 if (!VT1.isSimple() || !VT1.isInteger() ||
35577 !VT2.isSimple() || !VT2.isInteger())
35578 return false;
35579
35580 switch (VT1.getSimpleVT().SimpleTy) {
35581 default: break;
35582 case MVT::i8:
35583 case MVT::i16:
35584 case MVT::i32:
35585 // X86 has 8, 16, and 32-bit zero-extending loads.
35586 return true;
35587 }
35588
35589 return false;
35590}
35591
35592 bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35593 if (!Subtarget.is64Bit())
35594 return false;
35595 return TargetLowering::shouldConvertPhiType(From, To);
35596}
35597
35598 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35599 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35600 return false;
35601
35602 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35603
35604 // There is no extending load for vXi1.
35605 if (SrcVT.getScalarType() == MVT::i1)
35606 return false;
35607
35608 return true;
35609}
35610
35611 bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35612 EVT VT) const {
35613 if (Subtarget.useSoftFloat())
35614 return false;
35615
35616 if (!Subtarget.hasAnyFMA())
35617 return false;
35618
35619 VT = VT.getScalarType();
35620
35621 if (!VT.isSimple())
35622 return false;
35623
35624 switch (VT.getSimpleVT().SimpleTy) {
35625 case MVT::f16:
35626 return Subtarget.hasFP16();
35627 case MVT::f32:
35628 case MVT::f64:
35629 return true;
35630 default:
35631 break;
35632 }
35633
35634 return false;
35635}
35636
35637 bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
35638 EVT DestVT) const {
35639 // i16 instructions are longer (0x66 prefix) and potentially slower.
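// (With a 16-bit immediate the 0x66 prefix also forms a length-changing
// prefix, which can stall the decoders on a number of Intel
// microarchitectures.)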
35640 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35641}
35642
35643 bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(
35644 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
35645 SDValue Y) const {
35646 if (SelectOpcode == ISD::SELECT) {
35647 if (VT.isVector())
35648 return false;
35649 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
35650 return false;
35651 using namespace llvm::SDPatternMatch;
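// BMI1 provides single-instruction forms for these operand patterns:
//   BLSI   x & -x       (isolate lowest set bit)
//   BLSR   x & (x - 1)  (clear lowest set bit)
//   BLSMSK x ^ (x - 1)  (mask up to and including lowest set bit)
// which is why AND/XOR with a matching negation/decrement operand are
// treated as profitable to fold here.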
35652 // BLSI
35653 if (BinOpcode == ISD::AND && (sd_match(Y, m_Neg(m_Specific(X))) ||
35654 sd_match(X, m_Neg(m_Specific(Y)))))
35655 return true;
35656 // BLSR
35657 if (BinOpcode == ISD::AND &&
35658 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35659 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35660 return true;
35661 // BLSMSK
35662 if (BinOpcode == ISD::XOR &&
35663 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35664 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35665 return true;
35666
35667 return false;
35668 }
35669 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35670 // benefit. The transform may also be profitable for scalar code.
35671 if (!Subtarget.hasAVX512())
35672 return false;
35673 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35674 return false;
35675 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35676 return false;
35677
35678 return true;
35679}
35680
35681/// Targets can use this to indicate that they only support *some*
35682/// VECTOR_SHUFFLE operations, those with specific masks.
35683/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35684/// are assumed to be legal.
35685 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35686 if (!VT.isSimple())
35687 return false;
35688
35689 // Not for i1 vectors
35690 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35691 return false;
35692
35693 // Very little shuffling can be done for 64-bit vectors right now.
35694 if (VT.getSimpleVT().getSizeInBits() == 64)
35695 return false;
35696
35697 // We only care that the types being shuffled are legal. The lowering can
35698 // handle any possible shuffle mask that results.
35699 return isTypeLegal(VT.getSimpleVT());
35700}
35701
35702 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35703 EVT VT) const {
35704 // Don't convert an 'and' into a shuffle that we don't directly support.
35705 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35706 if (!Subtarget.hasAVX2())
35707 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35708 return false;
35709
35710 // Just delegate to the generic legality, clear masks aren't special.
35711 return isShuffleMaskLegal(Mask, VT);
35712}
35713
35714 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35715 // If the subtarget is using thunks, we must not generate jump tables.
35716 if (Subtarget.useIndirectThunkBranches())
35717 return false;
35718
35719 // Otherwise, fall back on the generic logic.
35720 return TargetLowering::areJTsAllowed(Fn);
35721}
35722
35723 MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35724 EVT ConditionVT) const {
35725 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35726 // zero-extensions.
35727 if (ConditionVT.getSizeInBits() < 32)
35728 return MVT::i32;
35729 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35730 ConditionVT);
35731}
35732
35733//===----------------------------------------------------------------------===//
35734// X86 Scheduler Hooks
35735//===----------------------------------------------------------------------===//
35736
35737/// Utility function to emit xbegin specifying the start of an RTM region.
35738 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35739 const TargetInstrInfo *TII) {
35740 const MIMetadata MIMD(MI);
35741
35742 const BasicBlock *BB = MBB->getBasicBlock();
35743 MachineFunction::iterator I = ++MBB->getIterator();
35744
35745 // For the v = xbegin(), we generate
35746 //
35747 // thisMBB:
35748 // xbegin sinkMBB
35749 //
35750 // mainMBB:
35751 // s0 = -1
35752 //
35753 // fallBB:
35754 // eax = # XABORT_DEF
35755 // s1 = eax
35756 //
35757 // sinkMBB:
35758 // v = phi(s0/mainBB, s1/fallBB)
35759
35760 MachineBasicBlock *thisMBB = MBB;
35761 MachineFunction *MF = MBB->getParent();
35762 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35763 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35764 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35765 MF->insert(I, mainMBB);
35766 MF->insert(I, fallMBB);
35767 MF->insert(I, sinkMBB);
35768
35769 if (isPhysRegUsedAfter(X86::EFLAGS, MI)) {
35770 mainMBB->addLiveIn(X86::EFLAGS);
35771 fallMBB->addLiveIn(X86::EFLAGS);
35772 sinkMBB->addLiveIn(X86::EFLAGS);
35773 }
35774
35775 // Transfer the remainder of BB and its successor edges to sinkMBB.
35776 sinkMBB->splice(sinkMBB->begin(), MBB,
35777 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35778 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35779
35780 MachineRegisterInfo &MRI = MF->getRegInfo();
35781 Register DstReg = MI.getOperand(0).getReg();
35782 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35783 Register mainDstReg = MRI.createVirtualRegister(RC);
35784 Register fallDstReg = MRI.createVirtualRegister(RC);
35785
35786 // thisMBB:
35787 // xbegin fallMBB
35788 // # fallthrough to mainMBB
35789 // # abort path to fallMBB
35790 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35791 thisMBB->addSuccessor(mainMBB);
35792 thisMBB->addSuccessor(fallMBB);
35793
35794 // mainMBB:
35795 // mainDstReg := -1
35796 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35797 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35798 mainMBB->addSuccessor(sinkMBB);
35799
35800 // fallMBB:
35801 // ; pseudo instruction to model hardware's definition from XABORT
35802 // EAX := XABORT_DEF
35803 // fallDstReg := EAX
35804 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
35805 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
35806 .addReg(X86::EAX);
35807 fallMBB->addSuccessor(sinkMBB);
35808
35809 // sinkMBB:
35810 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35811 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35812 .addReg(mainDstReg).addMBB(mainMBB)
35813 .addReg(fallDstReg).addMBB(fallMBB);
35814
35815 MI.eraseFromParent();
35816 return sinkMBB;
35817}
35818
35819 MachineBasicBlock *
35820X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35821 MachineBasicBlock *MBB) const {
35822 // Emit va_arg instruction on X86-64.
35823
35824 // Operands to this pseudo-instruction:
35825 // 0 ) Output : destination address (reg)
35826 // 1-5) Input : va_list address (addr, i64mem)
35827 // 6 ) ArgSize : Size (in bytes) of vararg type
35828 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35829 // 8 ) Align : Alignment of type
35830 // 9 ) EFLAGS (implicit-def)
35831
35832 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35833 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35834
35835 Register DestReg = MI.getOperand(0).getReg();
35836 MachineOperand &Base = MI.getOperand(1);
35837 MachineOperand &Scale = MI.getOperand(2);
35838 MachineOperand &Index = MI.getOperand(3);
35839 MachineOperand &Disp = MI.getOperand(4);
35840 MachineOperand &Segment = MI.getOperand(5);
35841 unsigned ArgSize = MI.getOperand(6).getImm();
35842 unsigned ArgMode = MI.getOperand(7).getImm();
35843 Align Alignment = Align(MI.getOperand(8).getImm());
35844
35845 MachineFunction *MF = MBB->getParent();
35846
35847 // Memory Reference
35848 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35849
35850 MachineMemOperand *OldMMO = MI.memoperands().front();
35851
35852 // Clone the MMO into two separate MMOs for loading and storing
35853 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35854 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35855 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35856 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35857
35858 // Machine Information
35859 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35860 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35861 const TargetRegisterClass *AddrRegClass =
35862 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35863 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35864 const MIMetadata MIMD(MI);
35865
35866 // struct va_list {
35867 // i32 gp_offset
35868 // i32 fp_offset
35869 // i64 overflow_area (address)
35870 // i64 reg_save_area (address)
35871 // }
35872 // sizeof(va_list) = 24
35873 // alignment(va_list) = 8
35874
35875 unsigned TotalNumIntRegs = 6;
35876 unsigned TotalNumXMMRegs = 8;
35877 bool UseGPOffset = (ArgMode == 1);
35878 bool UseFPOffset = (ArgMode == 2);
35879 unsigned MaxOffset = TotalNumIntRegs * 8 +
35880 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
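// In the SysV x86-64 va_list, the reg_save_area holds the 6 integer argument
// registers as 8 bytes each (offsets 0-47) followed by the 8 vector argument
// registers as 16 bytes each (offsets 48-175), so MaxOffset is 48 or 176
// depending on the argument class.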
35881
35882 /* Align ArgSize to a multiple of 8 */
35883 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
35884 bool NeedsAlign = (Alignment > 8);
35885
35886 MachineBasicBlock *thisMBB = MBB;
35887 MachineBasicBlock *overflowMBB;
35888 MachineBasicBlock *offsetMBB;
35889 MachineBasicBlock *endMBB;
35890
35891 Register OffsetDestReg; // Argument address computed by offsetMBB
35892 Register OverflowDestReg; // Argument address computed by overflowMBB
35893 Register OffsetReg;
35894
35895 if (!UseGPOffset && !UseFPOffset) {
35896 // If we only pull from the overflow region, we don't create a branch.
35897 // We don't need to alter control flow.
35898 OffsetDestReg = Register(); // unused
35899 OverflowDestReg = DestReg;
35900
35901 offsetMBB = nullptr;
35902 overflowMBB = thisMBB;
35903 endMBB = thisMBB;
35904 } else {
35905 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35906 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35907 // If not, pull from overflow_area. (branch to overflowMBB)
35908 //
35909 // thisMBB
35910 // | .
35911 // | .
35912 // offsetMBB overflowMBB
35913 // | .
35914 // | .
35915 // endMBB
35916
35917 // Registers for the PHI in endMBB
35918 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35919 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35920
35921 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35922 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35923 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35924 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35925
35926 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35927
35928 // Insert the new basic blocks
35929 MF->insert(MBBIter, offsetMBB);
35930 MF->insert(MBBIter, overflowMBB);
35931 MF->insert(MBBIter, endMBB);
35932
35933 // Transfer the remainder of MBB and its successor edges to endMBB.
35934 endMBB->splice(endMBB->begin(), thisMBB,
35935 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35936 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35937
35938 // Make offsetMBB and overflowMBB successors of thisMBB
35939 thisMBB->addSuccessor(offsetMBB);
35940 thisMBB->addSuccessor(overflowMBB);
35941
35942 // endMBB is a successor of both offsetMBB and overflowMBB
35943 offsetMBB->addSuccessor(endMBB);
35944 overflowMBB->addSuccessor(endMBB);
35945
35946 // Load the offset value into a register
35947 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35948 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
35949 .add(Base)
35950 .add(Scale)
35951 .add(Index)
35952 .addDisp(Disp, UseFPOffset ? 4 : 0)
35953 .add(Segment)
35954 .setMemRefs(LoadOnlyMMO);
35955
35956 // Check if there is enough room left to pull this argument.
35957 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
35958 .addReg(OffsetReg)
35959 .addImm(MaxOffset + 8 - ArgSizeA8);
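  // Since gp_offset/fp_offset and ArgSizeA8 are multiples of 8, the check
  // "OffsetReg >= MaxOffset + 8 - ArgSizeA8" is equivalent to
  // "OffsetReg + ArgSizeA8 > MaxOffset", i.e. the argument would overrun the
  // register save area and must come from the overflow area instead.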
35960
35961 // Branch to "overflowMBB" if offset >= max
35962 // Fall through to "offsetMBB" otherwise
35963 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
35964 .addMBB(overflowMBB).addImm(X86::COND_AE);
35965 }
35966
35967 // In offsetMBB, emit code to use the reg_save_area.
35968 if (offsetMBB) {
35969 assert(OffsetReg != 0);
35970
35971 // Read the reg_save_area address.
35972 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
35973 BuildMI(
35974 offsetMBB, MIMD,
35975 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35976 RegSaveReg)
35977 .add(Base)
35978 .add(Scale)
35979 .add(Index)
35980 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
35981 .add(Segment)
35982 .setMemRefs(LoadOnlyMMO);
35983
35984 if (Subtarget.isTarget64BitLP64()) {
35985 // Zero-extend the offset
35986 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
35987 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
35988 .addImm(0)
35989 .addReg(OffsetReg)
35990 .addImm(X86::sub_32bit);
35991
35992 // Add the offset to the reg_save_area to get the final address.
35993 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
35994 .addReg(OffsetReg64)
35995 .addReg(RegSaveReg);
35996 } else {
35997 // Add the offset to the reg_save_area to get the final address.
35998 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
35999 .addReg(OffsetReg)
36000 .addReg(RegSaveReg);
36001 }
36002
36003 // Compute the offset for the next argument
36004 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36005 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
36006 .addReg(OffsetReg)
36007 .addImm(UseFPOffset ? 16 : 8);
36008
36009 // Store it back into the va_list.
36010 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
36011 .add(Base)
36012 .add(Scale)
36013 .add(Index)
36014 .addDisp(Disp, UseFPOffset ? 4 : 0)
36015 .add(Segment)
36016 .addReg(NextOffsetReg)
36017 .setMemRefs(StoreOnlyMMO);
36018
36019 // Jump to endMBB
36020 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
36021 .addMBB(endMBB);
36022 }
36023
36024 //
36025 // Emit code to use overflow area
36026 //
36027
36028 // Load the overflow_area address into a register.
36029 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36030 BuildMI(overflowMBB, MIMD,
36031 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36032 OverflowAddrReg)
36033 .add(Base)
36034 .add(Scale)
36035 .add(Index)
36036 .addDisp(Disp, 8)
36037 .add(Segment)
36038 .setMemRefs(LoadOnlyMMO);
36039
36040 // If we need to align it, do so. Otherwise, just copy the address
36041 // to OverflowDestReg.
36042 if (NeedsAlign) {
36043 // Align the overflow address
36044 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36045
36046 // aligned_addr = (addr + (align-1)) & ~(align-1)
36047 BuildMI(
36048 overflowMBB, MIMD,
36049 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36050 TmpReg)
36051 .addReg(OverflowAddrReg)
36052 .addImm(Alignment.value() - 1);
36053
36054 BuildMI(
36055 overflowMBB, MIMD,
36056 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36057 OverflowDestReg)
36058 .addReg(TmpReg)
36059 .addImm(~(uint64_t)(Alignment.value() - 1));
36060 } else {
36061 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
36062 .addReg(OverflowAddrReg);
36063 }
36064
36065 // Compute the next overflow address after this argument.
36066 // (the overflow address should be kept 8-byte aligned)
36067 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36068 BuildMI(
36069 overflowMBB, MIMD,
36070 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36071 NextAddrReg)
36072 .addReg(OverflowDestReg)
36073 .addImm(ArgSizeA8);
36074
36075 // Store the new overflow address.
36076 BuildMI(overflowMBB, MIMD,
36077 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36078 .add(Base)
36079 .add(Scale)
36080 .add(Index)
36081 .addDisp(Disp, 8)
36082 .add(Segment)
36083 .addReg(NextAddrReg)
36084 .setMemRefs(StoreOnlyMMO);
36085
36086 // If we branched, emit the PHI to the front of endMBB.
36087 if (offsetMBB) {
36088 BuildMI(*endMBB, endMBB->begin(), MIMD,
36089 TII->get(X86::PHI), DestReg)
36090 .addReg(OffsetDestReg).addMBB(offsetMBB)
36091 .addReg(OverflowDestReg).addMBB(overflowMBB);
36092 }
36093
36094 // Erase the pseudo instruction
36095 MI.eraseFromParent();
36096
36097 return endMBB;
36098}
36099
36100// The EFLAGS operand of SelectItr might be missing a kill marker
36101// because there were multiple uses of EFLAGS, and ISel didn't know
36102// which to mark. Figure out whether SelectItr should have had a
36103// kill marker, and set it if it should. Returns the correct kill
36104// marker value.
36105static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36106 MachineBasicBlock* BB,
36107 const TargetRegisterInfo* TRI) {
36108 if (isPhysRegUsedAfter(X86::EFLAGS, SelectItr))
36109 return false;
36110
36111 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36112 // out. SelectMI should have a kill flag on EFLAGS.
36113 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36114 return true;
36115}
36116
36117// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36118// together with other CMOV pseudo-opcodes into a single basic-block with
36119// conditional jump around it.
36120static bool isCMOVPseudo(MachineInstr &MI) {
36121 switch (MI.getOpcode()) {
36122 case X86::CMOV_FR16:
36123 case X86::CMOV_FR16X:
36124 case X86::CMOV_FR32:
36125 case X86::CMOV_FR32X:
36126 case X86::CMOV_FR64:
36127 case X86::CMOV_FR64X:
36128 case X86::CMOV_GR8:
36129 case X86::CMOV_GR16:
36130 case X86::CMOV_GR32:
36131 case X86::CMOV_RFP32:
36132 case X86::CMOV_RFP64:
36133 case X86::CMOV_RFP80:
36134 case X86::CMOV_VR64:
36135 case X86::CMOV_VR128:
36136 case X86::CMOV_VR128X:
36137 case X86::CMOV_VR256:
36138 case X86::CMOV_VR256X:
36139 case X86::CMOV_VR512:
36140 case X86::CMOV_VK1:
36141 case X86::CMOV_VK2:
36142 case X86::CMOV_VK4:
36143 case X86::CMOV_VK8:
36144 case X86::CMOV_VK16:
36145 case X86::CMOV_VK32:
36146 case X86::CMOV_VK64:
36147 return true;
36148
36149 default:
36150 return false;
36151 }
36152}
36153
36154// Helper function, which inserts PHI functions into SinkMBB:
36155// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36156// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
36157// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
36158// the last PHI function inserted.
36159static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36160 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36161 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36162 MachineBasicBlock *SinkMBB) {
36163 MachineFunction *MF = TrueMBB->getParent();
36164 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36165 const MIMetadata MIMD(*MIItBegin);
36166
36167 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36168 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36169
36170 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36171
36172 // As we are creating the PHIs, we have to be careful if there is more than
36173 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36174 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36175 // That also means that PHI construction must work forward from earlier to
36176 // later, and that the code must maintain a mapping from earlier PHI's
36177 // destination registers, and the registers that went into the PHI.
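  // For example, if an earlier iteration emitted
  //   %t2 = PHI [%a, FalseMBB], [%b, TrueMBB]
  // then a later CMOV that reads %t2 must feed its PHI with %a on the FalseMBB
  // edge and %b on the TrueMBB edge; RegRewriteTable records that pair.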
36178 MachineInstrBuilder MIB;
36179 DenseMap<Register, std::pair<Register, Register>> RegRewriteTable;
36180
36181 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36182 Register DestReg = MIIt->getOperand(0).getReg();
36183 Register Op1Reg = MIIt->getOperand(1).getReg();
36184 Register Op2Reg = MIIt->getOperand(2).getReg();
36185
36186 // If this CMOV we are generating is the opposite condition from
36187 // the jump we generated, then we have to swap the operands for the
36188 // PHI that is going to be generated.
36189 if (MIIt->getOperand(3).getImm() == OppCC)
36190 std::swap(Op1Reg, Op2Reg);
36191
36192 if (auto It = RegRewriteTable.find(Op1Reg); It != RegRewriteTable.end())
36193 Op1Reg = It->second.first;
36194
36195 if (auto It = RegRewriteTable.find(Op2Reg); It != RegRewriteTable.end())
36196 Op2Reg = It->second.second;
36197
36198 MIB =
36199 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
36200 .addReg(Op1Reg)
36201 .addMBB(FalseMBB)
36202 .addReg(Op2Reg)
36203 .addMBB(TrueMBB);
36204
36205 // Add this PHI to the rewrite table.
36206 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36207 }
36208
36209 return MIB;
36210}
36211
36212// Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
36213MachineBasicBlock *
36214X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36215 MachineInstr &SecondCascadedCMOV,
36216 MachineBasicBlock *ThisMBB) const {
36217 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36218 const MIMetadata MIMD(FirstCMOV);
36219
36220 // We lower cascaded CMOVs such as
36221 //
36222 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36223 //
36224 // to two successive branches.
36225 //
36226 // Without this, we would add a PHI between the two jumps, which ends up
36227 // creating a few copies all around. For instance, for
36228 //
36229 // (sitofp (zext (fcmp une)))
36230 //
36231 // we would generate:
36232 //
36233 // ucomiss %xmm1, %xmm0
36234 // movss <1.0f>, %xmm0
36235 // movaps %xmm0, %xmm1
36236 // jne .LBB5_2
36237 // xorps %xmm1, %xmm1
36238 // .LBB5_2:
36239 // jp .LBB5_4
36240 // movaps %xmm1, %xmm0
36241 // .LBB5_4:
36242 // retq
36243 //
36244 // because this custom-inserter would have generated:
36245 //
36246 // A
36247 // | \
36248 // | B
36249 // | /
36250 // C
36251 // | \
36252 // | D
36253 // | /
36254 // E
36255 //
36256 // A: X = ...; Y = ...
36257 // B: empty
36258 // C: Z = PHI [X, A], [Y, B]
36259 // D: empty
36260 // E: PHI [X, C], [Z, D]
36261 //
36262 // If we lower both CMOVs in a single step, we can instead generate:
36263 //
36264 // A
36265 // | \
36266 // | C
36267 // | /|
36268 // |/ |
36269 // | |
36270 // | D
36271 // | /
36272 // E
36273 //
36274 // A: X = ...; Y = ...
36275 // D: empty
36276 // E: PHI [X, A], [X, C], [Y, D]
36277 //
36278 // Which, in our sitofp/fcmp example, gives us something like:
36279 //
36280 // ucomiss %xmm1, %xmm0
36281 // movss <1.0f>, %xmm0
36282 // jne .LBB5_4
36283 // jp .LBB5_4
36284 // xorps %xmm0, %xmm0
36285 // .LBB5_4:
36286 // retq
36287 //
36288
36289 // We lower cascaded CMOV into two successive branches to the same block.
36290 // EFLAGS is used by both, so mark it as live in the second.
36291 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36292 MachineFunction *F = ThisMBB->getParent();
36293 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36294 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36295 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36296
36297 MachineFunction::iterator It = ++ThisMBB->getIterator();
36298 F->insert(It, FirstInsertedMBB);
36299 F->insert(It, SecondInsertedMBB);
36300 F->insert(It, SinkMBB);
36301
36302 // For a cascaded CMOV, we lower it to two successive branches to
36303 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36304 // the FirstInsertedMBB.
36305 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36306
36307 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36308 // live into the sink and copy blocks.
36309 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36310 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36311 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36312 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36313 SinkMBB->addLiveIn(X86::EFLAGS);
36314 }
36315
36316 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36317 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36318 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36319 ThisMBB->end());
36320 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36321
36322 // Fallthrough block for ThisMBB.
36323 ThisMBB->addSuccessor(FirstInsertedMBB);
36324 // The true block target of the first branch is always SinkMBB.
36325 ThisMBB->addSuccessor(SinkMBB);
36326 // Fallthrough block for FirstInsertedMBB.
36327 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36328 // The true block for the branch of FirstInsertedMBB.
36329 FirstInsertedMBB->addSuccessor(SinkMBB);
36330 // This is fallthrough.
36331 SecondInsertedMBB->addSuccessor(SinkMBB);
36332
36333 // Create the conditional branch instructions.
36334 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36335 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36336
36337 X86::CondCode SecondCC =
36338 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36339 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
36340 .addMBB(SinkMBB)
36341 .addImm(SecondCC);
36342
36343 // SinkMBB:
36344 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36345 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36346 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36347 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36348 MachineInstrBuilder MIB =
36349 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
36350 .addReg(Op1Reg)
36351 .addMBB(SecondInsertedMBB)
36352 .addReg(Op2Reg)
36353 .addMBB(ThisMBB);
36354
36355 // SecondInsertedMBB provides the same incoming value as the
36356 // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
36357 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36358
36359 // Now remove the CMOVs.
36360 FirstCMOV.eraseFromParent();
36361 SecondCascadedCMOV.eraseFromParent();
36362
36363 return SinkMBB;
36364}
36365
36366MachineBasicBlock *
36367X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36368 MachineBasicBlock *ThisMBB) const {
36369 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36370 const MIMetadata MIMD(MI);
36371
36372 // To "insert" a SELECT_CC instruction, we actually have to insert the
36373 // diamond control-flow pattern. The incoming instruction knows the
36374 // destination vreg to set, the condition code register to branch on, the
36375 // true/false values to select between and a branch opcode to use.
36376
36377 // ThisMBB:
36378 // ...
36379 // TrueVal = ...
36380 // cmpTY ccX, r1, r2
36381 // bCC copy1MBB
36382 // fallthrough --> FalseMBB
36383
36384 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36385 // as described above, by inserting a BB, and then making a PHI at the join
36386 // point to select the true and false operands of the CMOV in the PHI.
36387 //
36388 // The code also handles two different cases of multiple CMOV opcodes
36389 // in a row.
36390 //
36391 // Case 1:
36392 // In this case, there are multiple CMOVs in a row, all which are based on
36393 // the same condition setting (or the exact opposite condition setting).
36394 // In this case we can lower all the CMOVs using a single inserted BB, and
36395 // then make a number of PHIs at the join point to model the CMOVs. The only
36396 // trickiness here, is that in a case like:
36397 //
36398 // t2 = CMOV cond1 t1, f1
36399 // t3 = CMOV cond1 t2, f2
36400 //
36401 // when rewriting this into PHIs, we have to perform some renaming on the
36402 // temps since you cannot have a PHI operand refer to a PHI result earlier
36403 // in the same block. The "simple" but wrong lowering would be:
36404 //
36405 // t2 = PHI t1(BB1), f1(BB2)
36406 // t3 = PHI t2(BB1), f2(BB2)
36407 //
36408 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36409 // renaming is to note that on the path through BB1, t2 is really just a
36410 // copy of t1, and do that renaming, properly generating:
36411 //
36412 // t2 = PHI t1(BB1), f1(BB2)
36413 // t3 = PHI t1(BB1), f2(BB2)
36414 //
36415 // Case 2:
36416 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36417 // function - EmitLoweredCascadedSelect.
36418
36419 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36420 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36421 MachineInstr *LastCMOV = &MI;
36422 MachineBasicBlock::iterator NextMIIt =
36423 next_nodbg(MachineBasicBlock::iterator(MI), ThisMBB->end());
36424 // Check for case 1, where there are multiple CMOVs with the same condition
36425 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36426 // number of jumps the most.
36427
36428 if (isCMOVPseudo(MI)) {
36429 // See if we have a string of CMOVS with the same condition. Skip over
36430 // intervening debug insts.
36431 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36432 (NextMIIt->getOperand(3).getImm() == CC ||
36433 NextMIIt->getOperand(3).getImm() == OppCC)) {
36434 LastCMOV = &*NextMIIt;
36435 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36436 }
36437 }
36438
36439 // This checks for case 2, but only do this if we didn't already find
36440 // case 1, as indicated by LastCMOV == MI.
36441 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36442 NextMIIt->getOpcode() == MI.getOpcode() &&
36443 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36444 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36445 NextMIIt->getOperand(1).isKill()) {
36446 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36447 }
36448
36449 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36450 MachineFunction *F = ThisMBB->getParent();
36451 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36452 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36453
36454 MachineFunction::iterator It = ++ThisMBB->getIterator();
36455 F->insert(It, FalseMBB);
36456 F->insert(It, SinkMBB);
36457
36458 // Set the call frame size on entry to the new basic blocks.
36459 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
36460 FalseMBB->setCallFrameSize(CallFrameSize);
36461 SinkMBB->setCallFrameSize(CallFrameSize);
36462
36463 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36464 // live into the sink and copy blocks.
36465 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36466 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36467 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36468 FalseMBB->addLiveIn(X86::EFLAGS);
36469 SinkMBB->addLiveIn(X86::EFLAGS);
36470 }
36471
36472 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36473 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36474 MachineBasicBlock::iterator(LastCMOV));
36475 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36476 if (MI.isDebugInstr())
36477 SinkMBB->push_back(MI.removeFromParent());
36478
36479 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36480 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36481 std::next(MachineBasicBlock::iterator(LastCMOV)),
36482 ThisMBB->end());
36483 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36484
36485 // Fallthrough block for ThisMBB.
36486 ThisMBB->addSuccessor(FalseMBB);
36487 // The true block target of the first (or only) branch is always a SinkMBB.
36488 ThisMBB->addSuccessor(SinkMBB);
36489 // Fallthrough block for FalseMBB.
36490 FalseMBB->addSuccessor(SinkMBB);
36491
36492 // Create the conditional branch instruction.
36493 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36494
36495 // SinkMBB:
36496 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36497 // ...
36498 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36499 MachineBasicBlock::iterator MIItEnd =
36500 std::next(MachineBasicBlock::iterator(LastCMOV));
36501 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36502
36503 // Now remove the CMOV(s).
36504 ThisMBB->erase(MIItBegin, MIItEnd);
36505
36506 return SinkMBB;
36507}
36508
36509static unsigned getSUBriOpcode(bool IsLP64) {
36510 if (IsLP64)
36511 return X86::SUB64ri32;
36512 else
36513 return X86::SUB32ri;
36514}
36515
36516MachineBasicBlock *
36517X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36518 MachineBasicBlock *MBB) const {
36519 MachineFunction *MF = MBB->getParent();
36520 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36521 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36522 const MIMetadata MIMD(MI);
36523 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36524
36525 const unsigned ProbeSize = getStackProbeSize(*MF);
36526
36527 MachineRegisterInfo &MRI = MF->getRegInfo();
36528 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36529 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36530 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36531
36532 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36533 MF->insert(MBBIter, testMBB);
36534 MF->insert(MBBIter, blockMBB);
36535 MF->insert(MBBIter, tailMBB);
36536
36537 Register sizeVReg = MI.getOperand(1).getReg();
36538
36539 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36540
36541 Register TmpStackPtr = MRI.createVirtualRegister(
36542 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36543 Register FinalStackPtr = MRI.createVirtualRegister(
36544 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36545
36546 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
36547 .addReg(physSPReg);
36548 {
36549 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36550 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
36551 .addReg(TmpStackPtr)
36552 .addReg(sizeVReg);
36553 }
36554
36555 // test rsp size
36556
36557 BuildMI(testMBB, MIMD,
36558 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36559 .addReg(FinalStackPtr)
36560 .addReg(physSPReg);
36561
36562 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
36563 .addMBB(tailMBB)
36564 .addImm(X86::COND_GE);
36565 testMBB->addSuccessor(blockMBB);
36566 testMBB->addSuccessor(tailMBB);
36567
36568 // Touch the block, then extend it. This is the opposite order from the
36569 // static probe, where we allocate and then touch; it avoids having to probe
36570 // the tail of the static alloca. Possible scenarios are:
36571 //
36572 // + ---- <- ------------ <- ------------- <- ------------ +
36573 // | |
36574 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36575 // | |
36576 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36577 //
36578 // The property we want to enforce is to never have more than [page alloc] between two probes.
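  // In pseudo-code, the loop built below is roughly:
  //   while (SP > FinalStackPtr) { touch(*SP); SP -= ProbeSize; }
  // so every ProbeSize-sized step of the new allocation is touched before the
  // stack pointer moves past it.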
36579
36580 const unsigned XORMIOpc =
36581 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
36582 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
36583 .addImm(0);
36584
36585 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
36586 physSPReg)
36587 .addReg(physSPReg)
36588 .addImm(ProbeSize);
36589
36590 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
36591 blockMBB->addSuccessor(testMBB);
36592
36593 // Replace original instruction by the expected stack ptr
36594 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
36595 MI.getOperand(0).getReg())
36596 .addReg(FinalStackPtr);
36597
36598 tailMBB->splice(tailMBB->end(), MBB,
36599 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36600 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36601 MBB->addSuccessor(testMBB);
36602
36603 // Delete the original pseudo instruction.
36604 MI.eraseFromParent();
36605
36606 // And we're done.
36607 return tailMBB;
36608}
36609
36610MachineBasicBlock *
36611X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36612 MachineBasicBlock *BB) const {
36613 MachineFunction *MF = BB->getParent();
36614 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36615 const MIMetadata MIMD(MI);
36616 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36617
36618 assert(MF->shouldSplitStack());
36619
36620 const bool Is64Bit = Subtarget.is64Bit();
36621 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36622
36623 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36624 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
36625
36626 // BB:
36627 // ... [Till the alloca]
36628 // If stacklet is not large enough, jump to mallocMBB
36629 //
36630 // bumpMBB:
36631 // Allocate by subtracting from RSP
36632 // Jump to continueMBB
36633 //
36634 // mallocMBB:
36635 // Allocate by call to runtime
36636 //
36637 // continueMBB:
36638 // ...
36639 // [rest of original BB]
36640 //
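  // The stacklet check below compares the thread's stack limit, read from the
  // TLS slot at TlsOffset, against the proposed new stack pointer
  // (SPLimitVReg); if the limit is greater (COND_G), the stacklet is too small
  // and control jumps to mallocMBB to call __morestack_allocate_stack_space.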
36641
36642 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36643 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36644 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36645
36646 MachineRegisterInfo &MRI = MF->getRegInfo();
36647 const TargetRegisterClass *AddrRegClass =
36648 getRegClassFor(getPointerTy(MF->getDataLayout()));
36649
36650 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36651 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36652 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36653 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36654 sizeVReg = MI.getOperand(1).getReg(),
36655 physSPReg = IsLP64 ? X86::RSP : X86::ESP;
36656
36657 MachineFunction::iterator MBBIter = ++BB->getIterator();
36658
36659 MF->insert(MBBIter, bumpMBB);
36660 MF->insert(MBBIter, mallocMBB);
36661 MF->insert(MBBIter, continueMBB);
36662
36663 continueMBB->splice(continueMBB->begin(), BB,
36664 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36665 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36666
36667 // Add code to the main basic block to check if the stack limit has been hit,
36668 // and if so, jump to mallocMBB otherwise to bumpMBB.
36669 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36670 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36671 .addReg(tmpSPVReg).addReg(sizeVReg);
36672 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36673 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36674 .addReg(SPLimitVReg);
36675 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36676
36677 // bumpMBB simply decreases the stack pointer, since we know the current
36678 // stacklet has enough space.
36679 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
36680 .addReg(SPLimitVReg);
36681 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36682 .addReg(SPLimitVReg);
36683 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36684
36685 // Calls into a routine in libgcc to allocate more space from the heap.
36686 const uint32_t *RegMask =
36687 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36688 if (IsLP64) {
36689 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
36690 .addReg(sizeVReg);
36691 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36692 .addExternalSymbol("__morestack_allocate_stack_space")
36693 .addRegMask(RegMask)
36694 .addReg(X86::RDI, RegState::Implicit)
36695 .addReg(X86::RAX, RegState::ImplicitDefine);
36696 } else if (Is64Bit) {
36697 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
36698 .addReg(sizeVReg);
36699 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36700 .addExternalSymbol("__morestack_allocate_stack_space")
36701 .addRegMask(RegMask)
36702 .addReg(X86::EDI, RegState::Implicit)
36703 .addReg(X86::EAX, RegState::ImplicitDefine);
36704 } else {
36705 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36706 .addImm(12);
36707 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36708 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
36709 .addExternalSymbol("__morestack_allocate_stack_space")
36710 .addRegMask(RegMask)
36711 .addReg(X86::EAX, RegState::ImplicitDefine);
36712 }
36713
36714 if (!Is64Bit)
36715 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36716 .addImm(16);
36717
36718 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36719 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36720 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36721
36722 // Set up the CFG correctly.
36723 BB->addSuccessor(bumpMBB);
36724 BB->addSuccessor(mallocMBB);
36725 mallocMBB->addSuccessor(continueMBB);
36726 bumpMBB->addSuccessor(continueMBB);
36727
36728 // Take care of the PHI nodes.
36729 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
36730 MI.getOperand(0).getReg())
36731 .addReg(mallocPtrVReg)
36732 .addMBB(mallocMBB)
36733 .addReg(bumpSPPtrVReg)
36734 .addMBB(bumpMBB);
36735
36736 // Delete the original pseudo instruction.
36737 MI.eraseFromParent();
36738
36739 // And we're done.
36740 return continueMBB;
36741}
36742
36743MachineBasicBlock *
36744X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36745 MachineBasicBlock *BB) const {
36746 MachineFunction *MF = BB->getParent();
36747 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36748 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36749 const MIMetadata MIMD(MI);
36750
36751 assert(!isAsynchronousEHPersonality(
36752 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
36753 "SEH does not use catchret!");
36754
36755 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36756 if (!Subtarget.is32Bit())
36757 return BB;
36758
36759 // C++ EH creates a new target block to hold the restore code, and wires up
36760 // the new block to the return destination with a normal JMP_4.
36761 MachineBasicBlock *RestoreMBB =
36762 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36763 assert(BB->succ_size() == 1);
36764 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36765 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36766 BB->addSuccessor(RestoreMBB);
36767 MI.getOperand(0).setMBB(RestoreMBB);
36768
36769 // Marking this as an EH pad but not a funclet entry block causes PEI to
36770 // restore stack pointers in the block.
36771 RestoreMBB->setIsEHPad(true);
36772
36773 auto RestoreMBBI = RestoreMBB->begin();
36774 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36775 return BB;
36776}
36777
36778MachineBasicBlock *
36779X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36780 MachineBasicBlock *BB) const {
36781 // This is pretty easy. We're taking the value that we received from
36782 // our load from the relocation, sticking it in either RDI (x86-64)
36783 // or EAX and doing an indirect call. The return value will then
36784 // be in the normal return register.
36785 MachineFunction *F = BB->getParent();
36786 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36787 const MIMetadata MIMD(MI);
36788
36789 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36790 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36791
36792 // Get a register mask for the lowered call.
36793 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36794 // proper register mask.
36795 const uint32_t *RegMask =
36796 Subtarget.is64Bit() ?
36797 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36798 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36799 if (Subtarget.is64Bit()) {
36800 MachineInstrBuilder MIB =
36801 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
36802 .addReg(X86::RIP)
36803 .addImm(0)
36804 .addReg(0)
36805 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36806 MI.getOperand(3).getTargetFlags())
36807 .addReg(0);
36808 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
36809 addDirectMem(MIB, X86::RDI);
36810 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36811 } else if (!isPositionIndependent()) {
36812 MachineInstrBuilder MIB =
36813 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36814 .addReg(0)
36815 .addImm(0)
36816 .addReg(0)
36817 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36818 MI.getOperand(3).getTargetFlags())
36819 .addReg(0);
36820 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36821 addDirectMem(MIB, X86::EAX);
36822 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36823 } else {
36824 MachineInstrBuilder MIB =
36825 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36826 .addReg(TII->getGlobalBaseReg(F))
36827 .addImm(0)
36828 .addReg(0)
36829 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36830 MI.getOperand(3).getTargetFlags())
36831 .addReg(0);
36832 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36833 addDirectMem(MIB, X86::EAX);
36834 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36835 }
36836
36837 MI.eraseFromParent(); // The pseudo instruction is gone now.
36838 return BB;
36839}
36840
36841static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36842 switch (RPOpc) {
36843 case X86::INDIRECT_THUNK_CALL32:
36844 return X86::CALLpcrel32;
36845 case X86::INDIRECT_THUNK_CALL64:
36846 return X86::CALL64pcrel32;
36847 case X86::INDIRECT_THUNK_TCRETURN32:
36848 return X86::TCRETURNdi;
36849 case X86::INDIRECT_THUNK_TCRETURN64:
36850 return X86::TCRETURNdi64;
36851 }
36852 llvm_unreachable("not indirect thunk opcode");
36853}
36854
36855static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36856 Register Reg) {
36857 if (Subtarget.useRetpolineExternalThunk()) {
36858 // When using an external thunk for retpolines, we pick names that match the
36859 // names GCC happens to use as well. This helps simplify the implementation
36860 // of the thunks for kernels where they have no easy ability to create
36861 // aliases and are doing non-trivial configuration of the thunk's body. For
36862 // example, the Linux kernel will do boot-time hot patching of the thunk
36863 // bodies and cannot easily export aliases of these to loaded modules.
36864 //
36865 // Note that at any point in the future, we may need to change the semantics
36866 // of how we implement retpolines and at that time will likely change the
36867 // name of the called thunk. Essentially, there is no hard guarantee that
36868 // LLVM will generate calls to specific thunks, we merely make a best-effort
36869 // attempt to help out kernels and other systems where duplicating the
36870 // thunks is costly.
36871 switch (Reg.id()) {
36872 case X86::EAX:
36873 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36874 return "__x86_indirect_thunk_eax";
36875 case X86::ECX:
36876 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36877 return "__x86_indirect_thunk_ecx";
36878 case X86::EDX:
36879 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36880 return "__x86_indirect_thunk_edx";
36881 case X86::EDI:
36882 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36883 return "__x86_indirect_thunk_edi";
36884 case X86::R11:
36885 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36886 return "__x86_indirect_thunk_r11";
36887 }
36888 llvm_unreachable("unexpected reg for external indirect thunk");
36889 }
36890
36891 if (Subtarget.useRetpolineIndirectCalls() ||
36892 Subtarget.useRetpolineIndirectBranches()) {
36893 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36894 switch (Reg.id()) {
36895 case X86::EAX:
36896 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36897 return "__llvm_retpoline_eax";
36898 case X86::ECX:
36899 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36900 return "__llvm_retpoline_ecx";
36901 case X86::EDX:
36902 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36903 return "__llvm_retpoline_edx";
36904 case X86::EDI:
36905 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36906 return "__llvm_retpoline_edi";
36907 case X86::R11:
36908 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36909 return "__llvm_retpoline_r11";
36910 }
36911 llvm_unreachable("unexpected reg for retpoline");
36912 }
36913
36914 if (Subtarget.useLVIControlFlowIntegrity()) {
36915 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36916 return "__llvm_lvi_thunk_r11";
36917 }
36918 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
36919}
36920
36921MachineBasicBlock *
36922X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36923 MachineBasicBlock *BB) const {
36924 // Copy the virtual register into the R11 physical register and
36925 // call the retpoline thunk.
36926 const MIMetadata MIMD(MI);
36927 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36928 Register CalleeVReg = MI.getOperand(0).getReg();
36929 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36930
36931 // Find an available scratch register to hold the callee. On 64-bit, we can
36932 // just use R11, but we scan for uses anyway to ensure we don't generate
36933 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36934 // already a register use operand to the call to hold the callee. If none
36935 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36936 // register and ESI is the base pointer to realigned stack frames with VLAs.
36937 SmallVector<Register, 3> AvailableRegs;
36938 if (Subtarget.is64Bit())
36939 AvailableRegs.push_back(X86::R11);
36940 else
36941 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36942
36943 // Zero out any registers that are already used.
36944 for (const auto &MO : MI.operands()) {
36945 if (MO.isReg() && MO.isUse())
36946 llvm::replace(AvailableRegs, MO.getReg(), Register());
36947 }
36948
36949 // Choose the first remaining non-zero available register.
36950 Register AvailableReg;
36951 for (Register MaybeReg : AvailableRegs) {
36952 if (MaybeReg) {
36953 AvailableReg = MaybeReg;
36954 break;
36955 }
36956 }
36957 if (!AvailableReg)
36958 report_fatal_error("calling convention incompatible with retpoline, no "
36959 "available registers");
36960
36961 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36962
36963 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
36964 .addReg(CalleeVReg);
36965 MI.getOperand(0).ChangeToES(Symbol);
36966 MI.setDesc(TII->get(Opc));
36967 MachineInstrBuilder(*BB->getParent(), &MI)
36968 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36969 return BB;
36970}
36971
36972/// SetJmp implies future control flow change upon calling the corresponding
36973/// LongJmp.
36974/// Instead of using the 'return' instruction, the long jump fixes the stack and
36975/// performs an indirect branch. To do so it uses the registers that were stored
36976/// in the jump buffer (when calling SetJmp).
36977/// In case the shadow stack is enabled we need to fix it as well, because some
36978/// return addresses will be skipped.
36979/// The function will save the SSP for future fixing in the function
36980/// emitLongJmpShadowStackFix.
36981/// \sa emitLongJmpShadowStackFix
36982/// \param [in] MI The temporary Machine Instruction for the builtin.
36983/// \param [in] MBB The Machine Basic Block that will be modified.
36984void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
36985 MachineBasicBlock *MBB) const {
36986 const MIMetadata MIMD(MI);
36987 MachineFunction *MF = MBB->getParent();
36988 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36989 MachineRegisterInfo &MRI = MF->getRegInfo();
36990 MachineInstrBuilder MIB;
36991
36992 // Memory Reference.
36993 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36994
36995 // Initialize a register with zero.
36996 MVT PVT = getPointerTy(MF->getDataLayout());
36997 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36998 Register ZReg = MRI.createVirtualRegister(PtrRC);
36999 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
37000 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
37001 .addDef(ZReg)
37002 .addReg(ZReg, RegState::Undef)
37003 .addReg(ZReg, RegState::Undef);
37004
37005 // Read the current SSP Register value to the zeroed register.
37006 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37007 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37008 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37009
37010 // Write the SSP register value to offset 3 in input memory buffer.
37011 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37012 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
37013 const int64_t SSPOffset = 3 * PVT.getStoreSize();
37014 const unsigned MemOpndSlot = 1;
37015 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37016 if (i == X86::AddrDisp)
37017 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37018 else
37019 MIB.add(MI.getOperand(MemOpndSlot + i));
37020 }
37021 MIB.addReg(SSPCopyReg);
37022 MIB.setMemRefs(MMOs);
37023}
37024
37025MachineBasicBlock *
37026X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37027 MachineBasicBlock *MBB) const {
37028 const MIMetadata MIMD(MI);
37029 MachineFunction *MF = MBB->getParent();
37030 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37031 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37032 MachineRegisterInfo &MRI = MF->getRegInfo();
37033
37034 const BasicBlock *BB = MBB->getBasicBlock();
37035 MachineFunction::iterator I = ++MBB->getIterator();
37036
37037 // Memory Reference
37038 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37039
37040 unsigned MemOpndSlot = 0;
37041
37042 unsigned CurOp = 0;
37043
37044 Register DstReg = MI.getOperand(CurOp++).getReg();
37045 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37046 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
37047 (void)TRI;
37048 Register mainDstReg = MRI.createVirtualRegister(RC);
37049 Register restoreDstReg = MRI.createVirtualRegister(RC);
37050
37051 MemOpndSlot = CurOp;
37052
37053 MVT PVT = getPointerTy(MF->getDataLayout());
37054 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37055 "Invalid Pointer Size!");
37056
37057 // For v = setjmp(buf), we generate
37058 //
37059 // thisMBB:
37060 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37061 // SjLjSetup restoreMBB
37062 //
37063 // mainMBB:
37064 // v_main = 0
37065 //
37066 // sinkMBB:
37067 // v = phi(main, restore)
37068 //
37069 // restoreMBB:
37070 // if base pointer being used, load it from frame
37071 // v_restore = 1
37072
37073 MachineBasicBlock *thisMBB = MBB;
37074 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37075 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37076 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37077 MF->insert(I, mainMBB);
37078 MF->insert(I, sinkMBB);
37079 MF->push_back(restoreMBB);
37080 restoreMBB->setMachineBlockAddressTaken();
37081
37082 MachineInstrBuilder MIB;
37083
37084 // Transfer the remainder of BB and its successor edges to sinkMBB.
37085 sinkMBB->splice(sinkMBB->begin(), MBB,
37086 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37087 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37088
37089 // thisMBB:
37090 unsigned PtrStoreOpc = 0;
37091 Register LabelReg;
37092 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37093 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37094 !isPositionIndependent();
37095
37096 // Prepare IP either in reg or imm.
37097 if (!UseImmLabel) {
37098 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37099 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37100 LabelReg = MRI.createVirtualRegister(PtrRC);
37101 if (Subtarget.is64Bit()) {
37102 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
37103 .addReg(X86::RIP)
37104 .addImm(0)
37105 .addReg(0)
37106 .addMBB(restoreMBB)
37107 .addReg(0);
37108 } else {
37109 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37110 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
37111 .addReg(XII->getGlobalBaseReg(MF))
37112 .addImm(0)
37113 .addReg(0)
37114 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37115 .addReg(0);
37116 }
37117 } else
37118 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37119 // Store IP
37120 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
37121 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37122 if (i == X86::AddrDisp)
37123 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37124 else
37125 MIB.add(MI.getOperand(MemOpndSlot + i));
37126 }
37127 if (!UseImmLabel)
37128 MIB.addReg(LabelReg);
37129 else
37130 MIB.addMBB(restoreMBB);
37131 MIB.setMemRefs(MMOs);
37132
37133 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37134 emitSetJmpShadowStackFix(MI, thisMBB);
37135 }
37136
37137 // Setup
37138 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
37139 .addMBB(restoreMBB);
37140
37141 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37142 MIB.addRegMask(RegInfo->getNoPreservedMask());
37143 thisMBB->addSuccessor(mainMBB);
37144 thisMBB->addSuccessor(restoreMBB);
37145
37146 // mainMBB:
37147 // EAX = 0
37148 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
37149 mainMBB->addSuccessor(sinkMBB);
37150
37151 // sinkMBB:
37152 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
37153 .addReg(mainDstReg)
37154 .addMBB(mainMBB)
37155 .addReg(restoreDstReg)
37156 .addMBB(restoreMBB);
37157
37158 // restoreMBB:
37159 if (RegInfo->hasBasePointer(*MF)) {
37160 const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64();
37161 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37162 X86FI->setRestoreBasePointer(MF);
37163 Register FramePtr = RegInfo->getFrameRegister(*MF);
37164 Register BasePtr = RegInfo->getBaseRegister();
37165 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37166 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
37167 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37168 .setMIFlag(MachineInstr::FrameSetup);
37169 }
37170 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37171 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37172 restoreMBB->addSuccessor(sinkMBB);
37173
37174 MI.eraseFromParent();
37175 return sinkMBB;
37176}
37177
37178/// Fix the shadow stack using the previously saved SSP pointer.
37179/// \sa emitSetJmpShadowStackFix
37180/// \param [in] MI The temporary Machine Instruction for the builtin.
37181/// \param [in] MBB The Machine Basic Block that will be modified.
37182/// \return The sink MBB that will perform the future indirect branch.
37183MachineBasicBlock *
37184X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37185 MachineBasicBlock *MBB) const {
37186 const MIMetadata MIMD(MI);
37187 MachineFunction *MF = MBB->getParent();
37188 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37189 MachineRegisterInfo &MRI = MF->getRegInfo();
37190
37191 // Memory Reference
37192 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37193
37194 MVT PVT = getPointerTy(MF->getDataLayout());
37195 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37196
37197 // checkSspMBB:
37198 // xor vreg1, vreg1
37199 // rdssp vreg1
37200 // test vreg1, vreg1
37201 // je sinkMBB # Jump if Shadow Stack is not supported
37202 // fallMBB:
37203 // mov buf+24/12(%rip), vreg2
37204 // sub vreg1, vreg2
37205 // jbe sinkMBB # No need to fix the Shadow Stack
37206 // fixShadowMBB:
37207 // shr 3/2, vreg2
37208 // incssp vreg2 # fix the SSP according to the lower 8 bits
37209 // shr 8, vreg2
37210 // je sinkMBB
37211 // fixShadowLoopPrepareMBB:
37212 // shl vreg2
37213 // mov 128, vreg3
37214 // fixShadowLoopMBB:
37215 // incssp vreg3
37216 // dec vreg2
37217 // jne fixShadowLoopMBB # Iterate until you finish fixing
37218 // # the Shadow Stack
37219 // sinkMBB:
37220
37221 MachineFunction::iterator I = ++MBB->getIterator();
37222 const BasicBlock *BB = MBB->getBasicBlock();
37223
37224 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37225 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37226 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37227 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37228 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37229 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37230 MF->insert(I, checkSspMBB);
37231 MF->insert(I, fallMBB);
37232 MF->insert(I, fixShadowMBB);
37233 MF->insert(I, fixShadowLoopPrepareMBB);
37234 MF->insert(I, fixShadowLoopMBB);
37235 MF->insert(I, sinkMBB);
37236
37237 // Transfer the remainder of BB and its successor edges to sinkMBB.
37238 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37239 MBB->end());
37240 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37241
37242 MBB->addSuccessor(checkSspMBB);
37243
37244 // Initialize a register with zero.
37245 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37246 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
37247
37248 if (PVT == MVT::i64) {
37249 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37250 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37251 .addImm(0)
37252 .addReg(ZReg)
37253 .addImm(X86::sub_32bit);
37254 ZReg = TmpZReg;
37255 }
37256
37257 // Read the current SSP Register value to the zeroed register.
37258 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37259 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37260 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37261
37262 // Check whether the result of the SSP register is zero and jump directly
37263 // to the sink.
37264 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37265 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
37266 .addReg(SSPCopyReg)
37267 .addReg(SSPCopyReg);
37268 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
37269 .addMBB(sinkMBB)
37270 .addImm(X86::COND_E);
37271 checkSspMBB->addSuccessor(sinkMBB);
37272 checkSspMBB->addSuccessor(fallMBB);
37273
37274 // Reload the previously saved SSP register value.
37275 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37276 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37277 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37278 MachineInstrBuilder MIB =
37279 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
37280 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37281 const MachineOperand &MO = MI.getOperand(i);
37282 if (i == X86::AddrDisp)
37283 MIB.addDisp(MO, SPPOffset);
37284 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37285 // preserve kill flags.
37286 MIB.addReg(MO.getReg());
37287 else
37288 MIB.add(MO);
37289 }
37290 MIB.setMemRefs(MMOs);
37291
37292 // Subtract the current SSP from the previous SSP.
37293 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37294 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37295 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
37296 .addReg(PrevSSPReg)
37297 .addReg(SSPCopyReg);
37298
37299 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37300 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
37301 .addMBB(sinkMBB)
37302 .addImm(X86::COND_BE);
37303 fallMBB->addSuccessor(sinkMBB);
37304 fallMBB->addSuccessor(fixShadowMBB);
37305
37306 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37307 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37308 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37309 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37310 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
37311 .addReg(SspSubReg)
37312 .addImm(Offset);
37313
37314 // Increase SSP when looking only on the lower 8 bits of the delta.
37315 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37316 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37317
37318 // Reset the lower 8 bits.
37319 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37320 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
37321 .addReg(SspFirstShrReg)
37322 .addImm(8);
37323
37324 // Jump if the result of the shift is zero.
37325 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
37326 .addMBB(sinkMBB)
37327 .addImm(X86::COND_E);
37328 fixShadowMBB->addSuccessor(sinkMBB);
37329 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37330
37331 // Do a single shift left.
37332 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
37333 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37334 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
37335 .addReg(SspSecondShrReg)
37336 .addImm(1);
37337
37338 // Save the value 128 to a register (will be used next with incssp).
37339 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37340 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37341 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
37342 .addImm(128);
37343 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37344
37345 // Since incssp only looks at the lower 8 bits, we might need to do several
37346 // iterations of incssp until we finish fixing the shadow stack.
37347 Register DecReg = MRI.createVirtualRegister(PtrRC);
37348 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37349 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
37350 .addReg(SspAfterShlReg)
37351 .addMBB(fixShadowLoopPrepareMBB)
37352 .addReg(DecReg)
37353 .addMBB(fixShadowLoopMBB);
37354
37355 // Every iteration we increase the SSP by 128.
37356 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
37357
37358 // Every iteration we decrement the counter by 1.
37359 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37360 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
37361
37362 // Jump if the counter is not zero yet.
37363 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
37364 .addMBB(fixShadowLoopMBB)
37365 .addImm(X86::COND_NE);
37366 fixShadowLoopMBB->addSuccessor(sinkMBB);
37367 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37368
37369 return sinkMBB;
37370}
37371
37372MachineBasicBlock *
37373X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37374 MachineBasicBlock *MBB) const {
37375 const MIMetadata MIMD(MI);
37376 MachineFunction *MF = MBB->getParent();
37377 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37378 MachineRegisterInfo &MRI = MF->getRegInfo();
37379
37380 // Memory Reference
37381 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37382
37383 MVT PVT = getPointerTy(MF->getDataLayout());
37384 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37385 "Invalid Pointer Size!");
37386
37387 const TargetRegisterClass *RC =
37388 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37389 Register Tmp = MRI.createVirtualRegister(RC);
37390 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37391 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37392 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37393 Register SP = RegInfo->getStackRegister();
37394
37395 MachineInstrBuilder MIB;
37396
37397 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37398 const int64_t SPOffset = 2 * PVT.getStoreSize();
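  // Jump buffer layout used by this expansion: slot 0 holds the frame pointer,
  // slot 1 the resume address (LabelOffset), slot 2 the stack pointer
  // (SPOffset), and slot 3 the saved SSP used by the shadow-stack fix.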
37399
37400 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37401 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37402
37403 MachineBasicBlock *thisMBB = MBB;
37404
37405 // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
37406 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37407 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37408 }
37409
37410 // Reload FP
37411 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
37412 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37413 const MachineOperand &MO = MI.getOperand(i);
37414 if (MO.isReg()) // Don't add the whole operand, we don't want to
37415 // preserve kill flags.
37416 MIB.addReg(MO.getReg());
37417 else
37418 MIB.add(MO);
37419 }
37420 MIB.setMemRefs(MMOs);
37422
37423 // Reload IP
37424 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
37425 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37426 const MachineOperand &MO = MI.getOperand(i);
37427 if (i == X86::AddrDisp)
37428 MIB.addDisp(MO, LabelOffset);
37429 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37430 // preserve kill flags.
37431 MIB.addReg(MO.getReg());
37432 else
37433 MIB.add(MO);
37434 }
37435 MIB.setMemRefs(MMOs);
37436
37437 // Reload SP
37438 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
37439 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37440 if (i == X86::AddrDisp)
37441 MIB.addDisp(MI.getOperand(i), SPOffset);
37442 else
37443 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37444 // the last instruction of the expansion.
37445 }
37446 MIB.setMemRefs(MMOs);
37448
37449 // Jump
37450 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
37451
37452 MI.eraseFromParent();
37453 return thisMBB;
37454}
37455
37456void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37457 MachineBasicBlock *MBB,
37458 MachineBasicBlock *DispatchBB,
37459 int FI) const {
37460 const MIMetadata MIMD(MI);
37461 MachineFunction *MF = MBB->getParent();
37462 MachineRegisterInfo *MRI = &MF->getRegInfo();
37463 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37464
37465 MVT PVT = getPointerTy(MF->getDataLayout());
37466 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37467
37468 unsigned Op = 0;
37469 Register VR;
37470
37471 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37472 !isPositionIndependent();
37473
37474 if (UseImmLabel) {
37475 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37476 } else {
37477 const TargetRegisterClass *TRC =
37478 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37479 VR = MRI->createVirtualRegister(TRC);
37480 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37481
37482 if (Subtarget.is64Bit())
37483 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
37484 .addReg(X86::RIP)
37485 .addImm(1)
37486 .addReg(0)
37487 .addMBB(DispatchBB)
37488 .addReg(0);
37489 else
37490 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
37491 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37492 .addImm(1)
37493 .addReg(0)
37494 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37495 .addReg(0);
37496 }
37497
37498 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
37499 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37500 if (UseImmLabel)
37501 MIB.addMBB(DispatchBB);
37502 else
37503 MIB.addReg(VR);
37504}
37505
37506 MachineBasicBlock *
37507 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37508 MachineBasicBlock *BB) const {
37509 const MIMetadata MIMD(MI);
37510 MachineFunction *MF = BB->getParent();
37511 MachineRegisterInfo *MRI = &MF->getRegInfo();
37512 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37513 int FI = MF->getFrameInfo().getFunctionContextIndex();
37514
37515 // Get a mapping of the call site numbers to all of the landing pads they're
37516 // associated with.
37517 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37518 unsigned MaxCSNum = 0;
37519 for (auto &MBB : *MF) {
37520 if (!MBB.isEHPad())
37521 continue;
37522
37523 MCSymbol *Sym = nullptr;
37524 for (const auto &MI : MBB) {
37525 if (MI.isDebugInstr())
37526 continue;
37527
37528 assert(MI.isEHLabel() && "expected EH_LABEL");
37529 Sym = MI.getOperand(0).getMCSymbol();
37530 break;
37531 }
37532
37533 if (!MF->hasCallSiteLandingPad(Sym))
37534 continue;
37535
37536 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37537 CallSiteNumToLPad[CSI].push_back(&MBB);
37538 MaxCSNum = std::max(MaxCSNum, CSI);
37539 }
37540 }
37541
37542 // Get an ordered list of the machine basic blocks for the jump table.
37543 std::vector<MachineBasicBlock *> LPadList;
37544 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37545 LPadList.reserve(CallSiteNumToLPad.size());
37546
37547 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37548 for (auto &LP : CallSiteNumToLPad[CSI]) {
37549 LPadList.push_back(LP);
37550 InvokeBBs.insert_range(LP->predecessors());
37551 }
37552 }
37553
37554 assert(!LPadList.empty() &&
37555 "No landing pad destinations for the dispatch jump table!");
37556
37557 // Create the MBBs for the dispatch code.
37558
37559 // Shove the dispatch's address into the return slot in the function context.
37560 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37561 DispatchBB->setIsEHPad(true);
37562
37563 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37564 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
37565 DispatchBB->addSuccessor(TrapBB);
37566
37567 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37568 DispatchBB->addSuccessor(DispContBB);
37569
37570 // Insert MBBs.
37571 MF->push_back(DispatchBB);
37572 MF->push_back(DispContBB);
37573 MF->push_back(TrapBB);
37574
37575 // Insert code into the entry block that creates and registers the function
37576 // context.
37577 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37578
37579 // Create the jump table and associated information
37580 unsigned JTE = getJumpTableEncoding();
37581 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37582 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37583
37584 const X86RegisterInfo &RI = TII->getRegisterInfo();
37585 // Add a register mask with no preserved registers. This results in all
37586 // registers being marked as clobbered.
37587 if (RI.hasBasePointer(*MF)) {
37588 const bool FPIs64Bit = Subtarget.isTarget64BitLP64();
37589 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37590 MFI->setRestoreBasePointer(MF);
37591
37592 Register FP = RI.getFrameRegister(*MF);
37593 Register BP = RI.getBaseRegister();
37594 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37595 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
37596 MFI->getRestoreBasePointerOffset())
37597 .addRegMask(RI.getNoPreservedMask());
37598 } else {
37599 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
37600 .addRegMask(RI.getNoPreservedMask());
37601 }
37602
37603 // IReg is used as an index in a memory operand and therefore can't be SP
37604 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37605 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
37606 Subtarget.is64Bit() ? 8 : 4);
37607 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
37608 .addReg(IReg)
37609 .addImm(LPadList.size());
37610 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
37611 .addMBB(TrapBB)
37612 .addImm(X86::COND_AE);
37613
37614 if (Subtarget.is64Bit()) {
37615 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37616 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37617
37618 // leaq .LJTI0_0(%rip), BReg
37619 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
37620 .addReg(X86::RIP)
37621 .addImm(1)
37622 .addReg(0)
37623 .addJumpTableIndex(MJTI)
37624 .addReg(0);
37625 // movzx IReg64, IReg
37626 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37627 .addImm(0)
37628 .addReg(IReg)
37629 .addImm(X86::sub_32bit);
37630
37631 switch (JTE) {
37632 case MachineJumpTableInfo::EK_BlockAddress:
37633 // jmpq *(BReg,IReg64,8)
37634 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
37635 .addReg(BReg)
37636 .addImm(8)
37637 .addReg(IReg64)
37638 .addImm(0)
37639 .addReg(0);
37640 break;
37641 case MachineJumpTableInfo::EK_LabelDifference32: {
37642 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37643 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37644 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37645
37646 // movl (BReg,IReg64,4), OReg
37647 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
37648 .addReg(BReg)
37649 .addImm(4)
37650 .addReg(IReg64)
37651 .addImm(0)
37652 .addReg(0);
37653 // movsx OReg64, OReg
37654 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
37655 .addReg(OReg);
37656 // addq BReg, OReg64, TReg
37657 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
37658 .addReg(OReg64)
37659 .addReg(BReg);
37660 // jmpq *TReg
37661 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
37662 break;
37663 }
37664 default:
37665 llvm_unreachable("Unexpected jump table encoding");
37666 }
37667 } else {
37668 // jmpl *.LJTI0_0(,IReg,4)
37669 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
37670 .addReg(0)
37671 .addImm(4)
37672 .addReg(IReg)
37673 .addJumpTableIndex(MJTI)
37674 .addReg(0);
37675 }
37676
37677 // Add the jump table entries as successors to the MBB.
37678 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37679 for (auto &LP : LPadList)
37680 if (SeenMBBs.insert(LP).second)
37681 DispContBB->addSuccessor(LP);
37682
37683 // N.B. the order the invoke BBs are processed in doesn't matter here.
37684 SmallVector<MachineBasicBlock *, 64> MBBLPads;
37685 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37686 for (MachineBasicBlock *MBB : InvokeBBs) {
37687 // Remove the landing pad successor from the invoke block and replace it
37688 // with the new dispatch block.
37689 // Keep a copy of Successors since it's modified inside the loop.
37690 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37691 MBB->succ_rend());
37692 // FIXME: Avoid quadratic complexity.
37693 for (auto *MBBS : Successors) {
37694 if (MBBS->isEHPad()) {
37695 MBB->removeSuccessor(MBBS);
37696 MBBLPads.push_back(MBBS);
37697 }
37698 }
37699
37700 MBB->addSuccessor(DispatchBB);
37701
37702 // Find the invoke call and mark all of the callee-saved registers as
37703 // 'implicit defined' so that they're spilled. This prevents code from
37704 // moving instructions to before the EH block, where they will never be
37705 // executed.
37706 for (auto &II : reverse(*MBB)) {
37707 if (!II.isCall())
37708 continue;
37709
37710 DenseSet<Register> DefRegs;
37711 for (auto &MOp : II.operands())
37712 if (MOp.isReg())
37713 DefRegs.insert(MOp.getReg());
37714
37715 MachineInstrBuilder MIB(*MF, &II);
37716 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37717 Register Reg = SavedRegs[RegIdx];
37718 if (!DefRegs.contains(Reg))
37719 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37720 }
37721
37722 break;
37723 }
37724 }
37725
37726 // Mark all former landing pads as non-landing pads. The dispatch is the only
37727 // landing pad now.
37728 for (auto &LP : MBBLPads)
37729 LP->setIsEHPad(false);
37730
37731 // The instruction is gone now.
37732 MI.eraseFromParent();
37733 return BB;
37734}
37735
37736 MachineBasicBlock *
37737 X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
37738 MachineBasicBlock *BB) const {
37739 // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing
37740 // calls may require proper stack alignment.
37741 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37742 const MIMetadata MIMD(MI);
37743 MachineFunction &MF = *BB->getParent();
37744
37745 // Emit CALLSEQ_START right before the instruction.
37746 MF.getFrameInfo().setAdjustsStack(true);
37747 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37748 MachineInstrBuilder CallseqStart =
37749 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37750 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37751
37752 // Emit CALLSEQ_END right after the instruction.
37753 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37754 MachineInstrBuilder CallseqEnd =
37755 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
37756 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37757
37758 return BB;
37759}
37760
37761 MachineBasicBlock *
37762 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37763 MachineBasicBlock *BB) const {
37764 MachineFunction *MF = BB->getParent();
37765 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37766 const MIMetadata MIMD(MI);
37767
37768 auto TMMImmToTMMReg = [](unsigned Imm) {
37769 assert (Imm < 8 && "Illegal tmm index");
37770 return X86::TMM0 + Imm;
37771 };
37772 auto TMMImmToTMMPair = [](unsigned Imm) {
37773 assert(Imm < 8 && "Illegal tmm pair index.");
37774 return X86::TMM0_TMM1 + Imm / 2;
37775 };
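// For example, tile indices 2 and 3 both select the second register pair
// (X86::TMM2_TMM3).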
37776 switch (MI.getOpcode()) {
37777 default:
37778 llvm_unreachable("Unexpected instr type to insert");
37779 case X86::INDIRECT_THUNK_CALL32:
37780 case X86::INDIRECT_THUNK_CALL64:
37781 case X86::INDIRECT_THUNK_TCRETURN32:
37782 case X86::INDIRECT_THUNK_TCRETURN64:
37783 return EmitLoweredIndirectThunk(MI, BB);
37784 case X86::CATCHRET:
37785 return EmitLoweredCatchRet(MI, BB);
37786 case X86::SEG_ALLOCA_32:
37787 case X86::SEG_ALLOCA_64:
37788 return EmitLoweredSegAlloca(MI, BB);
37789 case X86::PROBED_ALLOCA_32:
37790 case X86::PROBED_ALLOCA_64:
37791 return EmitLoweredProbedAlloca(MI, BB);
37792 case X86::TLSCall_32:
37793 case X86::TLSCall_64:
37794 return EmitLoweredTLSCall(MI, BB);
37795 case X86::CMOV_FR16:
37796 case X86::CMOV_FR16X:
37797 case X86::CMOV_FR32:
37798 case X86::CMOV_FR32X:
37799 case X86::CMOV_FR64:
37800 case X86::CMOV_FR64X:
37801 case X86::CMOV_GR8:
37802 case X86::CMOV_GR16:
37803 case X86::CMOV_GR32:
37804 case X86::CMOV_RFP32:
37805 case X86::CMOV_RFP64:
37806 case X86::CMOV_RFP80:
37807 case X86::CMOV_VR64:
37808 case X86::CMOV_VR128:
37809 case X86::CMOV_VR128X:
37810 case X86::CMOV_VR256:
37811 case X86::CMOV_VR256X:
37812 case X86::CMOV_VR512:
37813 case X86::CMOV_VK1:
37814 case X86::CMOV_VK2:
37815 case X86::CMOV_VK4:
37816 case X86::CMOV_VK8:
37817 case X86::CMOV_VK16:
37818 case X86::CMOV_VK32:
37819 case X86::CMOV_VK64:
37820 return EmitLoweredSelect(MI, BB);
37821
37822 case X86::FP80_ADDr:
37823 case X86::FP80_ADDm32: {
37824 // Change the floating point control register to use double extended
37825 // precision when performing the addition.
37826 int OrigCWFrameIdx =
37827 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37828 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37829 OrigCWFrameIdx);
37830
37831 // Load the old value of the control word...
37832 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37833 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37834 OrigCWFrameIdx);
37835
37836 // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended
37837 // precision.
37838 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37839 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37840 .addReg(OldCW, RegState::Kill)
37841 .addImm(0x300);
37842
37843 // Extract to 16 bits.
37844 Register NewCW16 =
37845 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37846 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37847 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37848
37849 // Prepare memory for FLDCW.
37850 int NewCWFrameIdx =
37851 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37852 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37853 NewCWFrameIdx)
37854 .addReg(NewCW16, RegState::Kill);
37855
37856 // Reload the modified control word now...
37857 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37858 NewCWFrameIdx);
37859
37860 // Do the addition.
37861 if (MI.getOpcode() == X86::FP80_ADDr) {
37862 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
37863 .add(MI.getOperand(0))
37864 .add(MI.getOperand(1))
37865 .add(MI.getOperand(2));
37866 } else {
37867 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
37868 .add(MI.getOperand(0))
37869 .add(MI.getOperand(1))
37870 .add(MI.getOperand(2))
37871 .add(MI.getOperand(3))
37872 .add(MI.getOperand(4))
37873 .add(MI.getOperand(5))
37874 .add(MI.getOperand(6));
37875 }
37876
37877 // Reload the original control word now.
37878 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37879 OrigCWFrameIdx);
37880
37881 MI.eraseFromParent(); // The pseudo instruction is gone now.
37882 return BB;
37883 }
37884
37885 case X86::FP32_TO_INT16_IN_MEM:
37886 case X86::FP32_TO_INT32_IN_MEM:
37887 case X86::FP32_TO_INT64_IN_MEM:
37888 case X86::FP64_TO_INT16_IN_MEM:
37889 case X86::FP64_TO_INT32_IN_MEM:
37890 case X86::FP64_TO_INT64_IN_MEM:
37891 case X86::FP80_TO_INT16_IN_MEM:
37892 case X86::FP80_TO_INT32_IN_MEM:
37893 case X86::FP80_TO_INT64_IN_MEM: {
37894 // Change the floating point control register to use "round towards zero"
37895 // mode when truncating to an integer value.
37896 int OrigCWFrameIdx =
37897 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37898 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37899 OrigCWFrameIdx);
37900
37901 // Load the old value of the control word...
37902 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37903 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37904 OrigCWFrameIdx);
37905
37906 // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
37907 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37908 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37909 .addReg(OldCW, RegState::Kill).addImm(0xC00);
37910
37911 // Extract to 16 bits.
37912 Register NewCW16 =
37913 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37914 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37915 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37916
37917 // Prepare memory for FLDCW.
37918 int NewCWFrameIdx =
37919 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37920 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37921 NewCWFrameIdx)
37922 .addReg(NewCW16, RegState::Kill);
37923
37924 // Reload the modified control word now...
37925 addFrameReference(BuildMI(*BB, MI, MIMD,
37926 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37927
37928 // Get the X86 opcode to use.
37929 unsigned Opc;
37930 switch (MI.getOpcode()) {
37931 // clang-format off
37932 default: llvm_unreachable("illegal opcode!");
37933 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37934 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37935 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37936 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37937 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37938 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37939 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37940 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37941 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37942 // clang-format on
37943 }
37944
37945 X86AddressMode AM = getAddressFromInstr(&MI, 0);
37946 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
37947 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37948
37949 // Reload the original control word now.
37950 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37951 OrigCWFrameIdx);
37952
37953 MI.eraseFromParent(); // The pseudo instruction is gone now.
37954 return BB;
37955 }
37956
37957 // xbegin
37958 case X86::XBEGIN:
37959 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37960
37961 case X86::VAARG_64:
37962 case X86::VAARG_X32:
37963 return EmitVAARGWithCustomInserter(MI, BB);
37964
37965 case X86::EH_SjLj_SetJmp32:
37966 case X86::EH_SjLj_SetJmp64:
37967 return emitEHSjLjSetJmp(MI, BB);
37968
37969 case X86::EH_SjLj_LongJmp32:
37970 case X86::EH_SjLj_LongJmp64:
37971 return emitEHSjLjLongJmp(MI, BB);
37972
37973 case X86::Int_eh_sjlj_setup_dispatch:
37974 return EmitSjLjDispatchBlock(MI, BB);
37975
37976 case TargetOpcode::STATEPOINT:
37977 // As an implementation detail, STATEPOINT shares the STACKMAP format at
37978 // this point in the process. We diverge later.
37979 return emitPatchPoint(MI, BB);
37980
37981 case TargetOpcode::STACKMAP:
37982 case TargetOpcode::PATCHPOINT:
37983 return emitPatchPoint(MI, BB);
37984
37985 case TargetOpcode::PATCHABLE_EVENT_CALL:
37986 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
37987 return emitPatchableEventCall(MI, BB);
37988
37989 case X86::LCMPXCHG8B: {
37990 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37991 // In addition to the 4 E[ABCD] registers implied by the encoding, CMPXCHG8B
37992 // requires a memory operand. If the current architecture is
37993 // i686 and the current function needs a base pointer
37994 // - which is ESI for i686 - the register allocator would not be able to
37995 // allocate registers for an address of the form X(%reg, %reg, Y)
37996 // - there would never be enough unreserved registers during regalloc
37997 // (without the need for a base ptr the only option would be X(%edi, %esi, Y)).
37998 // We give the register allocator a hand by precomputing the address in
37999 // a new vreg using LEA.
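// For example, an operand such as 12(%esi,%ecx,4) is precomputed below as
// "leal 12(%esi,%ecx,4), %vreg" so that CMPXCHG8B only needs (%vreg).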
38000
38001 // If it is not i686 or there is no base pointer - nothing to do here.
38002 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
38003 return BB;
38004
38005 // Even though this code does not necessarily need the base pointer to
38006 // be ESI, we check for that. The reason: if this assert fails, some
38007 // changes have happened in the compiler's base pointer handling, which most
38008 // probably have to be addressed somehow here.
38009 assert(TRI->getBaseRegister() == X86::ESI &&
38010 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
38011 "base pointer in mind");
38012
38013 MachineRegisterInfo &MRI = MF->getRegInfo();
38014 MVT SPTy = getPointerTy(MF->getDataLayout());
38015 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38016 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38017
38018 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38019 // Regalloc does not need any help when the memory operand of CMPXCHG8B
38020 // does not use index register.
38021 if (AM.IndexReg == X86::NoRegister)
38022 return BB;
38023
38024 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38025 // four operand definitions that are E[ABCD] registers. We skip them and
38026 // then insert the LEA.
38027 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38028 while (RMBBI != BB->rend() &&
38029 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
38030 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
38031 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
38032 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
38033 ++RMBBI;
38034 }
38035 MachineBasicBlock::iterator MBBI(RMBBI);
38036 addFullAddress(
38037 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
38038
38039 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38040
38041 return BB;
38042 }
38043 case X86::LCMPXCHG16B_NO_RBX: {
38044 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38045 Register BasePtr = TRI->getBaseRegister();
38046 if (TRI->hasBasePointer(*MF) &&
38047 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38048 if (!BB->isLiveIn(BasePtr))
38049 BB->addLiveIn(BasePtr);
38050 // Save RBX into a virtual register.
38051 Register SaveRBX =
38052 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38053 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38054 .addReg(X86::RBX);
38055 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38056 MachineInstrBuilder MIB =
38057 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38058 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38059 MIB.add(MI.getOperand(Idx));
38060 MIB.add(MI.getOperand(X86::AddrNumOperands));
38061 MIB.addReg(SaveRBX);
38062 } else {
38063 // Simple case, just copy the virtual register to RBX.
38064 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
38065 .add(MI.getOperand(X86::AddrNumOperands));
38066 MachineInstrBuilder MIB =
38067 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
38068 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38069 MIB.add(MI.getOperand(Idx));
38070 }
38071 MI.eraseFromParent();
38072 return BB;
38073 }
38074 case X86::MWAITX: {
38075 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38076 Register BasePtr = TRI->getBaseRegister();
38077 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38078 // If there is no need to save the base pointer, we generate MWAITXrrr;
38079 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
38080 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38081 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38082 .addReg(MI.getOperand(0).getReg());
38083 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38084 .addReg(MI.getOperand(1).getReg());
38085 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
38086 .addReg(MI.getOperand(2).getReg());
38087 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
38088 MI.eraseFromParent();
38089 } else {
38090 if (!BB->isLiveIn(BasePtr)) {
38091 BB->addLiveIn(BasePtr);
38092 }
38093 // Parameters can be copied into ECX and EAX but not EBX yet.
38094 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38095 .addReg(MI.getOperand(0).getReg());
38096 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38097 .addReg(MI.getOperand(1).getReg());
38098 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38099 // Save RBX into a virtual register.
38100 Register SaveRBX =
38101 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38102 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38103 .addReg(X86::RBX);
38104 // Generate mwaitx pseudo.
38105 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38106 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
38107 .addDef(Dst) // Destination tied in with SaveRBX.
38108 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38109 .addUse(SaveRBX); // Save of base pointer.
38110 MI.eraseFromParent();
38111 }
38112 return BB;
38113 }
38114 case TargetOpcode::PREALLOCATED_SETUP: {
38115 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38116 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38117 MFI->setHasPreallocatedCall(true);
38118 int64_t PreallocatedId = MI.getOperand(0).getImm();
38119 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38120 assert(StackAdjustment != 0 && "0 stack adjustment");
38121 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38122 << StackAdjustment << "\n");
38123 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
38124 .addReg(X86::ESP)
38125 .addImm(StackAdjustment);
38126 MI.eraseFromParent();
38127 return BB;
38128 }
38129 case TargetOpcode::PREALLOCATED_ARG: {
38130 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38131 int64_t PreallocatedId = MI.getOperand(1).getImm();
38132 int64_t ArgIdx = MI.getOperand(2).getImm();
38133 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38134 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38135 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38136 << ", arg offset " << ArgOffset << "\n");
38137 // stack pointer + offset
38138 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
38139 MI.getOperand(0).getReg()),
38140 X86::ESP, false, ArgOffset);
38141 MI.eraseFromParent();
38142 return BB;
38143 }
38144 case X86::PTDPBSSD:
38145 case X86::PTDPBSUD:
38146 case X86::PTDPBUSD:
38147 case X86::PTDPBUUD:
38148 case X86::PTDPBF16PS:
38149 case X86::PTDPFP16PS:
38150 case X86::PTCMMIMFP16PS:
38151 case X86::PTCMMRLFP16PS:
38152 case X86::PTDPBF8PS:
38153 case X86::PTDPBHF8PS:
38154 case X86::PTDPHBF8PS:
38155 case X86::PTDPHF8PS:
38156 case X86::PTTDPBF16PS:
38157 case X86::PTTDPFP16PS:
38158 case X86::PTTCMMIMFP16PS:
38159 case X86::PTTCMMRLFP16PS:
38160 case X86::PTCONJTCMMIMFP16PS:
38161 case X86::PTMMULTF32PS:
38162 case X86::PTTMMULTF32PS: {
38163 unsigned Opc;
38164 switch (MI.getOpcode()) {
38165 default: llvm_unreachable("illegal opcode!");
38166 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38167 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38168 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38169 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38170 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38171 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38172 case X86::PTCMMIMFP16PS:
38173 Opc = X86::TCMMIMFP16PS;
38174 break;
38175 case X86::PTCMMRLFP16PS:
38176 Opc = X86::TCMMRLFP16PS;
38177 break;
38178 case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
38179 case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
38180 case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
38181 case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
38182 case X86::PTTDPBF16PS:
38183 Opc = X86::TTDPBF16PS;
38184 break;
38185 case X86::PTTDPFP16PS:
38186 Opc = X86::TTDPFP16PS;
38187 break;
38188 case X86::PTTCMMIMFP16PS:
38189 Opc = X86::TTCMMIMFP16PS;
38190 break;
38191 case X86::PTTCMMRLFP16PS:
38192 Opc = X86::TTCMMRLFP16PS;
38193 break;
38194 case X86::PTCONJTCMMIMFP16PS:
38195 Opc = X86::TCONJTCMMIMFP16PS;
38196 break;
38197 case X86::PTMMULTF32PS:
38198 Opc = X86::TMMULTF32PS;
38199 break;
38200 case X86::PTTMMULTF32PS:
38201 Opc = X86::TTMMULTF32PS;
38202 break;
38203 }
38204
38205 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38206 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38207 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38208 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38209 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38210
38211 MI.eraseFromParent(); // The pseudo is gone now.
38212 return BB;
38213 }
38214 case X86::PTILEZERO: {
38215 unsigned Imm = MI.getOperand(0).getImm();
38216 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38217 MI.eraseFromParent(); // The pseudo is gone now.
38218 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38219 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
38220 return BB;
38221 }
38222 case X86::PTILEZEROV: {
38223 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38224 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
38225 return BB;
38226 }
38227 case X86::PTILELOADDRS:
38228 case X86::PTILELOADDRST1:
38229 case X86::PTILELOADD:
38230 case X86::PTILELOADDT1:
38231 case X86::PTILESTORED: {
38232 unsigned Opc;
38233 switch (MI.getOpcode()) {
38234 default: llvm_unreachable("illegal opcode!");
38235#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38236 case X86::PTILELOADD:
38237 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
38238 break;
38239 case X86::PTILELOADDT1:
38240 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
38241 break;
38242 case X86::PTILESTORED:
38243 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
38244 break;
38245 case X86::PTILELOADDRS:
38246 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS);
38247 break;
38248 case X86::PTILELOADDRST1:
38249 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1);
38250 break;
38251 }
38252#undef GET_EGPR_IF_ENABLED
38253
38254 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38255 unsigned CurOp = 0;
38256 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
38257 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38258 RegState::Define);
38259
38260 MIB.add(MI.getOperand(CurOp++)); // base
38261 MIB.add(MI.getOperand(CurOp++)); // scale
38262 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38263 MIB.add(MI.getOperand(CurOp++)); // displacement
38264 MIB.add(MI.getOperand(CurOp++)); // segment
38265
38266 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
38267 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38268 RegState::Undef);
38269
38270 MI.eraseFromParent(); // The pseudo is gone now.
38271 return BB;
38272 }
38273 case X86::PT2RPNTLVWZ0:
38274 case X86::PT2RPNTLVWZ0T1:
38275 case X86::PT2RPNTLVWZ1:
38276 case X86::PT2RPNTLVWZ1T1:
38277 case X86::PT2RPNTLVWZ0RS:
38278 case X86::PT2RPNTLVWZ0RST1:
38279 case X86::PT2RPNTLVWZ1RS:
38280 case X86::PT2RPNTLVWZ1RST1: {
38281 const DebugLoc &DL = MI.getDebugLoc();
38282 unsigned Opc;
38283#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38284 switch (MI.getOpcode()) {
38285 default:
38286 llvm_unreachable("Unexpected instruction!");
38287 case X86::PT2RPNTLVWZ0:
38288 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
38289 break;
38290 case X86::PT2RPNTLVWZ0T1:
38291 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
38292 break;
38293 case X86::PT2RPNTLVWZ1:
38294 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
38295 break;
38296 case X86::PT2RPNTLVWZ1T1:
38297 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
38298 break;
38299 case X86::PT2RPNTLVWZ0RS:
38300 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
38301 break;
38302 case X86::PT2RPNTLVWZ0RST1:
38303 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
38304 break;
38305 case X86::PT2RPNTLVWZ1RS:
38306 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
38307 break;
38308 case X86::PT2RPNTLVWZ1RST1:
38309 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
38310 break;
38311 }
38312#undef GET_EGPR_IF_ENABLED
38313 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38314 MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
38315
38316 MIB.add(MI.getOperand(1)); // base
38317 MIB.add(MI.getOperand(2)); // scale
38318 MIB.add(MI.getOperand(3)); // index
38319 MIB.add(MI.getOperand(4)); // displacement
38320 MIB.add(MI.getOperand(5)); // segment
38321 MI.eraseFromParent(); // The pseudo is gone now.
38322 return BB;
38323 }
38324 case X86::PTTRANSPOSED:
38325 case X86::PTCONJTFP16: {
38326 const DebugLoc &DL = MI.getDebugLoc();
38327 unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
38328 : X86::TCONJTFP16;
38329
38330 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38331 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38332 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38333
38334 MI.eraseFromParent(); // The pseudo is gone now.
38335 return BB;
38336 }
38337 case X86::PTCVTROWPS2BF16Hrri:
38338 case X86::PTCVTROWPS2BF16Lrri:
38339 case X86::PTCVTROWPS2PHHrri:
38340 case X86::PTCVTROWPS2PHLrri:
38341 case X86::PTCVTROWD2PSrri:
38342 case X86::PTILEMOVROWrri: {
38343 const DebugLoc &DL = MI.getDebugLoc();
38344 unsigned Opc;
38345 switch (MI.getOpcode()) {
38346 default:
38347 llvm_unreachable("Unexpected instruction!");
38348 case X86::PTCVTROWD2PSrri:
38349 Opc = X86::TCVTROWD2PSrri;
38350 break;
38351 case X86::PTCVTROWPS2BF16Hrri:
38352 Opc = X86::TCVTROWPS2BF16Hrri;
38353 break;
38354 case X86::PTCVTROWPS2PHHrri:
38355 Opc = X86::TCVTROWPS2PHHrri;
38356 break;
38357 case X86::PTCVTROWPS2BF16Lrri:
38358 Opc = X86::TCVTROWPS2BF16Lrri;
38359 break;
38360 case X86::PTCVTROWPS2PHLrri:
38361 Opc = X86::TCVTROWPS2PHLrri;
38362 break;
38363 case X86::PTILEMOVROWrri:
38364 Opc = X86::TILEMOVROWrri;
38365 break;
38366 }
38367 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38368 MIB.add(MI.getOperand(0));
38369 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38370 MIB.addImm(MI.getOperand(2).getImm());
38371
38372 MI.eraseFromParent(); // The pseudo is gone now.
38373 return BB;
38374 }
38375 case X86::PTCVTROWPS2BF16Hrre:
38376 case X86::PTCVTROWPS2BF16Lrre:
38377 case X86::PTCVTROWPS2PHHrre:
38378 case X86::PTCVTROWPS2PHLrre:
38379 case X86::PTCVTROWD2PSrre:
38380 case X86::PTILEMOVROWrre: {
38381 const DebugLoc &DL = MI.getDebugLoc();
38382 unsigned Opc;
38383 switch (MI.getOpcode()) {
38384 default:
38385 llvm_unreachable("Unexpected instruction!");
38386 case X86::PTCVTROWD2PSrre:
38387 Opc = X86::TCVTROWD2PSrre;
38388 break;
38389 case X86::PTCVTROWPS2BF16Hrre:
38390 Opc = X86::TCVTROWPS2BF16Hrre;
38391 break;
38392 case X86::PTCVTROWPS2BF16Lrre:
38393 Opc = X86::TCVTROWPS2BF16Lrre;
38394 break;
38395 case X86::PTCVTROWPS2PHHrre:
38396 Opc = X86::TCVTROWPS2PHHrre;
38397 break;
38398 case X86::PTCVTROWPS2PHLrre:
38399 Opc = X86::TCVTROWPS2PHLrre;
38400 break;
38401 case X86::PTILEMOVROWrre:
38402 Opc = X86::TILEMOVROWrre;
38403 break;
38404 }
38405 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38406 MIB.add(MI.getOperand(0));
38407 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38408 MIB.add(MI.getOperand(2));
38409
38410 MI.eraseFromParent(); // The pseudo is gone now.
38411 return BB;
38412 }
38413 }
38414}
38415
38416//===----------------------------------------------------------------------===//
38417// X86 Optimization Hooks
38418//===----------------------------------------------------------------------===//
38419
38420 bool
38421 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38422 const APInt &DemandedBits,
38423 const APInt &DemandedElts,
38424 TargetLoweringOpt &TLO) const {
38425 EVT VT = Op.getValueType();
38426 unsigned Opcode = Op.getOpcode();
38427 unsigned EltSize = VT.getScalarSizeInBits();
38428
38429 if (VT.isVector()) {
38430 // If the constant is only all signbits in the active bits, then we should
38431 // extend it to the entire constant to allow it to act as a boolean constant
38432 // vector.
38433 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38434 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38435 return false;
38436 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38437 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38438 continue;
38439 const APInt &Val = V.getConstantOperandAPInt(i);
38440 if (Val.getBitWidth() > Val.getNumSignBits() &&
38441 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38442 return true;
38443 }
38444 return false;
38445 };
38446 // For vectors - if we have a constant, then try to sign extend.
38447 // TODO: Handle AND cases.
38448 unsigned ActiveBits = DemandedBits.getActiveBits();
38449 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38450 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
38451 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38452 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38453 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38454 VT.getVectorNumElements());
38455 SDValue NewC =
38456 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38457 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38458 SDValue NewOp =
38459 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38460 return TLO.CombineTo(Op, NewOp);
38461 }
38462 return false;
38463 }
38464
38465 // Only optimize Ands to prevent shrinking a constant that could be
38466 // matched by movzx.
38467 if (Opcode != ISD::AND)
38468 return false;
38469
38470 // Make sure the RHS really is a constant.
38471 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38472 if (!C)
38473 return false;
38474
38475 const APInt &Mask = C->getAPIntValue();
38476
38477 // Clear all non-demanded bits initially.
38478 APInt ShrunkMask = Mask & DemandedBits;
38479
38480 // Find the width of the shrunk mask.
38481 unsigned Width = ShrunkMask.getActiveBits();
38482
38483 // If the mask is all 0s there's nothing to do here.
38484 if (Width == 0)
38485 return false;
38486
38487 // Find the next power of 2 width, rounding up to a byte.
38488 Width = llvm::bit_ceil(std::max(Width, 8U));
38489 // Truncate the width to size to handle illegal types.
38490 Width = std::min(Width, EltSize);
38491
38492 // Calculate a possible zero extend mask for this constant.
38493 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
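// Worked example: a mask of 0x3FF with only the low 8 bits demanded shrinks
// to Width = 8, giving ZeroExtendMask = 0xFF, which an AND can then match as
// a movzx.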
38494
38495 // If we aren't changing the mask, just return true to keep it and prevent
38496 // the caller from optimizing.
38497 if (ZeroExtendMask == Mask)
38498 return true;
38499
38500 // Make sure the new mask can be represented by a combination of mask bits
38501 // and non-demanded bits.
38502 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38503 return false;
38504
38505 // Replace the constant with the zero extend mask.
38506 SDLoc DL(Op);
38507 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38508 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38509 return TLO.CombineTo(Op, NewOp);
38510}
38511
38512 static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
38513 KnownBits &Known,
38514 const APInt &DemandedElts,
38515 const SelectionDAG &DAG, unsigned Depth) {
38516 KnownBits Known2;
38517 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38518 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38519 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
38520 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
38521 Known = KnownBits::abdu(Known, Known2).zext(16);
38522 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
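// Each absolute difference fits in 8 bits, so the three doublings below bound
// the 8-element sum at 11 bits within each 64-bit result lane.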
38523 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38524 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38525 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38526 Known = Known.zext(64);
38527}
38528
38529 static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,
38530 KnownBits &Known,
38531 const APInt &DemandedElts,
38532 const SelectionDAG &DAG,
38533 unsigned Depth) {
38534 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38535
38536 // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
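// The alternating splat masks below select the even (lo) and odd (hi) i16
// element of each pair so the two products can be modeled independently.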
38537 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38538 APInt DemandedLoElts =
38539 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38540 APInt DemandedHiElts =
38541 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38542 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38543 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38544 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38545 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38546 KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
38547 KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
38548 Known = KnownBits::add(Lo, Hi, /*NSW=*/false, /*NUW=*/false);
38549}
38550
38551 static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS,
38552 KnownBits &Known,
38553 const APInt &DemandedElts,
38554 const SelectionDAG &DAG,
38555 unsigned Depth) {
38556 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38557
38558 // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
38559 // pairs.
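// The LHS elements are treated as unsigned and the RHS elements as signed,
// hence the zext/sext mix in the KnownBits::mul calls below.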
38560 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38561 APInt DemandedLoElts =
38562 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38563 APInt DemandedHiElts =
38564 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38565 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38566 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38567 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38568 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38569 KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
38570 KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
38571 Known = KnownBits::sadd_sat(Lo, Hi);
38572}
38573
38574 static KnownBits computeKnownBitsForHorizontalOperation(
38575 const SDValue Op, const APInt &DemandedElts, unsigned Depth,
38576 const SelectionDAG &DAG,
38577 const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
38578 KnownBitsFunc) {
38579 APInt DemandedEltsLHS, DemandedEltsRHS;
38580 getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(),
38581 DemandedElts, DemandedEltsLHS,
38582 DemandedEltsRHS);
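// Each horizontal output element combines two adjacent elements of one
// source, so the helper below feeds the callback the known bits of a demanded
// element and of its neighbour (DemandedEltsOp << 1).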
38583
38584 const auto ComputeForSingleOpFunc =
38585 [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
38586 return KnownBitsFunc(
38587 DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
38588 DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
38589 };
38590
38591 if (DemandedEltsRHS.isZero())
38592 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS);
38593 if (DemandedEltsLHS.isZero())
38594 return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS);
38595
38596 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS)
38597 .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS));
38598}
38599
38600 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38601 KnownBits &Known,
38602 const APInt &DemandedElts,
38603 const SelectionDAG &DAG,
38604 unsigned Depth) const {
38605 unsigned BitWidth = Known.getBitWidth();
38606 unsigned NumElts = DemandedElts.getBitWidth();
38607 unsigned Opc = Op.getOpcode();
38608 EVT VT = Op.getValueType();
38609 assert((Opc >= ISD::BUILTIN_OP_END ||
38610 Opc == ISD::INTRINSIC_WO_CHAIN ||
38611 Opc == ISD::INTRINSIC_W_CHAIN ||
38612 Opc == ISD::INTRINSIC_VOID) &&
38613 "Should use MaskedValueIsZero if you don't know whether Op"
38614 " is a target node!");
38615
38616 Known.resetAll();
38617 switch (Opc) {
38618 default: break;
38619 case X86ISD::MUL_IMM: {
38620 KnownBits Known2;
38621 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38622 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38623 Known = KnownBits::mul(Known, Known2);
38624 break;
38625 }
38626 case X86ISD::BSF: {
38627 Known.Zero.setBitsFrom(llvm::bit_width(BitWidth));
38628
38629 KnownBits Known2;
38630 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38631 if (Known2.isNonZero()) {
38632 // If we have a known 1, its position is our upper bound.
38633 unsigned PossibleTZ = Known2.countMaxTrailingZeros();
38634 unsigned LowBits = llvm::bit_width(PossibleTZ);
38635 Known.Zero.setBitsFrom(LowBits);
38636 } else if (!Op.getOperand(0).isUndef()) {
38637 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38638 Known = Known.intersectWith(Known2);
38639 }
38640 break;
38641 }
38642 case X86ISD::BSR: {
38643 // TODO: Bound with input known bits?
38644 Known.Zero.setBitsFrom(llvm::bit_width(BitWidth));
38645
38646 if (!Op.getOperand(0).isUndef() &&
38647 !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38648 KnownBits Known2;
38649 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38650 Known = Known.intersectWith(Known2);
38651 }
38652 break;
38653 }
38654 case X86ISD::SETCC:
38655 Known.Zero.setBitsFrom(1);
38656 break;
38657 case X86ISD::MOVMSK: {
38658 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38659 Known.Zero.setBitsFrom(NumLoBits);
38660 break;
38661 }
38662 case X86ISD::PEXTRB:
38663 case X86ISD::PEXTRW: {
38664 SDValue Src = Op.getOperand(0);
38665 EVT SrcVT = Src.getValueType();
38666 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38667 Op.getConstantOperandVal(1));
38668 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38669 Known = Known.anyextOrTrunc(BitWidth);
38670 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38671 break;
38672 }
38673 case X86ISD::VSRAI:
38674 case X86ISD::VSHLI:
38675 case X86ISD::VSRLI: {
38676 unsigned ShAmt = Op.getConstantOperandVal(1);
38677 if (ShAmt >= VT.getScalarSizeInBits()) {
38678 // Out of range logical bit shifts are guaranteed to be zero.
38679 // Out of range arithmetic bit shifts splat the sign bit.
38680 if (Opc != X86ISD::VSRAI) {
38681 Known.setAllZero();
38682 break;
38683 }
38684
38685 ShAmt = VT.getScalarSizeInBits() - 1;
38686 }
38687
38688 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38689 if (Opc == X86ISD::VSHLI) {
38690 Known <<= ShAmt;
38691 // Low bits are known zero.
38692 Known.Zero.setLowBits(ShAmt);
38693 } else if (Opc == X86ISD::VSRLI) {
38694 Known >>= ShAmt;
38695 // High bits are known zero.
38696 Known.Zero.setHighBits(ShAmt);
38697 } else {
38698 Known.Zero.ashrInPlace(ShAmt);
38699 Known.One.ashrInPlace(ShAmt);
38700 }
38701 break;
38702 }
38703 case X86ISD::PACKUS: {
38704 // PACKUS is just a truncation if the upper half is zero.
38705 APInt DemandedLHS, DemandedRHS;
38706 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38707
38708 Known.One = APInt::getAllOnes(BitWidth * 2);
38709 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38710
38711 KnownBits Known2;
38712 if (!!DemandedLHS) {
38713 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38714 Known = Known.intersectWith(Known2);
38715 }
38716 if (!!DemandedRHS) {
38717 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38718 Known = Known.intersectWith(Known2);
38719 }
38720
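// If the upper BitWidth bits of any demanded source element are not known to
// be zero, the unsigned saturation is not a plain truncation, so give up.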
38721 if (Known.countMinLeadingZeros() < BitWidth)
38722 Known.resetAll();
38723 Known = Known.trunc(BitWidth);
38724 break;
38725 }
38726 case X86ISD::PSHUFB: {
38727 SDValue Src = Op.getOperand(0);
38728 SDValue Idx = Op.getOperand(1);
38729
38730 // If the index vector is never negative (MSB is zero), then all elements
38731 // come from the source vector. This is useful for cases where
38732 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
38733 // below will handle the more common constant shuffle mask case.
38734 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
38735 if (KnownIdx.isNonNegative())
38736 Known = DAG.computeKnownBits(Src, Depth + 1);
38737 break;
38738 }
38739 case X86ISD::VBROADCAST: {
38740 SDValue Src = Op.getOperand(0);
38741 if (!Src.getSimpleValueType().isVector()) {
38742 Known = DAG.computeKnownBits(Src, Depth + 1);
38743 return;
38744 }
38745 break;
38746 }
38747 case X86ISD::AND: {
38748 if (Op.getResNo() == 0) {
38749 KnownBits Known2;
38750 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38751 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38752 Known &= Known2;
38753 }
38754 break;
38755 }
38756 case X86ISD::ANDNP: {
38757 KnownBits Known2;
38758 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38759 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38760
38761 // ANDNP = (~X & Y);
38762 Known.One &= Known2.Zero;
38763 Known.Zero |= Known2.One;
38764 break;
38765 }
38766 case X86ISD::FOR: {
38767 KnownBits Known2;
38768 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38769 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38770
38771 Known |= Known2;
38772 break;
38773 }
38774 case X86ISD::PSADBW: {
38775 SDValue LHS = Op.getOperand(0);
38776 SDValue RHS = Op.getOperand(1);
38777 assert(VT.getScalarType() == MVT::i64 &&
38778 LHS.getValueType() == RHS.getValueType() &&
38779 LHS.getValueType().getScalarType() == MVT::i8 &&
38780 "Unexpected PSADBW types");
38781 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38782 break;
38783 }
38784 case X86ISD::PCMPGT:
38785 case X86ISD::PCMPEQ: {
38786 KnownBits KnownLhs =
38787 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38788 KnownBits KnownRhs =
38789 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38790 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38791 ? KnownBits::eq(KnownLhs, KnownRhs)
38792 : KnownBits::sgt(KnownLhs, KnownRhs);
38793 if (Res) {
38794 if (*Res)
38795 Known.setAllOnes();
38796 else
38797 Known.setAllZero();
38798 }
38799 break;
38800 }
38801 case X86ISD::VPMADDWD: {
38802 SDValue LHS = Op.getOperand(0);
38803 SDValue RHS = Op.getOperand(1);
38804 assert(VT.getVectorElementType() == MVT::i32 &&
38805 LHS.getValueType() == RHS.getValueType() &&
38806 LHS.getValueType().getVectorElementType() == MVT::i16 &&
38807 "Unexpected PMADDWD types");
38808 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38809 break;
38810 }
38811 case X86ISD::VPMADDUBSW: {
38812 SDValue LHS = Op.getOperand(0);
38813 SDValue RHS = Op.getOperand(1);
38814 assert(VT.getVectorElementType() == MVT::i16 &&
38815 LHS.getValueType() == RHS.getValueType() &&
38816 LHS.getValueType().getVectorElementType() == MVT::i8 &&
38817 "Unexpected PMADDUBSW types");
38818 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38819 break;
38820 }
38821 case X86ISD::PMULUDQ: {
38822 KnownBits Known2;
38823 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38824 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38825
38826 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38827 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38828 Known = KnownBits::mul(Known, Known2);
38829 break;
38830 }
38831 case X86ISD::CMOV: {
38832 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38833 // If we don't know any bits, early out.
38834 if (Known.isUnknown())
38835 break;
38836 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38837
38838 // Only known if known in both the LHS and RHS.
38839 Known = Known.intersectWith(Known2);
38840 break;
38841 }
38842 case X86ISD::BEXTR:
38843 case X86ISD::BEXTRI: {
38844 SDValue Op0 = Op.getOperand(0);
38845 SDValue Op1 = Op.getOperand(1);
38846
38847 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38848 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38849 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38850
38851 // If the length is 0, the result is 0.
38852 if (Length == 0) {
38853 Known.setAllZero();
38854 break;
38855 }
38856
38857 if ((Shift + Length) <= BitWidth) {
38858 Known = DAG.computeKnownBits(Op0, Depth + 1);
38859 Known = Known.extractBits(Length, Shift);
38860 Known = Known.zextOrTrunc(BitWidth);
38861 }
38862 }
38863 break;
38864 }
38865 case X86ISD::PDEP: {
38866 KnownBits Known2;
38867 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38868 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38869 // Zeros are retained from the mask operand. But not ones.
38870 Known.One.clearAllBits();
38871 // The result will have at least as many trailing zeros as the non-mask
38872 // operand since bits can only map to the same or higher bit position.
38873 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38874 break;
38875 }
38876 case X86ISD::PEXT: {
38877 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38878 // The result has as many leading zeros as the number of zeroes in the mask.
38879 unsigned Count = Known.Zero.popcount();
38880 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38881 Known.One.clearAllBits();
38882 break;
38883 }
38884 case X86ISD::VTRUNC:
38885 case X86ISD::VTRUNCS:
38886 case X86ISD::VTRUNCUS:
38887 case X86ISD::CVTSI2P:
38888 case X86ISD::CVTUI2P:
38889 case X86ISD::CVTP2SI:
38890 case X86ISD::CVTP2UI:
38891 case X86ISD::MCVTP2SI:
38892 case X86ISD::MCVTP2UI:
38893 case X86ISD::CVTTP2SI:
38894 case X86ISD::CVTTP2UI:
38895 case X86ISD::MCVTTP2SI:
38896 case X86ISD::MCVTTP2UI:
38897 case X86ISD::MCVTSI2P:
38898 case X86ISD::MCVTUI2P:
38899 case X86ISD::VFPROUND:
38900 case X86ISD::VMFPROUND:
38901 case X86ISD::CVTPS2PH:
38902 case X86ISD::MCVTPS2PH:
38903 case X86ISD::MCVTTP2SIS:
38904 case X86ISD::MCVTTP2UIS: {
38905 // Truncations/Conversions - upper elements are known zero.
38906 EVT SrcVT = Op.getOperand(0).getValueType();
38907 if (SrcVT.isVector()) {
38908 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38909 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38910 Known.setAllZero();
38911 }
38912 break;
38913 }
38914 case X86ISD::STRICT_CVTTP2SI:
38915 case X86ISD::STRICT_CVTTP2UI:
38916 case X86ISD::STRICT_CVTSI2P:
38917 case X86ISD::STRICT_CVTUI2P:
38918 case X86ISD::STRICT_VFPROUND:
38919 case X86ISD::STRICT_CVTPS2PH: {
38920 // Strict Conversions - upper elements are known zero.
38921 EVT SrcVT = Op.getOperand(1).getValueType();
38922 if (SrcVT.isVector()) {
38923 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38924 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38925 Known.setAllZero();
38926 }
38927 break;
38928 }
38929 case X86ISD::MOVQ2DQ: {
38930 // Move from MMX to XMM. Upper half of XMM should be 0.
38931 if (DemandedElts.countr_zero() >= (NumElts / 2))
38932 Known.setAllZero();
38933 break;
38934 }
38935 case X86ISD::VBROADCAST_LOAD: {
38936 APInt UndefElts;
38937 SmallVector<APInt, 16> EltBits;
38938 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38939 /*AllowWholeUndefs*/ false,
38940 /*AllowPartialUndefs*/ false)) {
38941 Known.Zero.setAllBits();
38942 Known.One.setAllBits();
38943 for (unsigned I = 0; I != NumElts; ++I) {
38944 if (!DemandedElts[I])
38945 continue;
38946 if (UndefElts[I]) {
38947 Known.resetAll();
38948 break;
38949 }
38950 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38951 Known = Known.intersectWith(Known2);
38952 }
38953 return;
38954 }
38955 break;
38956 }
38957 case X86ISD::HADD:
38958 case X86ISD::HSUB: {
38959 Known = computeKnownBitsForHorizontalOperation(
38960 Op, DemandedElts, Depth, DAG,
38961 [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
38962 return KnownBits::computeForAddSub(
38963 /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
38964 KnownLHS, KnownRHS);
38965 });
38966 break;
38967 }
38968 case ISD::INTRINSIC_WO_CHAIN: {
38969 switch (Op->getConstantOperandVal(0)) {
38970 case Intrinsic::x86_sse2_pmadd_wd:
38971 case Intrinsic::x86_avx2_pmadd_wd:
38972 case Intrinsic::x86_avx512_pmaddw_d_512: {
38973 SDValue LHS = Op.getOperand(1);
38974 SDValue RHS = Op.getOperand(2);
38975 assert(VT.getScalarType() == MVT::i32 &&
38976 LHS.getValueType() == RHS.getValueType() &&
38977 LHS.getValueType().getScalarType() == MVT::i16 &&
38978 "Unexpected PMADDWD types");
38979 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38980 break;
38981 }
38982 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
38983 case Intrinsic::x86_avx2_pmadd_ub_sw:
38984 case Intrinsic::x86_avx512_pmaddubs_w_512: {
38985 SDValue LHS = Op.getOperand(1);
38986 SDValue RHS = Op.getOperand(2);
38987 assert(VT.getScalarType() == MVT::i16 &&
38988 LHS.getValueType() == RHS.getValueType() &&
38989 LHS.getValueType().getScalarType() == MVT::i8 &&
38990 "Unexpected PMADDUBSW types");
38991 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38992 break;
38993 }
38994 case Intrinsic::x86_sse2_psad_bw:
38995 case Intrinsic::x86_avx2_psad_bw:
38996 case Intrinsic::x86_avx512_psad_bw_512: {
38997 SDValue LHS = Op.getOperand(1);
38998 SDValue RHS = Op.getOperand(2);
38999 assert(VT.getScalarType() == MVT::i64 &&
39000 LHS.getValueType() == RHS.getValueType() &&
39001 LHS.getValueType().getScalarType() == MVT::i8 &&
39002 "Unexpected PSADBW types");
39003 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
39004 break;
39005 }
39006 }
39007 break;
39008 }
39009 case X86ISD::VPMADD52L:
39010 case X86ISD::VPMADD52H: {
39011 assert(Op.getValueType().isVector() &&
39012 Op.getValueType().getScalarType() == MVT::i64 &&
39013 "Unexpected VPMADD52 type");
39014 KnownBits K0 =
39015 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
39016 KnownBits K1 =
39017 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
39018 KnownBits KAcc =
39019 DAG.computeKnownBits(Op.getOperand(2), DemandedElts, Depth + 1);
39020 K0 = K0.trunc(52);
39021 K1 = K1.trunc(52);
39022 KnownBits KnownMul = (Op.getOpcode() == X86ISD::VPMADD52L)
39023 ? KnownBits::mul(K0, K1)
39024 : KnownBits::mulhu(K0, K1);
39025 KnownMul = KnownMul.zext(64);
39026 Known = KnownBits::add(KAcc, KnownMul);
39027 return;
39028 }
39029 }
39030
39031 // Handle target shuffles.
39032 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39033 if (isTargetShuffle(Opc)) {
39034 SmallVector<int, 64> Mask;
39035 SmallVector<SDValue, 2> Ops;
39036 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39037 unsigned NumOps = Ops.size();
39038 unsigned NumElts = VT.getVectorNumElements();
39039 if (Mask.size() == NumElts) {
39040 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39041 Known.Zero.setAllBits(); Known.One.setAllBits();
39042 for (unsigned i = 0; i != NumElts; ++i) {
39043 if (!DemandedElts[i])
39044 continue;
39045 int M = Mask[i];
39046 if (M == SM_SentinelUndef) {
39047 // For UNDEF elements, we don't know anything about the common state
39048 // of the shuffle result.
39049 Known.resetAll();
39050 break;
39051 }
39052 if (M == SM_SentinelZero) {
39053 Known.One.clearAllBits();
39054 continue;
39055 }
39056 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39057 "Shuffle index out of range");
39058
39059 unsigned OpIdx = (unsigned)M / NumElts;
39060 unsigned EltIdx = (unsigned)M % NumElts;
39061 if (Ops[OpIdx].getValueType() != VT) {
39062 // TODO - handle target shuffle ops with different value types.
39063 Known.resetAll();
39064 break;
39065 }
39066 DemandedOps[OpIdx].setBit(EltIdx);
39067 }
39068 // Known bits are the values that are shared by every demanded element.
39069 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
39070 if (!DemandedOps[i])
39071 continue;
39072 KnownBits Known2 =
39073 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
39074 Known = Known.intersectWith(Known2);
39075 }
39076 }
39077 }
39078 }
39079}
39080
39081unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
39082 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
39083 unsigned Depth) const {
39084 EVT VT = Op.getValueType();
39085 unsigned VTBits = VT.getScalarSizeInBits();
39086 unsigned Opcode = Op.getOpcode();
39087 switch (Opcode) {
39088 case X86ISD::SETCC_CARRY:
39089 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
39090 return VTBits;
39091
39092 case X86ISD::VTRUNC: {
39093 SDValue Src = Op.getOperand(0);
39094 MVT SrcVT = Src.getSimpleValueType();
39095 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
39096 assert(VTBits < NumSrcBits && "Illegal truncation input type");
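// Truncation drops the top (NumSrcBits - VTBits) bits, so only the sign bits
// that extend below the dropped portion survive into the narrower result.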
39097 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
39098 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
39099 if (Tmp > (NumSrcBits - VTBits))
39100 return Tmp - (NumSrcBits - VTBits);
39101 return 1;
39102 }
39103
39104 case X86ISD::PACKSS: {
39105 // PACKSS is just a truncation if the sign bits extend to the packed size.
39106 APInt DemandedLHS, DemandedRHS;
39107 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
39108 DemandedRHS);
39109
39110 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
39111 // patterns often used to compact vXi64 allsignbit patterns.
39112 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
39113 SDValue BC = peekThroughBitcasts(V);
39114 if (BC.getOpcode() == X86ISD::PACKSS &&
39115 BC.getScalarValueSizeInBits() == 16 &&
39116 V.getScalarValueSizeInBits() == 32) {
39117 SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
39118 SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
39119 if (BC0.getScalarValueSizeInBits() == 64 &&
39120 BC1.getScalarValueSizeInBits() == 64 &&
39121 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
39122 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
39123 return 32;
39124 }
39125 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
39126 };
39127
39128 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
39129 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
39130 if (!!DemandedLHS)
39131 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
39132 if (!!DemandedRHS)
39133 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
39134 unsigned Tmp = std::min(Tmp0, Tmp1);
39135 if (Tmp > (SrcBits - VTBits))
39136 return Tmp - (SrcBits - VTBits);
39137 return 1;
39138 }
39139
39140 case X86ISD::VBROADCAST: {
39141 SDValue Src = Op.getOperand(0);
39142 if (!Src.getSimpleValueType().isVector())
39143 return DAG.ComputeNumSignBits(Src, Depth + 1);
39144 break;
39145 }
39146
39147 case X86ISD::VSHLI: {
39148 SDValue Src = Op.getOperand(0);
39149 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
39150 if (ShiftVal.uge(VTBits))
39151 return VTBits; // Shifted all bits out --> zero.
39152 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39153 if (ShiftVal.uge(Tmp))
39154 return 1; // Shifted all sign bits out --> unknown.
39155 return Tmp - ShiftVal.getZExtValue();
39156 }
39157
39158 case X86ISD::VSRAI: {
39159 SDValue Src = Op.getOperand(0);
39160 APInt ShiftVal = Op.getConstantOperandAPInt(1);
39161 if (ShiftVal.uge(VTBits - 1))
39162 return VTBits; // Sign splat.
39163 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39164 ShiftVal += Tmp;
39165 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
39166 }
39167
39168 case X86ISD::FSETCC:
39169 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
39170 if (VT == MVT::f32 || VT == MVT::f64 ||
39171 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
39172 return VTBits;
39173 break;
39174
39175 case X86ISD::PCMPGT:
39176 case X86ISD::PCMPEQ:
39177 case X86ISD::CMPP:
39178 case X86ISD::VPCOM:
39179 case X86ISD::VPCOMU:
39180 // Vector compares return zero/all-bits result values.
39181 return VTBits;
39182
39183 case X86ISD::ANDNP: {
39184 unsigned Tmp0 =
39185 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
39186 if (Tmp0 == 1) return 1; // Early out.
39187 unsigned Tmp1 =
39188 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
39189 return std::min(Tmp0, Tmp1);
39190 }
39191
39192 case X86ISD::CMOV: {
39193 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
39194 if (Tmp0 == 1) return 1; // Early out.
39195 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
39196 return std::min(Tmp0, Tmp1);
39197 }
39198 }
39199
39200 // Handle target shuffles.
39201 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39202 if (isTargetShuffle(Opcode)) {
39203 SmallVector<int, 64> Mask;
39204 SmallVector<SDValue, 2> Ops;
39205 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39206 unsigned NumOps = Ops.size();
39207 unsigned NumElts = VT.getVectorNumElements();
39208 if (Mask.size() == NumElts) {
39209 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39210 for (unsigned i = 0; i != NumElts; ++i) {
39211 if (!DemandedElts[i])
39212 continue;
39213 int M = Mask[i];
39214 if (M == SM_SentinelUndef) {
39215 // For UNDEF elements, we don't know anything about the common state
39216 // of the shuffle result.
39217 return 1;
39218 } else if (M == SM_SentinelZero) {
39219 // Zero = all sign bits.
39220 continue;
39221 }
39222 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39223 "Shuffle index out of range");
39224
39225 unsigned OpIdx = (unsigned)M / NumElts;
39226 unsigned EltIdx = (unsigned)M % NumElts;
39227 if (Ops[OpIdx].getValueType() != VT) {
39228 // TODO - handle target shuffle ops with different value types.
39229 return 1;
39230 }
39231 DemandedOps[OpIdx].setBit(EltIdx);
39232 }
39233 unsigned Tmp0 = VTBits;
39234 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
39235 if (!DemandedOps[i])
39236 continue;
39237 unsigned Tmp1 =
39238 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
39239 Tmp0 = std::min(Tmp0, Tmp1);
39240 }
39241 return Tmp0;
39242 }
39243 }
39244 }
39245
39246 // Fallback case.
39247 return 1;
39248}
39249
39250SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
39251 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
39252 return N->getOperand(0);
39253 return N;
39254}
39255
39256// Helper to look for a normal load that can be narrowed into a vzload with the
39257// specified VT and memory VT. Returns SDValue() on failure.
39258static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
39259 SelectionDAG &DAG) {
39260 // Can't if the load is volatile or atomic.
39261 if (!LN->isSimple())
39262 return SDValue();
39263
39264 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39265 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39266 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
39267 LN->getPointerInfo(), LN->getBaseAlign(),
39268 LN->getMemOperand()->getFlags());
39269}
39270
39271// Attempt to match a combined shuffle mask against supported unary shuffle
39272// instructions.
39273// TODO: Investigate sharing more of this with shuffle lowering.
39274static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39275 bool AllowFloatDomain, bool AllowIntDomain,
39276 SDValue V1, const SelectionDAG &DAG,
39277 const X86Subtarget &Subtarget, unsigned &Shuffle,
39278 MVT &SrcVT, MVT &DstVT) {
39279 unsigned NumMaskElts = Mask.size();
39280 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
39281
39282 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
39283 if (Mask[0] == 0 &&
39284 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
39285 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
39286 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39287 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
39288 Shuffle = X86ISD::VZEXT_MOVL;
39289 if (MaskEltSize == 16)
39290 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39291 else
39292 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39293 return true;
39294 }
39295 }
39296
39297 // Match against an ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
39298 if (AllowIntDomain &&
39299 ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
39300 (MaskVT.is256BitVector() && Subtarget.hasInt256()) ||
39301 (MaskVT.is512BitVector() && Subtarget.useAVX512Regs()))) {
39302 unsigned MaxScale = 64 / MaskEltSize;
39303 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
39304 DAG.ComputeNumSignBits(V1) == MaskEltSize;
39305 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
39306 // Skip 512-bit VPMOV?XBW on non-AVX512BW targets.
39307 if (Scale == 2 && MaskVT == MVT::v64i8 && !Subtarget.useBWIRegs())
39308 continue;
39309 bool MatchAny = true;
39310 bool MatchZero = true;
39311 bool MatchSign = UseSign;
39312 unsigned NumDstElts = NumMaskElts / Scale;
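// Destination element i must come from source element i (mask slot i*Scale);
// the remaining Scale-1 slots decide whether this is an any-, zero- or
// sign-extension.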
39313 for (unsigned i = 0;
39314 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
39315 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
39316 MatchAny = MatchSign = MatchZero = false;
39317 break;
39318 }
39319 unsigned Pos = (i * Scale) + 1;
39320 unsigned Len = Scale - 1;
39321 MatchAny &= isUndefInRange(Mask, Pos, Len);
39322 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
39323 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
39324 }
39325 if (MatchAny || MatchSign || MatchZero) {
39326 assert((MatchSign || MatchZero) &&
39327 "Failed to match sext/zext but matched aext?");
39328 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
39329 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
39330 : MVT::getIntegerVT(MaskEltSize);
39331 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
39332
39333 Shuffle = unsigned(
39334 MatchAny ? ISD::ANY_EXTEND
39335 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
39336 if (SrcVT.getVectorNumElements() != NumDstElts)
39337 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
39338
39339 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
39340 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
39341 return true;
39342 }
39343 }
39344 }
39345
39346 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bit elements (MOVSS).
39347 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
39348 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
39349 isUndefOrEqual(Mask[0], 0) &&
39350 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
39351 Shuffle = X86ISD::VZEXT_MOVL;
39352 if (MaskEltSize == 16)
39353 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39354 else
39355 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39356 return true;
39357 }
39358
39359 // Check if we have SSE3, which lets us use MOVDDUP etc. These
39360 // instructions are no slower than UNPCKLPD but have the option to
39361 // fold the input operand, even from an unaligned memory load.
39362 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39363 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39364 Shuffle = X86ISD::MOVDDUP;
39365 SrcVT = DstVT = MVT::v2f64;
39366 return true;
39367 }
39368 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39369 Shuffle = X86ISD::MOVSLDUP;
39370 SrcVT = DstVT = MVT::v4f32;
39371 return true;
39372 }
39373 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39374 Shuffle = X86ISD::MOVSHDUP;
39375 SrcVT = DstVT = MVT::v4f32;
39376 return true;
39377 }
39378 }
39379
39380 if (MaskVT.is256BitVector() && AllowFloatDomain) {
39381 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39382 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39383 Shuffle = X86ISD::MOVDDUP;
39384 SrcVT = DstVT = MVT::v4f64;
39385 return true;
39386 }
39387 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39388 V1)) {
39389 Shuffle = X86ISD::MOVSLDUP;
39390 SrcVT = DstVT = MVT::v8f32;
39391 return true;
39392 }
39393 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39394 V1)) {
39395 Shuffle = X86ISD::MOVSHDUP;
39396 SrcVT = DstVT = MVT::v8f32;
39397 return true;
39398 }
39399 }
39400
39401 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39402 assert(Subtarget.hasAVX512() &&
39403 "AVX512 required for 512-bit vector shuffles");
39404 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39405 V1)) {
39406 Shuffle = X86ISD::MOVDDUP;
39407 SrcVT = DstVT = MVT::v8f64;
39408 return true;
39409 }
39410 if (isTargetShuffleEquivalent(
39411 MaskVT, Mask,
39412 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39413 Shuffle = X86ISD::MOVSLDUP;
39414 SrcVT = DstVT = MVT::v16f32;
39415 return true;
39416 }
39417 if (isTargetShuffleEquivalent(
39418 MaskVT, Mask,
39419 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39420 Shuffle = X86ISD::MOVSHDUP;
39421 SrcVT = DstVT = MVT::v16f32;
39422 return true;
39423 }
39424 }
39425
39426 return false;
39427}
39428
39429// Attempt to match a combined shuffle mask against supported unary immediate
39430// permute instructions.
39431// TODO: Investigate sharing more of this with shuffle lowering.
39432static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39433 const APInt &Zeroable,
39434 bool AllowFloatDomain, bool AllowIntDomain,
39435 const SelectionDAG &DAG,
39436 const X86Subtarget &Subtarget,
39437 unsigned &Shuffle, MVT &ShuffleVT,
39438 unsigned &PermuteImm) {
39439 unsigned NumMaskElts = Mask.size();
39440 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39441 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39442 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39443 bool ContainsZeros = isAnyZero(Mask);
39444
39445 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39446 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39447 // Check for lane crossing permutes.
39448 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39449 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39450 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39451 Shuffle = X86ISD::VPERMI;
39452 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39453 PermuteImm = getV4X86ShuffleImm(Mask);
39454 return true;
39455 }
39456 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39457 SmallVector<int, 4> RepeatedMask;
39458 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39459 Shuffle = X86ISD::VPERMI;
39460 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39461 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39462 return true;
39463 }
39464 }
39465 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39466 // VPERMILPD can permute with a non-repeating shuffle.
39467 Shuffle = X86ISD::VPERMILPI;
39468 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39469 PermuteImm = 0;
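// Bit i of the VPERMILPD immediate selects the lower or upper f64 element
// within the 128-bit lane holding destination element i.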
39470 for (int i = 0, e = Mask.size(); i != e; ++i) {
39471 int M = Mask[i];
39472 if (M == SM_SentinelUndef)
39473 continue;
39474 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39475 PermuteImm |= (M & 1) << i;
39476 }
39477 return true;
39478 }
39479 }
39480
39481 // We are checking for a shuffle match or a shift match. Loop twice so we
39482 // can control which we try to match first, depending on target preference.
39483 for (unsigned Order = 0; Order < 2; ++Order) {
39484 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39485 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39486 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39487 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39488 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39489 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39490 SmallVector<int, 4> RepeatedMask;
39491 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39492 // Narrow the repeated mask to create 32-bit element permutes.
39493 SmallVector<int, 4> WordMask = RepeatedMask;
39494 if (MaskScalarSizeInBits == 64)
39495 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39496
39497 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39498 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39499 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39500 PermuteImm = getV4X86ShuffleImm(WordMask);
39501 return true;
39502 }
39503 }
39504
39505 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39506 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39507 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39508 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39509 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39510 SmallVector<int, 4> RepeatedMask;
39511 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39512 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39513 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39514
39515 // PSHUFLW: permute lower 4 elements only.
39516 if (isUndefOrInRange(LoMask, 0, 4) &&
39517 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39518 Shuffle = X86ISD::PSHUFLW;
39519 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39520 PermuteImm = getV4X86ShuffleImm(LoMask);
39521 return true;
39522 }
39523
39524 // PSHUFHW: permute upper 4 elements only.
39525 if (isUndefOrInRange(HiMask, 4, 8) &&
39526 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39527 // Offset the HiMask so that we can create the shuffle immediate.
39528 int OffsetHiMask[4];
39529 for (int i = 0; i != 4; ++i)
39530 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39531
39532 Shuffle = X86ISD::PSHUFHW;
39533 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39534 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39535 return true;
39536 }
39537 }
39538 }
39539 } else {
39540 // Attempt to match against bit rotates.
39541 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39542 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39543 Subtarget.hasAVX512())) {
39544 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39545 Subtarget, Mask);
39546 if (0 < RotateAmt) {
39547 Shuffle = X86ISD::VROTLI;
39548 PermuteImm = (unsigned)RotateAmt;
39549 return true;
39550 }
39551 }
39552 }
39553 // Attempt to match against byte/bit shifts.
39554 if (AllowIntDomain &&
39555 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39556 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39557 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39558 int ShiftAmt =
39559 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39560 Zeroable, Subtarget);
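// 512-bit byte/word shifts require AVX512BW; 32/64-bit element shifts only
// need AVX512F.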
39561 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39562 32 <= ShuffleVT.getScalarSizeInBits())) {
39563 // Byte shifts can be slower, so only match them on the second attempt.
39564 if (Order == 0 &&
39565 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39566 continue;
39567
39568 PermuteImm = (unsigned)ShiftAmt;
39569 return true;
39570 }
39571
39572 }
39573 }
39574
39575 return false;
39576}
39577
39578// Attempt to match a combined unary shuffle mask against supported binary
39579// shuffle instructions.
39580// TODO: Investigate sharing more of this with shuffle lowering.
39581static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39582 bool AllowFloatDomain, bool AllowIntDomain,
39583 SDValue &V1, SDValue &V2, const SDLoc &DL,
39584 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39585 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39586 bool IsUnary) {
39587 unsigned NumMaskElts = Mask.size();
39588 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39589 unsigned SizeInBits = MaskVT.getSizeInBits();
39590
39591 if (MaskVT.is128BitVector()) {
39592 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39593 AllowFloatDomain) {
39594 V2 = V1;
39595 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39596 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39597 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39598 return true;
39599 }
39600 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39601 AllowFloatDomain) {
39602 V2 = V1;
39603 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39604 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39605 return true;
39606 }
39607 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39608 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39609 std::swap(V1, V2);
39610 Shuffle = X86ISD::MOVSD;
39611 SrcVT = DstVT = MVT::v2f64;
39612 return true;
39613 }
39614 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39615 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39616 Shuffle = X86ISD::MOVSS;
39617 SrcVT = DstVT = MVT::v4f32;
39618 return true;
39619 }
39620 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39621 DAG) &&
39622 Subtarget.hasFP16()) {
39623 Shuffle = X86ISD::MOVSH;
39624 SrcVT = DstVT = MVT::v8f16;
39625 return true;
39626 }
39627 }
39628
39629 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
39630 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39631 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39632 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39633 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39634 Subtarget)) {
39635 DstVT = MaskVT;
39636 return true;
39637 }
39638 }
39639 // TODO: Can we handle this inside matchShuffleWithPACK?
39640 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
39641 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
39642 V1.getScalarValueSizeInBits() == 64 &&
39643 V2.getScalarValueSizeInBits() == 64) {
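// With 64-bit source elements, the {0,2,4,6} v4i32 mask selects the low half
// of each element, i.e. a truncation, which PACK nodes can perform when the
// discarded high bits are known zero (PACKUS) or sign extensions (PACKSS).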
39644 // Use (SSE41) PACKUSDW if the leading zero bits extend down to the lowest 16 bits.
39645 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
39646 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
39647 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
39648 SrcVT = MVT::v4i32;
39649 DstVT = MVT::v8i16;
39650 Shuffle = X86ISD::PACKUS;
39651 return true;
39652 }
39653 // Use PACKUSWB if the leading zero bits extend down to the lowest 8 bits.
39654 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
39655 SrcVT = MVT::v8i16;
39656 DstVT = MVT::v16i8;
39657 Shuffle = X86ISD::PACKUS;
39658 return true;
39659 }
39660 // Use PACKSSDW if the sign bits extend down to the lowest 16 bits.
39661 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
39662 SrcVT = MVT::v4i32;
39663 DstVT = MVT::v8i16;
39664 Shuffle = X86ISD::PACKSS;
39665 return true;
39666 }
39667 }
39668
39669 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39670 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39671 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39672 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39673 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39674 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
39675 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
39676 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39677 Subtarget)) {
39678 SrcVT = DstVT = MaskVT;
39679 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39680 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39681 return true;
39682 }
39683 }
39684
39685 // Attempt to match against an OR if we're performing a blend shuffle and the
39686 // non-blended source element is zero in each case.
39687 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
39688 if (SizeInBits == V1.getValueSizeInBits() &&
39689 SizeInBits == V2.getValueSizeInBits() &&
39690 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39691 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39692 bool IsBlend = true;
39693 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39694 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39695 unsigned Scale1 = NumV1Elts / NumMaskElts;
39696 unsigned Scale2 = NumV2Elts / NumMaskElts;
39697 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39698 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39699 for (unsigned i = 0; i != NumMaskElts; ++i) {
39700 int M = Mask[i];
39701 if (M == SM_SentinelUndef)
39702 continue;
39703 if (M == SM_SentinelZero) {
39704 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39705 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39706 continue;
39707 }
39708 if (M == (int)i) {
39709 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39710 continue;
39711 }
39712 if (M == (int)(i + NumMaskElts)) {
39713 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39714 continue;
39715 }
39716 IsBlend = false;
39717 break;
39718 }
39719 if (IsBlend) {
39720 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39721 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39722 Shuffle = ISD::OR;
39723 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39724 return true;
39725 }
39726 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39727 // FIXME: handle mismatched sizes?
39728 // TODO: investigate if `ISD::OR` handling in
39729 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39730 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39731 unsigned NumElts = V.getValueType().getVectorNumElements();
39732 KnownBits Known(NumElts);
39733 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39734 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39735 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39736 if (PeepholeKnown.isZero())
39737 Known.Zero.setBit(EltIdx);
39738 if (PeepholeKnown.isAllOnes())
39739 Known.One.setBit(EltIdx);
39740 }
39741 return Known;
39742 };
39743
39744 KnownBits V1Known = computeKnownBitsElementWise(V1);
39745 KnownBits V2Known = computeKnownBitsElementWise(V2);
39746
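// OR acts as a blend for element i when the unselected source element is known
// all-zeros, or when the selected element is known all-ones (OR with anything
// still yields all-ones).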
39747 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39748 int M = Mask[i];
39749 if (M == SM_SentinelUndef)
39750 continue;
39751 if (M == SM_SentinelZero) {
39752 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39753 continue;
39754 }
39755 if (M == (int)i) {
39756 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39757 continue;
39758 }
39759 if (M == (int)(i + NumMaskElts)) {
39760 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39761 continue;
39762 }
39763 llvm_unreachable("will not get here.");
39764 }
39765 if (IsBlend) {
39766 Shuffle = ISD::OR;
39767 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39768 return true;
39769 }
39770 }
39771 }
39772 }
39773
39774 return false;
39775}
39776
39777static bool matchBinaryPermuteShuffle(
39778 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39779 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39780 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39781 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39782 unsigned NumMaskElts = Mask.size();
39783 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39784
39785 // Attempt to match against VALIGND/VALIGNQ rotate.
39786 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39787 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39788 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39789 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39790 MVT AlignVT = MVT::getVectorVT(MVT::getIntegerVT(EltSizeInBits),
39791 MaskVT.getSizeInBits() / EltSizeInBits);
39792 if (!isAnyZero(Mask)) {
39793 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39794 if (0 < Rotation) {
39795 Shuffle = X86ISD::VALIGN;
39796 ShuffleVT = AlignVT;
39797 PermuteImm = Rotation;
39798 return true;
39799 }
39800 }
39801 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
39802 unsigned ZeroLo = Zeroable.countr_one();
39803 unsigned ZeroHi = Zeroable.countl_one();
39804 assert((ZeroLo + ZeroHi) < NumMaskElts && "Zeroable shuffle detected");
39805 if (ZeroLo) {
39806 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39807 std::iota(ShiftMask.begin() + ZeroLo, ShiftMask.end(), 0);
39808 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39809 V2 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39810 Shuffle = X86ISD::VALIGN;
39811 ShuffleVT = AlignVT;
39812 PermuteImm = NumMaskElts - ZeroLo;
39813 return true;
39814 }
39815 }
39816 if (ZeroHi) {
39817 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39818 std::iota(ShiftMask.begin(), ShiftMask.begin() + NumMaskElts - ZeroHi,
39819 ZeroHi);
39820 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39821 V2 = V1;
39822 V1 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39823 Shuffle = X86ISD::VALIGN;
39824 ShuffleVT = AlignVT;
39825 PermuteImm = ZeroHi;
39826 return true;
39827 }
39828 }
39829 }
39830
39831 // Attempt to match against PALIGNR byte rotate.
39832 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39833 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39834 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39835 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39836 if (0 < ByteRotation) {
39837 Shuffle = X86ISD::PALIGNR;
39838 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39839 PermuteImm = ByteRotation;
39840 return true;
39841 }
39842 }
39843
39844 // Attempt to combine to X86ISD::BLENDI.
39845 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39846 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39847 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39848 uint64_t BlendMask = 0;
39849 bool ForceV1Zero = false, ForceV2Zero = false;
39850 SmallVector<int, 8> TargetMask(Mask);
39851 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39852 ForceV2Zero, BlendMask)) {
39853 if (MaskVT == MVT::v16i16) {
39854 // We can only use v16i16 PBLENDW if the lanes are repeated.
39855 SmallVector<int, 8> RepeatedMask;
39856 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39857 RepeatedMask)) {
39858 assert(RepeatedMask.size() == 8 &&
39859 "Repeated mask size doesn't match!");
39860 PermuteImm = 0;
39861 for (int i = 0; i < 8; ++i)
39862 if (RepeatedMask[i] >= 8)
39863 PermuteImm |= 1 << i;
39864 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39865 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39866 Shuffle = X86ISD::BLENDI;
39867 ShuffleVT = MaskVT;
39868 return true;
39869 }
39870 } else {
39871 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39872 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39873 PermuteImm = (unsigned)BlendMask;
39874 Shuffle = X86ISD::BLENDI;
39875 ShuffleVT = MaskVT;
39876 return true;
39877 }
39878 }
39879 }
39880
39881 // Attempt to combine to INSERTPS, but only if it has elements that need to
39882 // be set to zero.
39883 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39884 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39885 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39886 Shuffle = X86ISD::INSERTPS;
39887 ShuffleVT = MVT::v4f32;
39888 return true;
39889 }
39890
39891 // Attempt to combine to SHUFPD.
39892 if (AllowFloatDomain && EltSizeInBits == 64 &&
39893 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39894 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39895 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39896 bool ForceV1Zero = false, ForceV2Zero = false;
39897 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39898 PermuteImm, Mask, Zeroable)) {
39899 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39900 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39901 Shuffle = X86ISD::SHUFP;
39902 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39903 return true;
39904 }
39905 }
39906
39907 // Attempt to combine to SHUFPS.
39908 if (AllowFloatDomain && EltSizeInBits == 32 &&
39909 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39910 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39911 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39912 SmallVector<int, 4> RepeatedMask;
39913 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39914 // Match each half of the repeated mask to determine if it's just
39915 // referencing one of the vectors, is zeroable, or is entirely undef.
39916 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39917 int M0 = RepeatedMask[Offset];
39918 int M1 = RepeatedMask[Offset + 1];
39919
39920 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39921 return DAG.getUNDEF(MaskVT);
39922 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39923 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39924 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39925 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39926 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39927 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39928 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39929 return V1;
39930 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39931 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39932 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39933 return V2;
39934 }
39935
39936 return SDValue();
39937 };
39938
39939 int ShufMask[4] = {-1, -1, -1, -1};
39940 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39941 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39942
39943 if (Lo && Hi) {
39944 V1 = Lo;
39945 V2 = Hi;
39946 Shuffle = X86ISD::SHUFP;
39947 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39948 PermuteImm = getV4X86ShuffleImm(ShufMask);
39949 return true;
39950 }
39951 }
39952 }
39953
39954 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39955 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39956 MaskVT.is128BitVector() &&
39957 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39958 Shuffle = X86ISD::INSERTPS;
39959 ShuffleVT = MVT::v4f32;
39960 return true;
39961 }
39962
39963 return false;
39964}
39965
39966static SDValue combineX86ShuffleChainWithExtract(
39967 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
39968 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39969 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39970 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39971 const X86Subtarget &Subtarget);
39972
39973/// Combine an arbitrary chain of shuffles into a single instruction if
39974/// possible.
39975///
39976/// This is the leaf of the recursive combine below. When we have found some
39977/// chain of single-use x86 shuffle instructions and accumulated the combined
39978/// shuffle mask represented by them, this will try to pattern match that mask
39979/// into either a single instruction if there is a special purpose instruction
39980/// for this operation, or into a PSHUFB instruction which is a fully general
39981/// instruction but should only be used to replace chains over a certain depth.
39982static SDValue combineX86ShuffleChain(
39983 ArrayRef<SDValue> Inputs, unsigned RootOpc, MVT RootVT,
39984 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39985 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39986 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39987 const X86Subtarget &Subtarget) {
39988 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39989 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39990 "Unexpected number of shuffle inputs!");
39991 unsigned RootSizeInBits = RootVT.getSizeInBits();
39992 unsigned NumRootElts = RootVT.getVectorNumElements();
39993
39994 // Canonicalize shuffle input op to the requested type.
39995 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39996 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39997 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39998 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
39999 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
40000 return DAG.getBitcast(VT, Op);
40001 };
40002
40003 // Find the inputs that enter the chain. Note that multiple uses are OK
40004 // here, we're not going to remove the operands we find.
40005 bool UnaryShuffle = (Inputs.size() == 1);
40006 SDValue V1 = peekThroughBitcasts(Inputs[0]);
40007 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
40008 : peekThroughBitcasts(Inputs[1]));
40009
40010 MVT VT1 = V1.getSimpleValueType();
40011 MVT VT2 = V2.getSimpleValueType();
40012 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
40013 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
40014
40015 SDValue Res;
40016
40017 unsigned NumBaseMaskElts = BaseMask.size();
40018 if (NumBaseMaskElts == 1) {
40019 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
40020 return CanonicalizeShuffleInput(RootVT, V1);
40021 }
40022
40023 bool OptForSize = DAG.shouldOptForSize();
40024 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
40025 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
40026 (RootVT.isFloatingPoint() && Depth >= 1) ||
40027 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
40028
40029 // If we are shuffling a splat (and not introducing zeros) then we can just
40030 // use it directly. This works for smaller elements as well, since they
40031 // already repeat across each mask element.
40032 if (UnaryShuffle && !isAnyZero(BaseMask) &&
40033 V1.getValueSizeInBits() >= RootSizeInBits &&
40034 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
40035 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
40036 return CanonicalizeShuffleInput(RootVT, V1);
40037 }
40038
40039 SmallVector<int, 64> Mask(BaseMask);
40040
40041 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
40042 // etc. can be simplified.
40043 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
40044 SmallVector<int> ScaledMask, IdentityMask;
40045 unsigned NumElts = VT1.getVectorNumElements();
40046 if (Mask.size() <= NumElts &&
40047 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
40048 for (unsigned i = 0; i != NumElts; ++i)
40049 IdentityMask.push_back(i);
40050 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
40051 V2))
40052 return CanonicalizeShuffleInput(RootVT, V1);
40053 }
40054 }
40055
40056 // Handle 128/256-bit lane shuffles of 512-bit vectors.
40057 if (RootVT.is512BitVector() &&
40058 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
40059 // If the upper subvectors are zeroable, then an extract+insert is more
40060 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
40061 // to zero the upper subvectors.
40062 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
40063 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40064 return SDValue(); // Nothing to do!
40065 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
40066 "Unexpected lane shuffle");
40067 Res = CanonicalizeShuffleInput(RootVT, V1);
40068 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
40069 bool UseZero = isAnyZero(Mask);
40070 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
40071 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
40072 }
40073
40074 // Narrow shuffle mask to v4x128.
40075 SmallVector<int, 4> ScaledMask;
40076 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
40077 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
40078
40079 // Try to lower to vshuf64x2/vshuf32x4.
40080 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
40081 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
40082 SelectionDAG &DAG) {
40083 int PermMask[4] = {-1, -1, -1, -1};
40084 // Ensure elements came from the same Op.
40085 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
40086 for (int i = 0; i < 4; ++i) {
40087 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
40088 if (ScaledMask[i] < 0)
40089 continue;
40090
40091 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
40092 unsigned OpIndex = i / 2;
40093 if (Ops[OpIndex].isUndef())
40094 Ops[OpIndex] = Op;
40095 else if (Ops[OpIndex] != Op)
40096 return SDValue();
40097
40098 PermMask[i] = ScaledMask[i] % 4;
40099 }
40100
40101 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
40102 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
40103 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
40104 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
40105 };
40106
40107 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
40108 // doesn't work because our mask is for 128 bits and we don't have an MVT
40109 // to match that.
40110 bool PreferPERMQ = UnaryShuffle && !isFreeToSplitVector(V1, DAG) &&
40111 isUndefOrInRange(ScaledMask[0], 0, 2) &&
40112 isUndefOrInRange(ScaledMask[1], 0, 2) &&
40113 isUndefOrInRange(ScaledMask[2], 2, 4) &&
40114 isUndefOrInRange(ScaledMask[3], 2, 4) &&
40115 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
40116 ScaledMask[0] == (ScaledMask[2] % 2)) &&
40117 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
40118 ScaledMask[1] == (ScaledMask[3] % 2));
40119
40120 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
40121 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40122 return SDValue(); // Nothing to do!
40123 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
40124 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
40125 return DAG.getBitcast(RootVT, V);
40126 }
40127 }
40128
40129 // Handle 128-bit lane shuffles of 256-bit vectors.
40130 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
40131 // If the upper half is zeroable, then an extract+insert is more optimal
40132 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
40133 // zero the upper half.
40134 if (isUndefOrZero(Mask[1])) {
40135 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40136 return SDValue(); // Nothing to do!
40137 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
40138 Res = CanonicalizeShuffleInput(RootVT, V1);
40139 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
40140 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
40141 256);
40142 }
40143
40144 // If we're inserting the low subvector, an insert-subvector 'concat'
40145 // pattern is quicker than VPERM2X128.
40146 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
40147 !Subtarget.hasAVX2()) {
40148 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40149 return SDValue(); // Nothing to do!
40150 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
40151 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
40152 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
40153 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
40154 }
40155
40156 // Don't lower to VPERM2X128 here if we have AVX2+, prefer to use
40157 // VPERMQ/VPERMPD for unary shuffles unless we need to use the zeroing
40158 // feature.
40159 // Prefer blends for sequential shuffles unless we are optimizing for size.
40160 if (UnaryShuffle &&
40161 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
40162 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
40163 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40164 return SDValue(); // Nothing to do!
40165 unsigned PermMask = 0;
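// Each nibble of the VPERM2X128 immediate selects a 128-bit source lane;
// setting bit 3 (0x8) of a nibble zeroes that destination half instead.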
40166 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
40167 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
40168 return DAG.getNode(
40169 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
40170 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
40171 }
40172
40173 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40174 return SDValue(); // Nothing to do!
40175
40176 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
40177 if (!UnaryShuffle && !IsMaskedShuffle) {
40178 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
40179 "Unexpected shuffle sentinel value");
40180 // Prefer blends to X86ISD::VPERM2X128.
40181 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
40182 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40183 return SDValue(); // Nothing to do!
40184 unsigned PermMask = 0;
40185 PermMask |= ((Mask[0] & 3) << 0);
40186 PermMask |= ((Mask[1] & 3) << 4);
40187 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
40188 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
40189 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
40190 CanonicalizeShuffleInput(RootVT, LHS),
40191 CanonicalizeShuffleInput(RootVT, RHS),
40192 DAG.getTargetConstant(PermMask, DL, MVT::i8));
40193 }
40194 }
40195 }
40196
40197 // For masks that have been widened to 128-bit elements or more,
40198 // narrow back down to 64-bit elements.
40199 if (BaseMaskEltSizeInBits > 64) {
40200 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
40201 int MaskScale = BaseMaskEltSizeInBits / 64;
40202 SmallVector<int, 64> ScaledMask;
40203 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40204 Mask = std::move(ScaledMask);
40205 }
40206
40207 // For masked shuffles, we're trying to match the root width for better
40208 // writemask folding, so attempt to scale the mask.
40209 // TODO - variable shuffles might need this to be widened again.
40210 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
40211 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
40212 int MaskScale = NumRootElts / Mask.size();
40213 SmallVector<int, 64> ScaledMask;
40214 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40215 Mask = std::move(ScaledMask);
40216 }
40217
40218 unsigned NumMaskElts = Mask.size();
40219 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
40220 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40221
40222 // Determine the effective mask value type.
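// Only 32/64-bit elements have a floating-point domain; narrower mask elements
// must stay in the integer domain.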
40223 FloatDomain &= (32 <= MaskEltSizeInBits);
40224 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
40225 : MVT::getIntegerVT(MaskEltSizeInBits);
40226 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
40227
40228 // Only allow legal mask types.
40229 if (!TLI.isTypeLegal(MaskVT))
40230 return SDValue();
40231
40232 // Attempt to match the mask against known shuffle patterns.
40233 MVT ShuffleSrcVT, ShuffleVT;
40234 unsigned Shuffle, PermuteImm;
40235
40236 // Which shuffle domains are permitted?
40237 // Permit domain crossing at higher combine depths.
40238 // TODO: Should we indicate which domain is preferred if both are allowed?
40239 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
40240 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
40241 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
40242
40243 // Determine zeroable mask elements.
40244 APInt KnownUndef, KnownZero;
40245 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
40246 APInt Zeroable = KnownUndef | KnownZero;
40247
40248 if (UnaryShuffle) {
40249 // Attempt to match against broadcast-from-vector.
40250 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
40251 if ((Subtarget.hasAVX2() ||
40252 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
40253 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
40254 if (isUndefOrEqual(Mask, 0)) {
40255 if (V1.getValueType() == MaskVT &&
40256 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40257 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
40258 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40259 return SDValue(); // Nothing to do!
40260 Res = V1.getOperand(0);
40261 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40262 return DAG.getBitcast(RootVT, Res);
40263 }
40264 if (Subtarget.hasAVX2()) {
40265 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40266 return SDValue(); // Nothing to do!
40267 Res = CanonicalizeShuffleInput(MaskVT, V1);
40268 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40269 return DAG.getBitcast(RootVT, Res);
40270 }
40271 }
40272 }
40273
40274 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
40275 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
40276 (!IsMaskedShuffle ||
40277 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40278 if (Depth == 0 && RootOpc == Shuffle)
40279 return SDValue(); // Nothing to do!
40280 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40281 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
40282 return DAG.getBitcast(RootVT, Res);
40283 }
40284
40285 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40286 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
40287 PermuteImm) &&
40288 (!IsMaskedShuffle ||
40289 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40290 if (Depth == 0 && RootOpc == Shuffle)
40291 return SDValue(); // Nothing to do!
40292 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
40293 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
40294 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40295 return DAG.getBitcast(RootVT, Res);
40296 }
40297 }
40298
40299 // Attempt to combine to INSERTPS, but only if the inserted element has come
40300 // from a scalar.
40301 // TODO: Handle other insertions here as well?
40302 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
40303 Subtarget.hasSSE41() &&
40304 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
40305 if (MaskEltSizeInBits == 32) {
40306 SDValue SrcV1 = V1, SrcV2 = V2;
40307 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
40308 DAG) &&
40309 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40310 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40311 return SDValue(); // Nothing to do!
40312 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40313 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
40314 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
40315 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40316 return DAG.getBitcast(RootVT, Res);
40317 }
40318 }
40319 if (MaskEltSizeInBits == 64 &&
40320 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
40321 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40322 V2.getScalarValueSizeInBits() <= 32) {
40323 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40324 return SDValue(); // Nothing to do!
40325 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
40326 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40327 CanonicalizeShuffleInput(MVT::v4f32, V1),
40328 CanonicalizeShuffleInput(MVT::v4f32, V2),
40329 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40330 return DAG.getBitcast(RootVT, Res);
40331 }
40332 }
40333
40334 SDValue NewV1 = V1; // Save operands in case early exit happens.
40335 SDValue NewV2 = V2;
40336 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
40337 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
40338 ShuffleVT, UnaryShuffle) &&
40339 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40340 if (Depth == 0 && RootOpc == Shuffle)
40341 return SDValue(); // Nothing to do!
40342 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
40343 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
40344 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
40345 return DAG.getBitcast(RootVT, Res);
40346 }
40347
40348 NewV1 = V1; // Save operands in case early exit happens.
40349 NewV2 = V2;
40350 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40351 AllowIntDomain, NewV1, NewV2, DL, DAG,
40352 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
40353 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40354 if (Depth == 0 && RootOpc == Shuffle)
40355 return SDValue(); // Nothing to do!
40356 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
40357 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
40358 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
40359 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40360 return DAG.getBitcast(RootVT, Res);
40361 }
40362
40363 // Typically from here on, we need an integer version of MaskVT.
40364 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
40365 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
40366
40367 // Annoyingly, SSE4A instructions don't map into the above match helpers.
40368 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
40369 uint64_t BitLen, BitIdx;
40370 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
40371 Zeroable)) {
40372 if (Depth == 0 && RootOpc == X86ISD::EXTRQI)
40373 return SDValue(); // Nothing to do!
40374 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40375 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
40376 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40377 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40378 return DAG.getBitcast(RootVT, Res);
40379 }
40380
40381 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
40382 if (Depth == 0 && RootOpc == X86ISD::INSERTQI)
40383 return SDValue(); // Nothing to do!
40384 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40385 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
40386 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
40387 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40388 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40389 return DAG.getBitcast(RootVT, Res);
40390 }
40391 }
40392
40393 // Match shuffle against TRUNCATE patterns.
40394 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
40395 // Match against a VTRUNC instruction, accounting for src/dst sizes.
40396 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
40397 Subtarget)) {
40398 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
40399 ShuffleSrcVT.getVectorNumElements();
40400 unsigned Opc =
40401 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40402 if (Depth == 0 && RootOpc == Opc)
40403 return SDValue(); // Nothing to do!
40404 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40405 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40406 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40407 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40408 return DAG.getBitcast(RootVT, Res);
40409 }
40410
40411 // Do we need a more general binary truncation pattern?
40412 if (RootSizeInBits < 512 &&
40413 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40414 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40415 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40416 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40417 // Bail if this was already a truncation or PACK node.
40418 // We sometimes fail to match PACK if we demand known undef elements.
40419 if (Depth == 0 &&
40420 (RootOpc == ISD::TRUNCATE || RootOpc == X86ISD::PACKSS ||
40421 RootOpc == X86ISD::PACKUS))
40422 return SDValue(); // Nothing to do!
40423 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40424 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40425 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40426 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40427 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40428 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40429 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40430 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40431 return DAG.getBitcast(RootVT, Res);
40432 }
40433 }
40434
40435 // Don't try to re-form single instruction chains under any circumstances now
40436 // that we've done encoding canonicalization for them.
40437 if (Depth < 1)
40438 return SDValue();
40439
40440 int NumVariableMasks = llvm::count_if(SrcNodes, [](const SDNode *N) {
40441 return isTargetShuffleVariableMask(N->getOpcode());
40442 });
40443 bool HasSlowVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
40444 return (N->getOpcode() == X86ISD::VPERMV3 ||
40445 N->getOpcode() == X86ISD::VPERMV);
40446 });
40447
40448 // Depth threshold above which we can efficiently use variable mask shuffles.
40449 int VariableCrossLaneShuffleDepth =
40450 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40451 int VariablePerLaneShuffleDepth =
40452 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40453 AllowVariableCrossLaneMask &=
40454 (Depth >= VariableCrossLaneShuffleDepth) || NumVariableMasks;
40455 AllowVariablePerLaneMask &=
40456 (Depth >= VariablePerLaneShuffleDepth) || NumVariableMasks;
40457 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40458 // higher depth before combining them.
40459 int BWIVPERMV3ShuffleDepth =
40460 VariableCrossLaneShuffleDepth + 2 - NumVariableMasks;
40461 bool AllowBWIVPERMV3 =
40462 (Depth >= BWIVPERMV3ShuffleDepth || HasSlowVariableMask);
40463
40464 // If root was a VPERMV/VPERMV3 node, always allow a variable shuffle.
40465 if ((UnaryShuffle && RootOpc == X86ISD::VPERMV) || RootOpc == X86ISD::VPERMV3)
40466 AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
40467
40468 bool MaskContainsZeros = isAnyZero(Mask);
40469
40470 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40471 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40472 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40473 if (Subtarget.hasAVX2() &&
40474 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40475 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40476 Res = CanonicalizeShuffleInput(MaskVT, V1);
40477 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40478 return DAG.getBitcast(RootVT, Res);
40479 }
40480 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40481 if ((Subtarget.hasAVX512() &&
40482 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40483 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40484 (Subtarget.hasBWI() &&
40485 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40486 (Subtarget.hasVBMI() &&
40487 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40488 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40489 V2 = DAG.getUNDEF(MaskVT);
40490 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40491 return DAG.getBitcast(RootVT, Res);
40492 }
40493 }
40494
40495 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40496 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40497 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40498 ((Subtarget.hasAVX512() &&
40499 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40500 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40501 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40502 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40503 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40504 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40505 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40506 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40507 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40508 for (unsigned i = 0; i != NumMaskElts; ++i)
40509 if (Mask[i] == SM_SentinelZero)
40510 Mask[i] = NumMaskElts + i;
40511 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40512 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40513 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40514 return DAG.getBitcast(RootVT, Res);
40515 }
40516
40517 // If that failed and either input is extracted then try to combine as a
40518 // shuffle with the larger type.
40519 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40520 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40521 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40522 IsMaskedShuffle, DAG, DL, Subtarget))
40523 return WideShuffle;
40524
40525 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40526 // (non-VLX will pad to 512-bit shuffles).
40527 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40528 ((Subtarget.hasAVX512() &&
40529 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40530 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40531 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40532 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40533 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40534 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40535 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40536 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40537 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40538 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40539 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40540 return DAG.getBitcast(RootVT, Res);
40541 }
40542 return SDValue();
40543 }
40544
40545 // See if we can combine a single input shuffle with zeros to a bit-mask,
40546 // which is much simpler than any shuffle.
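// Illustrative example (not from the original source): an identity-with-zeros
// mask such as {0, SM_SentinelZero, 2, SM_SentinelZero} keeps every non-zero
// element in place, so the code below can replace the shuffle with an AND
// against the constant {-1, 0, -1, 0} (FAND for floating-point mask types).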
40547 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40548 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40549 TLI.isTypeLegal(MaskVT)) {
40550 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40551 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40552 APInt UndefElts(NumMaskElts, 0);
40553 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40554 for (unsigned i = 0; i != NumMaskElts; ++i) {
40555 int M = Mask[i];
40556 if (M == SM_SentinelUndef) {
40557 UndefElts.setBit(i);
40558 continue;
40559 }
40560 if (M == SM_SentinelZero)
40561 continue;
40562 EltBits[i] = AllOnes;
40563 }
40564 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40565 Res = CanonicalizeShuffleInput(MaskVT, V1);
40566 unsigned AndOpcode =
40567 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40568 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40569 return DAG.getBitcast(RootVT, Res);
40570 }
40571
40572 // If we have a single input shuffle with different shuffle patterns in
40573 // the 128-bit lanes, use a variable-mask VPERMILPS.
40574 // TODO: Combine other mask types at higher depths.
40575 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40576 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40577 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40578 SmallVector<SDValue, 16> VPermIdx;
40579 for (int M : Mask) {
40580 SDValue Idx =
40581 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40582 VPermIdx.push_back(Idx);
40583 }
40584 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40585 Res = CanonicalizeShuffleInput(MaskVT, V1);
40586 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40587 return DAG.getBitcast(RootVT, Res);
40588 }
40589
40590 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40591 // to VPERMIL2PD/VPERMIL2PS.
40592 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40593 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40594 MaskVT == MVT::v8f32)) {
40595 // VPERMIL2 Operation.
40596 // Bits[3] - Match Bit.
40597 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40598 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
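// Illustrative example (not from the original source): for a v4f32 shuffle
// mask {0, 5, SM_SentinelZero, 3} the loop below builds VPerm2Idx = {0, 5, 8, 3}
// with M2ZImm = 2 - selector values 0-3 pick from V1's lane, 4-7 pick from
// V2's lane, and 8 sets the match bit so that element is zeroed.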
40599 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40600 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40601 SmallVector<int, 8> VPerm2Idx;
40602 unsigned M2ZImm = 0;
40603 for (int M : Mask) {
40604 if (M == SM_SentinelUndef) {
40605 VPerm2Idx.push_back(-1);
40606 continue;
40607 }
40608 if (M == SM_SentinelZero) {
40609 M2ZImm = 2;
40610 VPerm2Idx.push_back(8);
40611 continue;
40612 }
40613 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40614 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40615 VPerm2Idx.push_back(Index);
40616 }
40617 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40618 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40619 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40620 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40621 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40622 return DAG.getBitcast(RootVT, Res);
40623 }
40624
40625 // If we have 3 or more shuffle instructions or a chain involving a variable
40626 // mask, we can replace them with a single PSHUFB instruction profitably.
40627 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40628 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40629 // more aggressive.
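// Illustrative example (not from the original source): for a v4i32 root with
// mask {2, 3, 0, 1}, Ratio = 16 / 4 = 4 and the loop below expands each dword
// index into its 4 bytes, giving the PSHUFB byte mask
// {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7}; SM_SentinelZero entries become
// 0x80, which PSHUFB treats as "write zero".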
40630 if (UnaryShuffle && AllowVariablePerLaneMask &&
40631 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40632 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40633 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40634 SmallVector<SDValue, 16> PSHUFBMask;
40635 int NumBytes = RootVT.getSizeInBits() / 8;
40636 int Ratio = NumBytes / NumMaskElts;
40637 for (int i = 0; i < NumBytes; ++i) {
40638 int M = Mask[i / Ratio];
40639 if (M == SM_SentinelUndef) {
40640 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40641 continue;
40642 }
40643 if (M == SM_SentinelZero) {
40644 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40645 continue;
40646 }
40647 M = Ratio * M + i % Ratio;
40648 assert((M / 16) == (i / 16) && "Lane crossing detected");
40649 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40650 }
40651 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40652 Res = CanonicalizeShuffleInput(ByteVT, V1);
40653 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40654 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40655 return DAG.getBitcast(RootVT, Res);
40656 }
40657
40658 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40659 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40660 // slower than PSHUFB on targets that support both.
40661 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40662 Subtarget.hasXOP()) {
40663 // VPPERM Mask Operation
40664 // Bits[4:0] - Byte Index (0 - 31)
40665 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
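// Illustrative example (not from the original source): selector bytes 0-15
// read from V1 and 16-31 read from V2, so a v16i8 mask entry of 20 becomes
// selector byte 20 (V2 byte 4), while SM_SentinelZero entries use 0x80
// (operation field 4) to force a zero byte.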
40666 SmallVector<SDValue, 16> VPPERMMask;
40667 int NumBytes = 16;
40668 int Ratio = NumBytes / NumMaskElts;
40669 for (int i = 0; i < NumBytes; ++i) {
40670 int M = Mask[i / Ratio];
40671 if (M == SM_SentinelUndef) {
40672 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40673 continue;
40674 }
40675 if (M == SM_SentinelZero) {
40676 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40677 continue;
40678 }
40679 M = Ratio * M + i % Ratio;
40680 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40681 }
40682 MVT ByteVT = MVT::v16i8;
40683 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40684 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40685 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40686 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40687 return DAG.getBitcast(RootVT, Res);
40688 }
40689
40690 // If that failed and either input is extracted then try to combine as a
40691 // shuffle with the larger type.
40692 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40693 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40694 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
40695 DAG, DL, Subtarget))
40696 return WideShuffle;
40697
40698 // If we have a dual input shuffle then lower to VPERMV3,
40699 // (non-VLX will pad to 512-bit shuffles)
40700 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40701 ((Subtarget.hasAVX512() &&
40702 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40703 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40704 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40705 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40706 MaskVT == MVT::v16i32)) ||
40707 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40708 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40709 MaskVT == MVT::v32i16)) ||
40710 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40711 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40712 MaskVT == MVT::v64i8)))) {
40713 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40714 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40715 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40716 return DAG.getBitcast(RootVT, Res);
40717 }
40718
40719 // Failed to find any combines.
40720 return SDValue();
40721}
40722
40723// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40724// instruction if possible.
40725//
40726// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40727// type size to attempt to combine:
40728// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40729// -->
40730// extract_subvector(shuffle(x,y,m2),0)
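// Illustrative example (not from the original source): a v4i32 shuffle of two
// extract_subvector(v8i32, 4) inputs with mask {0, 5, 1, 4} can instead be
// performed as a shuffle of the wide v8i32 sources with mask
// {4, 13, 5, 12, -1, -1, -1, -1}, followed by extracting the low 128 bits.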
40731 static SDValue combineX86ShuffleChainWithExtract(
40732 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
40733 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
40734 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
40735 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
40736 const X86Subtarget &Subtarget) {
40737 unsigned NumMaskElts = BaseMask.size();
40738 unsigned NumInputs = Inputs.size();
40739 if (NumInputs == 0)
40740 return SDValue();
40741
40742 unsigned RootSizeInBits = RootVT.getSizeInBits();
40743 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40744 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40745
40746 // Peek through subvectors to find widest legal vector.
40747 // TODO: Handle ISD::TRUNCATE
40748 unsigned WideSizeInBits = RootSizeInBits;
40749 for (SDValue Input : Inputs) {
40750 Input = peekThroughBitcasts(Input);
40751 while (1) {
40752 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40753 Input = peekThroughBitcasts(Input.getOperand(0));
40754 continue;
40755 }
40756 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40757 Input.getOperand(0).isUndef() &&
40758 isNullConstant(Input.getOperand(2))) {
40759 Input = peekThroughBitcasts(Input.getOperand(1));
40760 continue;
40761 }
40762 break;
40763 }
40764 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40765 WideSizeInBits < Input.getValueSizeInBits())
40766 WideSizeInBits = Input.getValueSizeInBits();
40767 }
40768
40769 // Bail if we fail to find a source larger than the existing root.
40770 if (WideSizeInBits <= RootSizeInBits ||
40771 (WideSizeInBits % RootSizeInBits) != 0)
40772 return SDValue();
40773
40774 // Create new mask for larger type.
40775 SmallVector<int, 64> WideMask;
40776 growShuffleMask(BaseMask, WideMask, RootSizeInBits, WideSizeInBits);
40777
40778 // Attempt to peek through inputs and adjust mask when we extract from an
40779 // upper subvector.
40780 int AdjustedMasks = 0;
40781 SmallVector<SDValue, 4> WideInputs(Inputs);
40782 for (unsigned I = 0; I != NumInputs; ++I) {
40783 SDValue &Input = WideInputs[I];
40784 Input = peekThroughBitcasts(Input);
40785 while (1) {
40786 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40787 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40788 uint64_t Idx = Input.getConstantOperandVal(1);
40789 if (Idx != 0) {
40790 ++AdjustedMasks;
40791 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40792 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40793
40794 int lo = I * WideMask.size();
40795 int hi = (I + 1) * WideMask.size();
40796 for (int &M : WideMask)
40797 if (lo <= M && M < hi)
40798 M += Idx;
40799 }
40800 Input = peekThroughBitcasts(Input.getOperand(0));
40801 continue;
40802 }
40803 // TODO: Handle insertions into upper subvectors.
40804 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40805 Input.getOperand(0).isUndef() &&
40806 isNullConstant(Input.getOperand(2))) {
40807 Input = peekThroughBitcasts(Input.getOperand(1));
40808 continue;
40809 }
40810 break;
40811 }
40812 }
40813
40814 // Remove unused/repeated shuffle source ops.
40815 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40816 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40817
40818 // Bail if we're always extracting from the lowest subvectors
40819 // (combineX86ShuffleChain should match this for the current width), or if
40820 // the shuffle still references too many inputs.
40821 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40822 return SDValue();
40823
40824 // Minor canonicalization of the accumulated shuffle mask to make it easier
40825 // to match below. All this does is detect masks with sequential pairs of
40826 // elements, and shrink them to the half-width mask. It does this in a loop
40827 // so it will reduce the size of the mask to the minimal width mask which
40828 // performs an equivalent shuffle.
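// Illustrative example (not from the original source): the wide mask
// {0, 1, 2, 3, 8, 9, 10, 11} consists of sequential pairs, so one widening
// pass shrinks it to {0, 1, 4, 5} and a second pass shrinks that to {0, 2}.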
40829 while (WideMask.size() > 1) {
40830 SmallVector<int, 64> WidenedMask;
40831 if (!canWidenShuffleElements(WideMask, WidenedMask))
40832 break;
40833 WideMask = std::move(WidenedMask);
40834 }
40835
40836 // Canonicalization of binary shuffle masks to improve pattern matching by
40837 // commuting the inputs.
40838 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40839 ShuffleVectorSDNode::commuteMask(WideMask);
40840 std::swap(WideInputs[0], WideInputs[1]);
40841 }
40842
40843 // Increase depth for every upper subvector we've peeked through.
40844 Depth += AdjustedMasks;
40845
40846 // Attempt to combine wider chain.
40847 // TODO: Can we use a better Root?
40848 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40849 WideInputs.back().getValueSizeInBits()
40850 ? WideInputs.front()
40851 : WideInputs.back();
40852 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40853 "WideRootSize mismatch");
40854
40855 if (SDValue WideShuffle = combineX86ShuffleChain(
40856 WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
40857 Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40858 IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
40859 WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
40860 return DAG.getBitcast(RootVT, WideShuffle);
40861 }
40862
40863 return SDValue();
40864}
40865
40866// Canonicalize the combined shuffle mask chain with horizontal ops.
40867// NOTE: This may update the Ops and Mask.
40868 static SDValue canonicalizeShuffleMaskWithHorizOp(
40869 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40870 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40871 const X86Subtarget &Subtarget) {
40872 if (Mask.empty() || Ops.empty())
40873 return SDValue();
40874
40875 SmallVector<SDValue> BC;
40876 for (SDValue Op : Ops)
40877 BC.push_back(peekThroughBitcasts(Op));
40878
40879 // All ops must be the same horizop + type.
40880 SDValue BC0 = BC[0];
40881 EVT VT0 = BC0.getValueType();
40882 unsigned Opcode0 = BC0.getOpcode();
40883 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40884 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40885 }))
40886 return SDValue();
40887
40888 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40889 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40890 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40891 if (!isHoriz && !isPack)
40892 return SDValue();
40893
40894 // Do all ops have a single use?
40895 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40896 return Op.hasOneUse() &&
40897 peekThroughBitcasts(Op).hasOneUse();
40898 });
40899
40900 int NumElts = VT0.getVectorNumElements();
40901 int NumLanes = VT0.getSizeInBits() / 128;
40902 int NumEltsPerLane = NumElts / NumLanes;
40903 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40904 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40905 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40906
40907 if (NumEltsPerLane >= 4 &&
40908 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40909 SmallVector<int> LaneMask, ScaledMask;
40910 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40911 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40912 // See if we can remove the shuffle by resorting the HOP chain so that
40913 // the HOP args are pre-shuffled.
40914 // TODO: Generalize to any sized/depth chain.
40915 // TODO: Add support for PACKSS/PACKUS.
40916 if (isHoriz) {
40917 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40918 auto GetHOpSrc = [&](int M) {
40919 if (M == SM_SentinelUndef)
40920 return DAG.getUNDEF(VT0);
40921 if (M == SM_SentinelZero)
40922 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40923 SDValue Src0 = BC[M / 4];
40924 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40925 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40926 return Src1.getOperand(M % 2);
40927 return SDValue();
40928 };
40929 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40930 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40931 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40932 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40933 if (M0 && M1 && M2 && M3) {
40934 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40935 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40936 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40937 }
40938 }
40939 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40940 if (Ops.size() >= 2) {
40941 SDValue LHS, RHS;
40942 auto GetHOpSrc = [&](int M, int &OutM) {
40943 // TODO: Support SM_SentinelZero
40944 if (M < 0)
40945 return M == SM_SentinelUndef;
40946 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40947 if (!LHS || LHS == Src) {
40948 LHS = Src;
40949 OutM = (M % 2);
40950 return true;
40951 }
40952 if (!RHS || RHS == Src) {
40953 RHS = Src;
40954 OutM = (M % 2) + 2;
40955 return true;
40956 }
40957 return false;
40958 };
40959 int PostMask[4] = {-1, -1, -1, -1};
40960 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40961 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40962 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40963 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40964 LHS = DAG.getBitcast(SrcVT, LHS);
40965 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40966 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40967 // Use SHUFPS for the permute so this will work on SSE2 targets,
40968 // shuffle combining and domain handling will simplify this later on.
40969 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40970 Res = DAG.getBitcast(ShuffleVT, Res);
40971 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40972 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40973 }
40974 }
40975 }
40976 }
40977
40978 if (2 < Ops.size())
40979 return SDValue();
40980
40981 SDValue BC1 = BC[BC.size() - 1];
40982 if (Mask.size() == VT0.getVectorNumElements()) {
40983 // Canonicalize binary shuffles of horizontal ops that use the
40984 // same sources to a unary shuffle.
40985 // TODO: Try to perform this fold even if the shuffle remains.
40986 if (Ops.size() == 2) {
40987 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40988 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40989 };
40990 // Commute if all BC0's ops are contained in BC1.
40991 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40992 ContainsOps(BC1, BC0.getOperand(1))) {
40993 ShuffleVectorSDNode::commuteMask(Mask);
40994 std::swap(Ops[0], Ops[1]);
40995 std::swap(BC0, BC1);
40996 }
40997
40998 // If BC1 can be represented by BC0, then convert to unary shuffle.
40999 if (ContainsOps(BC0, BC1.getOperand(0)) &&
41000 ContainsOps(BC0, BC1.getOperand(1))) {
41001 for (int &M : Mask) {
41002 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
41003 continue;
41004 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
41005 M -= NumElts + (SubLane * NumHalfEltsPerLane);
41006 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
41007 M += NumHalfEltsPerLane;
41008 }
41009 }
41010 }
41011
41012 // Canonicalize unary horizontal ops to only refer to lower halves.
41013 for (int i = 0; i != NumElts; ++i) {
41014 int &M = Mask[i];
41015 if (isUndefOrZero(M))
41016 continue;
41017 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
41018 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
41019 M -= NumHalfEltsPerLane;
41020 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
41021 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
41022 M -= NumHalfEltsPerLane;
41023 }
41024 }
41025
41026 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
41027 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
41028 // represents the LHS/RHS inputs for the lower/upper halves.
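// Illustrative example (not from the original source): with BC0 = HOP(X, Y)
// and BC1 = HOP(Z, W), a scaled WideMask128 of {1, 2} selects Lo = Y and
// Hi = Z, so the whole shuffle folds to a single HOP(Y, Z).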
41029 SmallVector<int, 16> TargetMask128, WideMask128;
41030 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
41031 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
41032 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
41033 bool SingleOp = (Ops.size() == 1);
41034 if (isPack || OneUseOps ||
41035 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
41036 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
41037 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
41038 Lo = Lo.getOperand(WideMask128[0] & 1);
41039 Hi = Hi.getOperand(WideMask128[1] & 1);
41040 if (SingleOp) {
41041 SDValue Undef = DAG.getUNDEF(SrcVT);
41042 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
41043 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
41044 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
41045 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
41046 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
41047 }
41048 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
41049 }
41050 }
41051
41052 // If we are post-shuffling a 256-bit hop and not requiring the upper
41053 // elements, then try to narrow to a 128-bit hop directly.
41054 SmallVector<int, 16> WideMask64;
41055 if (Ops.size() == 1 && NumLanes == 2 &&
41056 scaleShuffleElements(Mask, 4, WideMask64) &&
41057 isUndefInRange(WideMask64, 2, 2)) {
41058 int M0 = WideMask64[0];
41059 int M1 = WideMask64[1];
41060 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
41061 MVT HalfVT = VT0.getSimpleVT().getHalfNumVectorElementsVT();
41062 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41063 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41064 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
41065 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
41066 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
41067 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
41068 }
41069 }
41070
41071 return SDValue();
41072}
41073
41074// Attempt to constant fold all of the constant source ops.
41075// Returns true if the entire shuffle is folded to a constant.
41076// TODO: Extend this to merge multiple constant Ops and update the mask.
41077 static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef<SDValue> Ops,
41078 ArrayRef<int> Mask,
41079 ArrayRef<const SDNode *> SrcNodes,
41080 SelectionDAG &DAG, const SDLoc &DL,
41081 const X86Subtarget &Subtarget) {
41082 unsigned SizeInBits = VT.getSizeInBits();
41083 unsigned NumMaskElts = Mask.size();
41084 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
41085 unsigned NumOps = Ops.size();
41086
41087 // Extract constant bits from each source op.
41088 SmallVector<APInt, 16> UndefEltsOps(NumOps);
41089 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
41090 for (unsigned I = 0; I != NumOps; ++I)
41091 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
41092 RawBitsOps[I],
41093 /*AllowWholeUndefs*/ true,
41094 /*AllowPartialUndefs*/ true))
41095 return SDValue();
41096
41097 // If we're optimizing for size, only fold if at least one of the constants is
41098 // only used once or the combined shuffle has included a variable mask
41099 // shuffle; this is to avoid constant pool bloat.
41100 bool IsOptimizingSize = DAG.shouldOptForSize();
41101 bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
41102 return isTargetShuffleVariableMask(N->getOpcode());
41103 });
41104 if (IsOptimizingSize && !HasVariableMask &&
41105 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
41106 return SDValue();
41107
41108 // Shuffle the constant bits according to the mask.
41109 APInt UndefElts(NumMaskElts, 0);
41110 APInt ZeroElts(NumMaskElts, 0);
41111 APInt ConstantElts(NumMaskElts, 0);
41112 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
41113 APInt::getZero(MaskSizeInBits));
41114 for (unsigned i = 0; i != NumMaskElts; ++i) {
41115 int M = Mask[i];
41116 if (M == SM_SentinelUndef) {
41117 UndefElts.setBit(i);
41118 continue;
41119 } else if (M == SM_SentinelZero) {
41120 ZeroElts.setBit(i);
41121 continue;
41122 }
41123 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
41124
41125 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
41126 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
41127
41128 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
41129 if (SrcUndefElts[SrcMaskIdx]) {
41130 UndefElts.setBit(i);
41131 continue;
41132 }
41133
41134 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
41135 APInt &Bits = SrcEltBits[SrcMaskIdx];
41136 if (!Bits) {
41137 ZeroElts.setBit(i);
41138 continue;
41139 }
41140
41141 ConstantElts.setBit(i);
41142 ConstantBitData[i] = Bits;
41143 }
41144 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
41145
41146 // Attempt to create a zero vector.
41147 if ((UndefElts | ZeroElts).isAllOnes())
41148 return getZeroVector(VT, Subtarget, DAG, DL);
41149
41150 // Create the constant data.
41151 MVT MaskSVT;
41152 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
41153 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
41154 else
41155 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
41156
41157 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
41158 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
41159 return SDValue();
41160
41161 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
41162 return DAG.getBitcast(VT, CstOp);
41163}
41164
41165namespace llvm {
41166 namespace X86 {
41167 enum {
41168 MaxShuffleCombineDepth = 8
41169 };
41170 } // namespace X86
41171} // namespace llvm
41172
41173/// Fully generic combining of x86 shuffle instructions.
41174///
41175/// This should be the last combine run over the x86 shuffle instructions. Once
41176/// they have been fully optimized, this will recursively consider all chains
41177/// of single-use shuffle instructions, build a generic model of the cumulative
41178/// shuffle operation, and check for simpler instructions which implement this
41179/// operation. We use this primarily for two purposes:
41180///
41181/// 1) Collapse generic shuffles to specialized single instructions when
41182/// equivalent. In most cases, this is just an encoding size win, but
41183/// sometimes we will collapse multiple generic shuffles into a single
41184/// special-purpose shuffle.
41185/// 2) Look for sequences of shuffle instructions with 3 or more total
41186/// instructions, and replace them with the slightly more expensive SSSE3
41187/// PSHUFB instruction if available. We do this as the last combining step
41188/// to ensure we avoid using PSHUFB if we can implement the shuffle with
41189/// a suitable short sequence of other instructions. The PSHUFB will either
41190/// use a register or have to read from memory and so is slightly (but only
41191/// slightly) more expensive than the other shuffle instructions.
41192///
41193/// Because this is inherently a quadratic operation (for each shuffle in
41194/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
41195/// This should never be an issue in practice as the shuffle lowering doesn't
41196/// produce sequences of more than 8 instructions.
41197///
41198/// FIXME: We will currently miss some cases where the redundant shuffling
41199/// would simplify under the threshold for PSHUFB formation because of
41200/// combine-ordering. To fix this, we should do the redundant instruction
41201/// combining in this recursive walk.
41202 static SDValue combineX86ShufflesRecursively(
41203 ArrayRef<SDValue> SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT,
41204 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
41205 unsigned MaxDepth, bool AllowVariableCrossLaneMask,
41206 bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
41207 const SDLoc &DL, const X86Subtarget &Subtarget) {
41208 assert(!RootMask.empty() &&
41209 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
41210 "Illegal shuffle root mask");
41211 assert(RootVT.isVector() && "Shuffles operate on vector types!");
41212 unsigned RootSizeInBits = RootVT.getSizeInBits();
41213 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41214
41215 // Bound the depth of our recursive combine because this is ultimately
41216 // quadratic in nature.
41217 if (Depth >= MaxDepth)
41218 return SDValue();
41219
41220 // Directly rip through bitcasts to find the underlying operand.
41221 SDValue Op = SrcOps[SrcOpIndex];
41222 Op = peekThroughBitcasts(Op);
41223
41224 EVT VT = Op.getValueType();
41225 if (!VT.isVector() || !VT.isSimple())
41226 return SDValue(); // Bail if we hit a non-simple non-vector.
41227
41228 // FIXME: Just bail on f16 for now.
41229 if (VT.getVectorElementType() == MVT::f16)
41230 return SDValue();
41231
41232 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
41233 "Can only combine shuffles up to the size of the root op.");
41234
41235 // Create a demanded elts mask from the referenced elements of Op.
41236 APInt OpDemandedElts = APInt::getZero(RootMask.size());
41237 for (int M : RootMask) {
41238 int BaseIdx = RootMask.size() * SrcOpIndex;
41239 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
41240 OpDemandedElts.setBit(M - BaseIdx);
41241 }
41242 if (RootSizeInBits != VT.getSizeInBits()) {
41243 // Op is smaller than Root - extract the demanded elts for the subvector.
41244 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
41245 unsigned NumOpMaskElts = RootMask.size() / Scale;
41246 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
41247 assert(OpDemandedElts
41248 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
41249 .isZero() &&
41250 "Out of range elements referenced in root mask");
41251 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
41252 }
41253 OpDemandedElts =
41254 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
41255
41256 // Extract target shuffle mask and resolve sentinels and inputs.
41257 SmallVector<int, 64> OpMask;
41258 SmallVector<SDValue, 2> OpInputs;
41259 APInt OpUndef, OpZero;
41260 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
41261 OpZero, DAG, Depth, false)) {
41262 // Shuffle inputs must not be larger than the shuffle result.
41263 // TODO: Relax this for single input faux shuffles (e.g. trunc).
41264 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
41265 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
41266 }))
41267 return SDValue();
41268 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41269 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41270 !isNullConstant(Op.getOperand(1))) {
41271 SDValue SrcVec = Op.getOperand(0);
41272 int ExtractIdx = Op.getConstantOperandVal(1);
41273 unsigned NumElts = VT.getVectorNumElements();
41274 OpInputs.assign({SrcVec});
41275 OpMask.assign(NumElts, SM_SentinelUndef);
41276 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
41277 OpZero = OpUndef = APInt::getZero(NumElts);
41278 } else {
41279 return SDValue();
41280 }
41281
41282 // If the shuffle result was smaller than the root, we need to adjust the
41283 // mask indices and pad the mask with undefs.
41284 if (RootSizeInBits > VT.getSizeInBits()) {
41285 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
41286 unsigned OpMaskSize = OpMask.size();
41287 if (OpInputs.size() > 1) {
41288 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
41289 for (int &M : OpMask) {
41290 if (M < 0)
41291 continue;
41292 int EltIdx = M % OpMaskSize;
41293 int OpIdx = M / OpMaskSize;
41294 M = (PaddedMaskSize * OpIdx) + EltIdx;
41295 }
41296 }
41297 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
41298 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
41299 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
41300 }
41301
41302 SmallVector<SDValue, 16> Ops;
41303 SmallVector<int, 64> Mask;
41304
41305 // We don't need to merge masks if the root is empty.
41306 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
41307 if (EmptyRoot) {
41308 // Only resolve zeros if it will remove an input, otherwise we might end
41309 // up in an infinite loop.
41310 bool ResolveKnownZeros = true;
41311 if (!OpZero.isZero()) {
41312 APInt UsedInputs = APInt::getZero(OpInputs.size());
41313 for (int i = 0, e = OpMask.size(); i != e; ++i) {
41314 int M = OpMask[i];
41315 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
41316 continue;
41317 UsedInputs.setBit(M / OpMask.size());
41318 if (UsedInputs.isAllOnes()) {
41319 ResolveKnownZeros = false;
41320 break;
41321 }
41322 }
41323 }
41324 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
41325 ResolveKnownZeros);
41326
41327 Mask = OpMask;
41328 Ops.append(OpInputs.begin(), OpInputs.end());
41329 } else {
41330 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
41331
41332 // Add the inputs to the Ops list, avoiding duplicates.
41333 Ops.append(SrcOps.begin(), SrcOps.end());
41334
41335 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
41336 // Attempt to find an existing match.
41337 SDValue InputBC = peekThroughBitcasts(Input);
41338 for (int i = 0, e = Ops.size(); i < e; ++i)
41339 if (InputBC == peekThroughBitcasts(Ops[i]))
41340 return i;
41341 // Match failed - should we replace an existing Op?
41342 if (InsertionPoint >= 0) {
41343 Ops[InsertionPoint] = Input;
41344 return InsertionPoint;
41345 }
41346 // Add to the end of the Ops list.
41347 Ops.push_back(Input);
41348 return Ops.size() - 1;
41349 };
41350
41351 SmallVector<int, 2> OpInputIdx;
41352 for (SDValue OpInput : OpInputs)
41353 OpInputIdx.push_back(
41354 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
41355
41356 assert(((RootMask.size() > OpMask.size() &&
41357 RootMask.size() % OpMask.size() == 0) ||
41358 (OpMask.size() > RootMask.size() &&
41359 OpMask.size() % RootMask.size() == 0) ||
41360 OpMask.size() == RootMask.size()) &&
41361 "The smaller number of elements must divide the larger.");
41362
41363 // This function can be performance-critical, so we rely on the power-of-2
41364 // knowledge that we have about the mask sizes to replace div/rem ops with
41365 // bit-masks and shifts.
41366 assert(llvm::has_single_bit(RootMask.size()) &&
41367 "Non-power-of-2 shuffle mask sizes");
41368 assert(llvm::has_single_bit(OpMask.size()) &&
41369 "Non-power-of-2 shuffle mask sizes");
41370 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
41371 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
41372
41373 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
41374 unsigned RootRatio =
41375 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
41376 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
41377 assert((RootRatio == 1 || OpRatio == 1) &&
41378 "Must not have a ratio for both incoming and op masks!");
41379
41380 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
41381 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
41382 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
41383 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
41384 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
41385
41386 Mask.resize(MaskWidth, SM_SentinelUndef);
41387
41388 // Merge this shuffle operation's mask into our accumulated mask. Note that
41389 // this shuffle's mask will be the first applied to the input, followed by
41390 // the root mask to get us all the way to the root value arrangement. The
41391 // reason for this order is that we are recursing up the operation chain.
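// Illustrative example (not from the original source): if the accumulated
// RootMask is {2, 3, 0, 1} and the current op's OpMask is {1, 0, 3, 2} (both
// v4, single input), element i of the merged mask is OpMask[RootMask[i]],
// giving {3, 2, 1, 0} - the op's shuffle applies first, then the root's.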
41392 for (unsigned i = 0; i < MaskWidth; ++i) {
41393 unsigned RootIdx = i >> RootRatioLog2;
41394 if (RootMask[RootIdx] < 0) {
41395 // This is a zero or undef lane, we're done.
41396 Mask[i] = RootMask[RootIdx];
41397 continue;
41398 }
41399
41400 unsigned RootMaskedIdx =
41401 RootRatio == 1
41402 ? RootMask[RootIdx]
41403 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
41404
41405 // Just insert the scaled root mask value if it references an input other
41406 // than the SrcOp we're currently inserting.
41407 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
41408 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
41409 Mask[i] = RootMaskedIdx;
41410 continue;
41411 }
41412
41413 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
41414 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
41415 if (OpMask[OpIdx] < 0) {
41416 // The incoming lanes are zero or undef, it doesn't matter which ones we
41417 // are using.
41418 Mask[i] = OpMask[OpIdx];
41419 continue;
41420 }
41421
41422 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
41423 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
41424 : (OpMask[OpIdx] << OpRatioLog2) +
41425 (RootMaskedIdx & (OpRatio - 1));
41426
41427 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
41428 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
41429 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
41430 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
41431
41432 Mask[i] = OpMaskedIdx;
41433 }
41434 }
41435
41436 // Peek through any free bitcasts to insert_subvector vector widenings or
41437 // extract_subvector nodes back to root size.
41438 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
41439 for (auto [I, Op] : enumerate(Ops)) {
41440 SDValue BC = Op;
41441 while (1) {
41442 if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse()) {
41443 BC = BC.getOperand(0);
41444 continue;
41445 }
41446 if (BC.getOpcode() == ISD::INSERT_SUBVECTOR &&
41447 BC.getOperand(0).isUndef() && isNullConstant(BC.getOperand(2))) {
41448 // Set out of bounds mask indices to undef.
41449 Op = BC = BC.getOperand(1);
41450 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
41451 int Lo = I * Mask.size();
41452 int Hi = (I + 1) * Mask.size();
41453 int NewHi = Lo + (Mask.size() / Scale);
41454 for (int &M : Mask) {
41455 if (Lo <= M && NewHi <= M && M < Hi)
41456 M = SM_SentinelUndef;
41457 }
41458 continue;
41459 }
41460 if (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41461 (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
41462 isNullConstant(BC.getOperand(1))) {
41463 Op = BC = BC.getOperand(0);
41464 continue;
41465 }
41466 break;
41467 }
41468 }
41469
41470 // Remove unused/repeated shuffle source ops.
41471 resolveTargetShuffleInputsAndMask(Ops, Mask);
41472
41473 // Handle the all undef/zero/ones cases early.
41474 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41475 return DAG.getUNDEF(RootVT);
41476 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41477 return getZeroVector(RootVT, Subtarget, DAG, DL);
41478 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41479 !llvm::is_contained(Mask, SM_SentinelZero))
41480 return getOnesVector(RootVT, DAG, DL);
41481
41482 assert(!Ops.empty() && "Shuffle with no inputs detected");
41483
41484 // Update the list of shuffle nodes that have been combined so far.
41485 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes);
41486 CombinedNodes.push_back(Op.getNode());
41487
41488 // See if we can recurse into each shuffle source op (if it's a target
41489 // shuffle). The source op should only be generally combined if it either has
41490 // a single use (i.e. current Op) or all its users have already been combined;
41491 // if not, we can still combine but should prevent generation of variable
41492 // shuffles to avoid constant pool bloat.
41493 // Don't recurse if we already have more source ops than we can combine in
41494 // the remaining recursion depth.
41495 if (Ops.size() < (MaxDepth - Depth)) {
41496 for (int i = 0, e = Ops.size(); i < e; ++i) {
41497 // For empty roots, we need to resolve zeroable elements before combining
41498 // them with other shuffles.
41499 SmallVector<int, 64> ResolvedMask = Mask;
41500 if (EmptyRoot)
41501 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41502 bool AllowCrossLaneVar = false;
41503 bool AllowPerLaneVar = false;
41504 if (Ops[i].getNode()->hasOneUse() ||
41505 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41506 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41507 AllowPerLaneVar = AllowVariablePerLaneMask;
41508 }
41509 if (SDValue Res = combineX86ShufflesRecursively(
41510 Ops, i, RootOpc, RootVT, ResolvedMask, CombinedNodes, Depth + 1,
41511 MaxDepth, AllowCrossLaneVar, AllowPerLaneVar, IsMaskedShuffle,
41512 DAG, DL, Subtarget))
41513 return Res;
41514 }
41515 }
41516
41517 // Attempt to constant fold all of the constant source ops.
41518 if (SDValue Cst = combineX86ShufflesConstants(
41519 RootVT, Ops, Mask, CombinedNodes, DAG, DL, Subtarget))
41520 return Cst;
41521
41522 // If constant fold failed and we only have constants - then we have
41523 // multiple uses by a single non-variable shuffle - just bail.
41524 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41525 APInt UndefElts;
41526 SmallVector<APInt> RawBits;
41527 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41528 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41529 RawBits,
41530 /*AllowWholeUndefs*/ true,
41531 /*AllowPartialUndefs*/ true);
41532 })) {
41533 return SDValue();
41534 }
41535
41536 // Canonicalize the combined shuffle mask chain with horizontal ops.
41537 // NOTE: This will update the Ops and Mask.
41538 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41539 Ops, Mask, RootSizeInBits, DL, DAG, Subtarget))
41540 return DAG.getBitcast(RootVT, HOp);
41541
41542 // Try to refine our inputs given our knowledge of target shuffle mask.
41543 for (auto I : enumerate(Ops)) {
41544 int OpIdx = I.index();
41545 SDValue &Op = I.value();
41546
41547 // What range of shuffle mask element values results in picking from Op?
41548 int Lo = OpIdx * Mask.size();
41549 int Hi = Lo + Mask.size();
41550
41551 // Which elements of Op do we demand, given the mask's granularity?
41552 APInt OpDemandedElts(Mask.size(), 0);
41553 for (int MaskElt : Mask) {
41554 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41555 int OpEltIdx = MaskElt - Lo;
41556 OpDemandedElts.setBit(OpEltIdx);
41557 }
41558 }
41559
41560 // Is the shuffle result smaller than the root?
41561 if (Op.getValueSizeInBits() < RootSizeInBits) {
41562 // We padded the mask with undefs. But we now need to undo that.
41563 unsigned NumExpectedVectorElts = Mask.size();
41564 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41565 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41566 assert(!OpDemandedElts.extractBits(
41567 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41568 "Demanding the virtual undef widening padding?");
41569 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41570 }
41571
41572 // The Op itself may be of different VT, so we need to scale the mask.
41573 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41574 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41575
41576 // Can this operand be simplified any further, given its demanded elements?
41577 if (SDValue NewOp = TLI.SimplifyMultipleUseDemandedVectorElts(
41578 Op, OpScaledDemandedElts, DAG))
41579 Op = NewOp;
41580 }
41581 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41582
41583 // Widen any subvector shuffle inputs we've collected.
41584 // TODO: Remove this to avoid generating temporary nodes, we should only
41585 // widen once combineX86ShuffleChain has found a match.
41586 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41587 return Op.getValueSizeInBits() < RootSizeInBits;
41588 })) {
41589 for (SDValue &Op : Ops)
41590 if (Op.getValueSizeInBits() < RootSizeInBits)
41591 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41592 RootSizeInBits);
41593 // Reresolve - we might have repeated subvector sources.
41594 resolveTargetShuffleInputsAndMask(Ops, Mask);
41595 }
41596
41597 // Handle the all undef/zero/ones cases.
41598 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41599 return DAG.getUNDEF(RootVT);
41600 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41601 return getZeroVector(RootVT, Subtarget, DAG, DL);
41602 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41603 !llvm::is_contained(Mask, SM_SentinelZero))
41604 return getOnesVector(RootVT, DAG, DL);
41605
41606 assert(!Ops.empty() && "Shuffle with no inputs detected");
41607
41608 // We can only combine unary and binary shuffle mask cases.
41609 if (Ops.size() <= 2) {
41610 // Minor canonicalization of the accumulated shuffle mask to make it easier
41611 // to match below. All this does is detect masks with sequential pairs of
41612 // elements, and shrink them to the half-width mask. It does this in a loop
41613 // so it will reduce the size of the mask to the minimal width mask which
41614 // performs an equivalent shuffle.
41615 while (Mask.size() > 1) {
41616 SmallVector<int, 64> WidenedMask;
41617 if (!canWidenShuffleElements(Mask, WidenedMask))
41618 break;
41619 Mask = std::move(WidenedMask);
41620 }
41621
41622 // Canonicalization of binary shuffle masks to improve pattern matching by
41623 // commuting the inputs.
41624 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41625 ShuffleVectorSDNode::commuteMask(Mask);
41626 std::swap(Ops[0], Ops[1]);
41627 }
41628
41629 // Try to combine into a single shuffle instruction.
41630 if (SDValue Shuffle = combineX86ShuffleChain(
41631 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41632 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
41633 IsMaskedShuffle, DAG, DL, Subtarget))
41634 return Shuffle;
41635
41636 // If all the operands come from the same larger vector, fallthrough and try
41637 // to use combineX86ShuffleChainWithExtract.
41638 SDValue LHS = peekThroughBitcasts(Ops.front());
41639 SDValue RHS = peekThroughBitcasts(Ops.back());
41640 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41641 (RootSizeInBits / Mask.size()) != 64 ||
41642 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41643 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41644 LHS.getOperand(0) != RHS.getOperand(0))
41645 return SDValue();
41646 }
41647
41648 // If that failed and any input is extracted then try to combine as a
41649 // shuffle with the larger type.
41650 return combineX86ShuffleChainWithExtract(
41651 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41652 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
41653 DAG, DL, Subtarget);
41654}
41655
41656/// Helper entry wrapper to combineX86ShufflesRecursively.
41657 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41658 const X86Subtarget &Subtarget) {
41659 return combineX86ShufflesRecursively(
41660 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), {0}, {}, /*Depth=*/0,
41661 X86::MaxShuffleCombineDepth, /*AllowVariableCrossLaneMask=*/true,
41662 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget), DAG,
41663 SDLoc(Op), Subtarget);
41664}
41665
41666/// Get the PSHUF-style mask from PSHUF node.
41667///
41668 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41669/// PSHUF-style masks that can be reused with such instructions.
41670 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41671 MVT VT = N.getSimpleValueType();
41672 SmallVector<int, 4> Mask;
41673 SmallVector<SDValue, 2> Ops;
41674 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
41675 (void)HaveMask;
41676 assert(HaveMask);
41677
41678 // If we have more than 128 bits, only the low 128 bits of the shuffle mask
41679 // matter. Check that the upper masks are repeats and remove them.
41680 if (VT.getSizeInBits() > 128) {
41681 int LaneElts = 128 / VT.getScalarSizeInBits();
41682#ifndef NDEBUG
41683 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41684 for (int j = 0; j < LaneElts; ++j)
41685 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41686 "Mask doesn't repeat in high 128-bit lanes!");
41687#endif
41688 Mask.resize(LaneElts);
41689 }
41690
41691 switch (N.getOpcode()) {
41692 case X86ISD::PSHUFD:
41693 return Mask;
41694 case X86ISD::PSHUFLW:
41695 Mask.resize(4);
41696 return Mask;
41697 case X86ISD::PSHUFHW:
41698 Mask.erase(Mask.begin(), Mask.begin() + 4);
41699 for (int &M : Mask)
41700 M -= 4;
41701 return Mask;
41702 default:
41703 llvm_unreachable("No valid shuffle instruction found!");
41704 }
41705}
41706
41707/// Get the expanded blend mask from a BLENDI node.
41708/// For v16i16 nodes, this will splat the repeated i8 mask.
41709 static APInt getBLENDIBlendMask(SDValue V) {
41710 assert(V.getOpcode() == X86ISD::BLENDI && "Unknown blend shuffle");
41711 unsigned NumElts = V.getSimpleValueType().getVectorNumElements();
41712 APInt Mask = V.getConstantOperandAPInt(2);
41713 if (Mask.getBitWidth() > NumElts)
41714 Mask = Mask.trunc(NumElts);
41715 if (NumElts == 16) {
41716 assert(Mask.getBitWidth() == 8 && "Unexpected v16i16 blend mask width");
41717 Mask = APInt::getSplat(16, Mask);
41718 }
41719 assert(Mask.getBitWidth() == NumElts && "Unexpected blend mask width");
41720 return Mask;
41721}
41722
41723/// Search for a combinable shuffle across a chain ending in pshufd.
41724///
41725/// We walk up the chain and look for a combinable shuffle, skipping over
41726/// shuffles that we could hoist this shuffle's transformation past without
41727/// altering anything.
41728 static SDValue
41729 combineRedundantDwordShuffle(SDValue N, MutableArrayRef<int> Mask,
41730 const SDLoc &DL,
41731 SelectionDAG &DAG) {
41732 assert(N.getOpcode() == X86ISD::PSHUFD &&
41733 "Called with something other than an x86 128-bit half shuffle!");
41734
41735 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41736 // of the shuffles in the chain so that we can form a fresh chain to replace
41737 // this one.
41738 SmallVector<SDValue, 8> Chain;
41739 SDValue V = N.getOperand(0);
41740 for (; V.hasOneUse(); V = V.getOperand(0)) {
41741 switch (V.getOpcode()) {
41742 default:
41743 return SDValue(); // Nothing combined!
41744
41745 case ISD::BITCAST:
41746 // Skip bitcasts as we always know the type for the target specific
41747 // instructions.
41748 continue;
41749
41750 case X86ISD::PSHUFD:
41751 // Found another dword shuffle.
41752 break;
41753
41754 case X86ISD::PSHUFLW:
41755 // Check that the low words (being shuffled) are the identity in the
41756 // dword shuffle, and the high words are self-contained.
41757 if (Mask[0] != 0 || Mask[1] != 1 ||
41758 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41759 return SDValue();
41760
41761 Chain.push_back(V);
41762 continue;
41763
41764 case X86ISD::PSHUFHW:
41765 // Check that the high words (being shuffled) are the identity in the
41766 // dword shuffle, and the low words are self-contained.
41767 if (Mask[2] != 2 || Mask[3] != 3 ||
41768 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41769 return SDValue();
41770
41771 Chain.push_back(V);
41772 continue;
41773
41774 case X86ISD::UNPCKL:
41775 case X86ISD::UNPCKH:
41776 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41777 // shuffle into a preceding word shuffle.
41778 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41779 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41780 return SDValue();
41781
41782 // Search for a half-shuffle which we can combine with.
41783 unsigned CombineOp =
41784 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41785 if (V.getOperand(0) != V.getOperand(1) ||
41786 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41787 return SDValue();
41788 Chain.push_back(V);
41789 V = V.getOperand(0);
41790 do {
41791 switch (V.getOpcode()) {
41792 default:
41793 return SDValue(); // Nothing to combine.
41794
41795 case X86ISD::PSHUFLW:
41796 case X86ISD::PSHUFHW:
41797 if (V.getOpcode() == CombineOp)
41798 break;
41799
41800 Chain.push_back(V);
41801
41802 [[fallthrough]];
41803 case ISD::BITCAST:
41804 V = V.getOperand(0);
41805 continue;
41806 }
41807 break;
41808 } while (V.hasOneUse());
41809 break;
41810 }
41811 // Break out of the loop if we break out of the switch.
41812 break;
41813 }
41814
41815 if (!V.hasOneUse())
41816 // We fell out of the loop without finding a viable combining instruction.
41817 return SDValue();
41818
41819 // Merge this node's mask and our incoming mask.
41820 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41821 for (int &M : Mask)
41822 M = VMask[M];
41823 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41824 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41825
41826 // Rebuild the chain around this new shuffle.
41827 while (!Chain.empty()) {
41828 SDValue W = Chain.pop_back_val();
41829
41830 if (V.getValueType() != W.getOperand(0).getValueType())
41831 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41832
41833 switch (W.getOpcode()) {
41834 default:
41835 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
41836
41837 case X86ISD::UNPCKL:
41838 case X86ISD::UNPCKH:
41839 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41840 break;
41841
41842 case X86ISD::PSHUFD:
41843 case X86ISD::PSHUFLW:
41844 case X86ISD::PSHUFHW:
41845 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41846 break;
41847 }
41848 }
41849 if (V.getValueType() != N.getValueType())
41850 V = DAG.getBitcast(N.getValueType(), V);
41851
41852 // Return the new chain to replace N.
41853 return V;
41854}
41855
41856// Attempt to commute shufps LHS loads:
41857// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
41858 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41859 SelectionDAG &DAG) {
41860 // TODO: Add vXf64 support.
41861 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41862 return SDValue();
41863
41864 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41865 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41866 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41867 return SDValue();
41868 SDValue N0 = V.getOperand(0);
41869 SDValue N1 = V.getOperand(1);
41870 unsigned Imm = V.getConstantOperandVal(2);
41871 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41872 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41873         X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41874 return SDValue();
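    // Each 4-bit half of the SHUFP immediate selects elements from one source
    // operand, so commuting the operands means swapping the two nibbles.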
41875 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41876 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41877 DAG.getTargetConstant(Imm, DL, MVT::i8));
41878 };
41879
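  // After commuting a SHUFP, its low and high element pairs (per 128-bit lane)
  // swap sources, so any 2-bit index into the commuted result must be XORed
  // with 0b10: 0xAA adjusts all four indices, 0x0A/0xA0 adjust only the pair
  // that reads from the commuted operand.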
41880 switch (N.getOpcode()) {
41881 case X86ISD::VPERMILPI:
41882 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41883 unsigned Imm = N.getConstantOperandVal(1);
41884 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41885 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41886 }
41887 break;
41888 case X86ISD::SHUFP: {
41889 SDValue N0 = N.getOperand(0);
41890 SDValue N1 = N.getOperand(1);
41891 unsigned Imm = N.getConstantOperandVal(2);
41892 if (N0 == N1) {
41893 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41894 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41895 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41896 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41897 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41898 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41899 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41900 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41901 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41902 }
41903 break;
41904 }
41905 }
41906
41907 return SDValue();
41908}
41909
41910// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
41911// iff we don't demand the same element index for both X and Y.
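// e.g. blend<0,5,2,7>(permute<2,u,0,u>(X), permute<u,3,u,1>(Y)) only demands
// X[0],X[2] and Y[1],Y[3], so it can be rewritten as
// permute<2,3,0,1>(blend<0,5,2,7>(X,Y)).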
41912static SDValue
41913 combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
41914 const APInt &DemandedElts, SelectionDAG &DAG,
41915 const X86Subtarget &Subtarget, const SDLoc &DL) {
41916 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
41917 if (!N0.hasOneUse() || !N1.hasOneUse())
41918 return SDValue();
41919
41920 unsigned NumElts = VT.getVectorNumElements();
41921  SDValue BC0 = peekThroughOneUseBitcasts(N0);
41922  SDValue BC1 = peekThroughOneUseBitcasts(N1);
41923
41924 // See if both operands are shuffles, and that we can scale the shuffle masks
41925 // to the same width as the blend mask.
41926 // TODO: Support SM_SentinelZero?
41927 SmallVector<SDValue, 2> Ops0, Ops1;
41928 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
41929 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
41930 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
41931 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
41932 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
41933 return SDValue();
41934
41935 // Determine the demanded elts from both permutes.
41936 APInt Demanded0, DemandedLHS0, DemandedRHS0;
41937 APInt Demanded1, DemandedLHS1, DemandedRHS1;
41938 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
41939 Demanded1,
41940 /*AllowUndefElts=*/true) ||
41941 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
41942 DemandedRHS0, /*AllowUndefElts=*/true) ||
41943 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
41944 DemandedRHS1, /*AllowUndefElts=*/true))
41945 return SDValue();
41946
41947 // Confirm that we only use a single operand from both permutes and that we
41948 // don't demand the same index from both.
41949 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
41950 DemandedLHS0.intersects(DemandedLHS1))
41951 return SDValue();
41952
41953 // Use the permute demanded elts masks as the new blend mask.
41954 // Create the new permute mask as a blend of the 2 original permute masks.
41955 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
41956 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
41957 for (unsigned I = 0; I != NumElts; ++I) {
41958 if (Demanded0[I]) {
41959 int M = ScaledMask0[I];
41960 if (0 <= M) {
41961 assert(isUndefOrEqual(NewBlendMask[M], M) &&
41962 "BlendMask demands LHS AND RHS");
41963 NewBlendMask[M] = M;
41964 NewPermuteMask[I] = M;
41965 }
41966 } else if (Demanded1[I]) {
41967 int M = ScaledMask1[I];
41968 if (0 <= M) {
41969 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
41970 "BlendMask demands LHS AND RHS");
41971 NewBlendMask[M] = M + NumElts;
41972 NewPermuteMask[I] = M;
41973 }
41974 }
41975 }
41976 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
41977 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
41978
41979 // v16i16 shuffles can explode in complexity very easily, only accept them if
41980 // the blend mask is the same in the 128-bit subvectors (or can widen to
41981 // v8i32) and the permute can be widened as well.
41982 if (VT == MVT::v16i16) {
41983 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
41984 !canWidenShuffleElements(NewBlendMask))
41985 return SDValue();
41986 if (!canWidenShuffleElements(NewPermuteMask))
41987 return SDValue();
41988 }
41989
41990 // Don't introduce lane-crossing permutes without AVX2, unless it can be
41991 // widened to a lane permute (vperm2f128).
41992 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
41993       isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(),
41994 NewPermuteMask) &&
41995 !canScaleShuffleElements(NewPermuteMask, 2))
41996 return SDValue();
41997
41998 SDValue NewBlend =
41999 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
42000 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
42001 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
42002 NewPermuteMask);
42003}
42004
42005// TODO - move this to TLI like isBinOp?
42006static bool isUnaryOp(unsigned Opcode) {
42007 switch (Opcode) {
42008 case ISD::CTLZ:
42009 case ISD::CTTZ:
42010 case ISD::CTPOP:
42011 return true;
42012 }
42013 return false;
42014}
42015
42016// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
42017// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
42018 static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
42019 const SDLoc &DL) {
42020 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42021 EVT ShuffleVT = N.getValueType();
42022 unsigned Opc = N.getOpcode();
42023
42024 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true) {
42025 // AllZeros/AllOnes constants are freely shuffled and will peek through
42026 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
42027 // merge with target shuffles if it has one use so shuffle combining is
42028 // likely to kick in. Shuffles of splats are expected to be removed.
42029 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
42030 ISD::isBuildVectorAllZeros(Op.getNode()) ||
42031            ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
42032            ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
42033            getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
42034 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
42035 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
42036 (Op.getOpcode() == ISD::CONCAT_VECTORS && Op->hasOneUse()) ||
42037 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
42038 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
42039 };
42040 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
42041 // Ensure we only shuffle whole vector src elements, unless its a logical
42042 // binops where we can more aggressively move shuffles from dst to src.
42043 return isLogicOp(BinOp) ||
42044 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
42045 };
42046
42047 switch (Opc) {
42048 // Unary and Unary+Permute Shuffles.
42049 case X86ISD::PSHUFB: {
42050 // Don't merge PSHUFB if it contains zero'd elements.
42051 SmallVector<int> Mask;
42052     SmallVector<SDValue> Ops;
42053 if (!getTargetShuffleMask(N, false, Ops, Mask))
42054 break;
42055 [[fallthrough]];
42056 }
42057 case X86ISD::VBROADCAST:
42058 case X86ISD::MOVDDUP:
42059 case X86ISD::PSHUFD:
42060 case X86ISD::PSHUFHW:
42061 case X86ISD::PSHUFLW:
42062 case X86ISD::VPERMV:
42063 case X86ISD::VPERMI:
42064 case X86ISD::VPERMILPI: {
42065 unsigned SrcIdx = Opc == X86ISD::VPERMV ? 1 : 0;
42066 if (N.getOperand(SrcIdx).getValueType() == ShuffleVT &&
42067 N->isOnlyUserOf(N.getOperand(SrcIdx).getNode())) {
42068 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(SrcIdx));
42069 unsigned SrcOpcode = N0.getOpcode();
42070 EVT OpVT = N0.getValueType();
42071 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
42072         SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42073         SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42074 bool FoldShuf = Opc != X86ISD::VPERMI && Opc != X86ISD::VPERMV;
42075 if (IsMergeableWithShuffle(Op00, FoldShuf) ||
42076 IsMergeableWithShuffle(Op01, FoldShuf)) {
42077 SDValue LHS, RHS;
42078 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42079 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42080 if (Opc == X86ISD::VPERMV) {
42081 LHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op00);
42082 RHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op01);
42083 } else if (N.getNumOperands() == 2) {
42084 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
42085 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
42086 } else {
42087 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
42088 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
42089 }
42090 return DAG.getBitcast(ShuffleVT,
42091 DAG.getNode(SrcOpcode, DL, OpVT,
42092 DAG.getBitcast(OpVT, LHS),
42093 DAG.getBitcast(OpVT, RHS)));
42094 }
42095 }
42096 if (SrcOpcode == ISD::SINT_TO_FP && IsSafeToMoveShuffle(N0, SrcOpcode) &&
42097 OpVT.getScalarSizeInBits() ==
42098               N0.getOperand(0).getScalarValueSizeInBits()) {
42099 SDValue Res = DAG.getBitcast(ShuffleVT, N0.getOperand(0));
42100 if (Opc == X86ISD::VPERMV)
42101 Res = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Res);
42102 else if (N.getNumOperands() == 2)
42103 Res = DAG.getNode(Opc, DL, ShuffleVT, Res, N.getOperand(1));
42104 else
42105 Res = DAG.getNode(Opc, DL, ShuffleVT, Res);
42106 Res = DAG.getBitcast(N0.getOperand(0).getValueType(), Res);
42107 return DAG.getBitcast(ShuffleVT, DAG.getNode(SrcOpcode, DL, OpVT, Res));
42108 }
42109 }
42110 break;
42111 }
42112 // Binary and Binary+Permute Shuffles.
42113 case X86ISD::INSERTPS: {
42114 // Don't merge INSERTPS if it contains zero'd elements.
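    // INSERTPS immediate layout: bits[7:6] = source element, bits[5:4] =
    // destination element, bits[3:0] = zero mask.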
42115 unsigned InsertPSMask = N.getConstantOperandVal(2);
42116 unsigned ZeroMask = InsertPSMask & 0xF;
42117 if (ZeroMask != 0)
42118 break;
42119 [[fallthrough]];
42120 }
42121 case X86ISD::MOVSD:
42122 case X86ISD::MOVSS:
42123 case X86ISD::BLENDI:
42124 case X86ISD::SHUFP:
42125 case X86ISD::UNPCKH:
42126 case X86ISD::UNPCKL: {
42127 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
42128 N->isOnlyUserOf(N.getOperand(1).getNode())) {
42129 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
42130 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
42131 unsigned SrcOpcode = N0.getOpcode();
42132 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42133 N0.getValueType() == N1.getValueType() &&
42134 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42135 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42136         SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42137         SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42138         SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42139         SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
42140 // Ensure the total number of shuffles doesn't increase by folding this
42141 // shuffle through to the source ops.
42142 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
42143 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
42144 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
42145 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
42146 SDValue LHS, RHS;
42147 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42148 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42149 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42150 Op11 = DAG.getBitcast(ShuffleVT, Op11);
42151 if (N.getNumOperands() == 3) {
42152 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42153 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
42154 } else {
42155 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42156 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
42157 }
42158 EVT OpVT = N0.getValueType();
42159 return DAG.getBitcast(ShuffleVT,
42160 DAG.getNode(SrcOpcode, DL, OpVT,
42161 DAG.getBitcast(OpVT, LHS),
42162 DAG.getBitcast(OpVT, RHS)));
42163 }
42164 }
42165 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42166 N0.getValueType() == N1.getValueType() &&
42167 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42168 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42169         SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42170         SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42171 SDValue Res;
42172 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42173 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42174 if (N.getNumOperands() == 3) {
42175 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42176 } else {
42177 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42178 }
42179 EVT OpVT = N0.getValueType();
42180 return DAG.getBitcast(
42181 ShuffleVT,
42182 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
42183 }
42184 // TODO: We can generalize this for other shuffles/conversions.
42185 if (Opc == X86ISD::UNPCKL && SrcOpcode == X86ISD::CVTPH2PS &&
42186 N1.getOpcode() == SrcOpcode &&
42187 N0.getValueType() == N1.getValueType() &&
42188 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
42189 ShuffleVT.getScalarSizeInBits() == N0.getScalarValueSizeInBits() &&
42190 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42191 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42192 EVT OpSrcVT = N0.getOperand(0).getValueType();
42193 EVT OpDstVT = N0.getValueType();
42194 SDValue Res =
42195 DAG.getNode(Opc, DL, OpSrcVT, N0.getOperand(0), N1.getOperand(0));
42196 return DAG.getBitcast(ShuffleVT,
42197 DAG.getNode(SrcOpcode, DL, OpDstVT, Res));
42198 }
42199 }
42200 break;
42201 }
42202 }
42203 return SDValue();
42204}
42205
42206/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
42207 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
42208 SelectionDAG &DAG,
42209 const SDLoc &DL) {
42210 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
42211
42212 MVT VT = V.getSimpleValueType();
42213 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
42214 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
42215 unsigned SrcOpc0 = Src0.getOpcode();
42216 unsigned SrcOpc1 = Src1.getOpcode();
42217 EVT SrcVT0 = Src0.getValueType();
42218 EVT SrcVT1 = Src1.getValueType();
42219
42220 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
42221 return SDValue();
42222
42223 switch (SrcOpc0) {
42224 case X86ISD::MOVDDUP: {
42225 SDValue LHS = Src0.getOperand(0);
42226 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42227 SDValue Res =
42228 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
42229 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
42230 return DAG.getBitcast(VT, Res);
42231 }
42232 case X86ISD::VPERMILPI:
42233 // TODO: Handle v4f64 permutes with different low/high lane masks.
42234 if (SrcVT0 == MVT::v4f64) {
42235 uint64_t Mask = Src0.getConstantOperandVal(1);
42236 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
42237 break;
42238 }
42239 [[fallthrough]];
42240 case X86ISD::VSHLI:
42241 case X86ISD::VSRLI:
42242 case X86ISD::VSRAI:
42243 case X86ISD::PSHUFD:
42244 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
42245 SDValue LHS = Src0.getOperand(0);
42246 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42247 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
42248 V.getOperand(2));
42249 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
42250 return DAG.getBitcast(VT, Res);
42251 }
42252 break;
42253 }
42254
42255 return SDValue();
42256}
42257
42258/// Try to combine x86 target specific shuffles.
42259 static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
42260 SelectionDAG &DAG,
42261                                     TargetLowering::DAGCombinerInfo &DCI,
42262 const X86Subtarget &Subtarget) {
42263 using namespace SDPatternMatch;
42264
42265 MVT VT = N.getSimpleValueType();
42266 unsigned NumElts = VT.getVectorNumElements();
42267  SmallVector<int, 4> Mask;
42268 unsigned Opcode = N.getOpcode();
42269 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42270
42271 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
42272 return R;
42273
42274 // Handle specific target shuffles.
42275 switch (Opcode) {
42276 case X86ISD::MOVDDUP: {
42277 SDValue Src = N.getOperand(0);
42278 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
42279 if (VT == MVT::v2f64 && Src.hasOneUse() &&
42280 ISD::isNormalLoad(Src.getNode())) {
42281 LoadSDNode *LN = cast<LoadSDNode>(Src);
42282 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
42283 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
42284 DCI.CombineTo(N.getNode(), Movddup);
42285 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42286         DCI.recursivelyDeleteUnusedNodes(LN);
42287 return N; // Return N so it doesn't get rechecked!
42288 }
42289 }
42290
42291 return SDValue();
42292 }
42293 case X86ISD::VBROADCAST: {
42294 SDValue Src = N.getOperand(0);
42295 SDValue BC = peekThroughBitcasts(Src);
42296 EVT SrcVT = Src.getValueType();
42297 EVT BCVT = BC.getValueType();
42298
42299 // If broadcasting from another shuffle, attempt to simplify it.
42300 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
42301 if (isTargetShuffle(BC.getOpcode()) &&
42302 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
42303 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
42304 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
42305                                         SM_SentinelUndef);
42306 for (unsigned i = 0; i != Scale; ++i)
42307 DemandedMask[i] = i;
42308       if (SDValue Res = combineX86ShufflesRecursively(
42309 {BC}, 0, BC.getOpcode(), BC.getSimpleValueType(), DemandedMask,
42310 {}, /*Depth=*/0, X86::MaxShuffleCombineDepth,
42311 /*AllowVariableCrossLaneMask=*/true,
42312 /*AllowVariablePerLaneMask=*/true,
42313 /*IsMaskedShuffle=*/false, DAG, DL, Subtarget))
42314 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42315 DAG.getBitcast(SrcVT, Res));
42316 }
42317
42318 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
42319 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
42320 if (Src.getOpcode() == ISD::BITCAST &&
42321 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
42322 TLI.isTypeLegal(BCVT) &&
42323         FixedVectorType::isValidElementType(
42324 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
42325 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
42326                                    VT.getSizeInBits() / BCVT.getScalarSizeInBits());
42327 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42328 }
42329
42330 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
42331 // If we're re-broadcasting a smaller type then broadcast with that type and
42332 // bitcast.
42333 // TODO: Do this for any splat?
42334 if (Src.getOpcode() == ISD::BITCAST &&
42335 (BC.getOpcode() == X86ISD::VBROADCAST ||
42336          BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
42337 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
42338 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
42339 MVT NewVT =
42340           MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
42341 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
42342 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42343 }
42344
42345 // Reduce broadcast source vector to lowest 128-bits.
42346 if (SrcVT.getSizeInBits() > 128)
42347 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42348 extract128BitVector(Src, 0, DAG, DL));
42349
42350 // broadcast(scalar_to_vector(x)) -> broadcast(x).
42351 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42352 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
42353 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42354
42355 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
42356 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
42357 isNullConstant(Src.getOperand(1)) &&
42358 Src.getValueType() ==
42359 Src.getOperand(0).getValueType().getScalarType() &&
42360 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
42361 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42362
42363 // Share broadcast with the longest vector and extract low subvector (free).
42364 // Ensure the same SDValue from the SDNode use is being used.
42365 for (SDNode *User : Src->users())
42366 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
42367 Src == User->getOperand(0) &&
42368 User->getValueSizeInBits(0).getFixedValue() >
42369 VT.getFixedSizeInBits()) {
42370 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
42371 VT.getSizeInBits());
42372 }
42373
42374 // vbroadcast(scalarload X) -> vbroadcast_load X
42375 // For float loads, extract other uses of the scalar from the broadcast.
42376 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
42377 ISD::isNormalLoad(Src.getNode())) {
42378 LoadSDNode *LN = cast<LoadSDNode>(Src);
42379 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42380 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42381 SDValue BcastLd =
42382           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42383 LN->getMemoryVT(), LN->getMemOperand());
42384 // If the load value is used only by N, replace it via CombineTo N.
42385 bool NoReplaceExtract = Src.hasOneUse();
42386 DCI.CombineTo(N.getNode(), BcastLd);
42387 if (NoReplaceExtract) {
42388 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42389         DCI.recursivelyDeleteUnusedNodes(LN);
42390 } else {
42391 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
42392 DAG.getVectorIdxConstant(0, DL));
42393 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
42394 }
42395 return N; // Return N so it doesn't get rechecked!
42396 }
42397
42398 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
42399 // i16. So shrink it ourselves if we can make a broadcast_load.
42400 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
42401 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
42402 assert(Subtarget.hasAVX2() && "Expected AVX2");
42403 SDValue TruncIn = Src.getOperand(0);
42404
42405 // If this is a truncate of a non extending load we can just narrow it to
42406 // use a broadcast_load.
42407 if (ISD::isNormalLoad(TruncIn.getNode())) {
42408 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
42409 // Unless its volatile or atomic.
42410 if (LN->isSimple()) {
42411 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42412 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42413 SDValue BcastLd = DAG.getMemIntrinsicNode(
42414 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42415 LN->getPointerInfo(), LN->getBaseAlign(),
42416 LN->getMemOperand()->getFlags());
42417 DCI.CombineTo(N.getNode(), BcastLd);
42418 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42419 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42420 return N; // Return N so it doesn't get rechecked!
42421 }
42422 }
42423
42424 // If this is a truncate of an i16 extload, we can directly replace it.
42425 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
42426 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
42427 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
42428 if (LN->getMemoryVT().getSizeInBits() == 16) {
42429 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42430 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42431 SDValue BcastLd =
42432               DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42433 LN->getMemoryVT(), LN->getMemOperand());
42434 DCI.CombineTo(N.getNode(), BcastLd);
42435 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42436 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42437 return N; // Return N so it doesn't get rechecked!
42438 }
42439 }
42440
42441 // If this is a truncate of load that has been shifted right, we can
42442 // offset the pointer and use a narrower load.
42443 if (TruncIn.getOpcode() == ISD::SRL &&
42444 TruncIn.getOperand(0).hasOneUse() &&
42445 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
42446 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
42447 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
42448 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
42449 // Make sure the shift amount and the load size are divisible by 16.
42450 // Don't do this if the load is volatile or atomic.
42451 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
42452 LN->isSimple()) {
42453 unsigned Offset = ShiftAmt / 8;
42454 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42455           SDValue Ptr = DAG.getMemBasePlusOffset(
42456               LN->getBasePtr(), TypeSize::getFixed(Offset), DL);
42457 SDValue Ops[] = { LN->getChain(), Ptr };
42458 SDValue BcastLd = DAG.getMemIntrinsicNode(
42459 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42460               LN->getPointerInfo().getWithOffset(Offset), LN->getBaseAlign(),
42461 LN->getMemOperand()->getFlags());
42462 DCI.CombineTo(N.getNode(), BcastLd);
42463 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42464 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42465 return N; // Return N so it doesn't get rechecked!
42466 }
42467 }
42468 }
42469
42470 // vbroadcast(vzload X) -> vbroadcast_load X
42471 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
42472       auto *LN = cast<MemSDNode>(Src);
42473 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
42474 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42475 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42476 SDValue BcastLd =
42477             DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42478 LN->getMemoryVT(), LN->getMemOperand());
42479 DCI.CombineTo(N.getNode(), BcastLd);
42480 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42481         DCI.recursivelyDeleteUnusedNodes(LN);
42482 return N; // Return N so it doesn't get rechecked!
42483 }
42484 }
42485
42486 // vbroadcast(vector load X) -> vbroadcast_load
42487 if (Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
42488 LoadSDNode *LN = cast<LoadSDNode>(Src);
42489 // Unless the load is volatile or atomic.
42490 if (LN->isSimple()) {
42491 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42492 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42493 SDValue BcastLd = DAG.getMemIntrinsicNode(
42494             X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, LN->getMemoryVT(),
42495 LN->getPointerInfo(), LN->getBaseAlign(),
42496 LN->getMemOperand()->getFlags());
42497 DCI.CombineTo(N.getNode(), BcastLd);
42498 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42499         DCI.recursivelyDeleteUnusedNodes(LN);
42500 return N; // Return N so it doesn't get rechecked!
42501 }
42502 }
42503
42504 return SDValue();
42505 }
42506 case X86ISD::VZEXT_MOVL: {
42507 SDValue N0 = N.getOperand(0);
42508
42509 // Fold (vzmovl (shift x, y)) -> (shift (vzmovl x), y)
42510 // Zeroing out the upper elements means we're just shifting a zero value.
42511 // TODO: Try harder to move vzmovl upward towards SCALAR_TO_VECTOR nodes.
42512 // TODO: Move this to canonicalizeShuffleWithOp once we add zero handling.
42513 if (N0.getOpcode() == X86ISD::VSHL || N0.getOpcode() == X86ISD::VSHLI ||
42514 N0.getOpcode() == X86ISD::VSRL || N0.getOpcode() == X86ISD::VSRLI ||
42515 N0.getOpcode() == X86ISD::VSRA || N0.getOpcode() == X86ISD::VSRAI) {
42516 if (N0.hasOneUse())
42517 return DAG.getNode(
42518 N0.getOpcode(), DL, VT,
42519 DAG.getNode(X86ISD::VZEXT_MOVL, DL, VT, N0.getOperand(0)),
42520 N0.getOperand(1));
42521 }
42522
42523 // If this a vzmovl of a full vector load, replace it with a vzload, unless
42524 // the load is volatile.
42525 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42526 auto *LN = cast<LoadSDNode>(N0);
42527 if (SDValue VZLoad =
42528 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42529 DCI.CombineTo(N.getNode(), VZLoad);
42530 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42531         DCI.recursivelyDeleteUnusedNodes(LN);
42532 return N;
42533 }
42534 }
42535
42536 // If this a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42537 // and can just use a VZEXT_LOAD.
42538 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42539 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42540 auto *LN = cast<MemSDNode>(N0);
42541 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42542 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42543 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42544 SDValue VZLoad =
42545             DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
42546 LN->getMemoryVT(), LN->getMemOperand());
42547 DCI.CombineTo(N.getNode(), VZLoad);
42548 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42549         DCI.recursivelyDeleteUnusedNodes(LN);
42550 return N;
42551 }
42552 }
42553
42554 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42555 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42556 // if the upper bits of the i64 are zero.
42557 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42558 N0.getOperand(0).hasOneUse() &&
42559 N0.getOperand(0).getValueType() == MVT::i64) {
42560 SDValue In = N0.getOperand(0);
42561 APInt Mask = APInt::getHighBitsSet(64, 32);
42562 if (DAG.MaskedValueIsZero(In, Mask)) {
42563 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42564 MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
42565 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42566 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42567 return DAG.getBitcast(VT, Movl);
42568 }
42569 }
42570
42571 // Load a scalar integer constant directly to XMM instead of transferring an
42572 // immediate value from GPR.
42573 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42574 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42575 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42576 // Create a vector constant - scalar constant followed by zeros.
42577 EVT ScalarVT = N0.getOperand(0).getValueType();
42578 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42579 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42580 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42581 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42582
42583 // Load the vector constant from constant pool.
42584 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
42585 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42586 MachinePointerInfo MPI =
42587           MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
42588 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42589 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42590                          MachineMemOperand::MOLoad);
42591 }
42592 }
42593
42594 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42595 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42596 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42597 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42598 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42599     SDValue V = peekThroughOneUseBitcasts(N0);
42600
42601 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42602 isNullConstant(V.getOperand(2))) {
42603 SDValue In = V.getOperand(1);
42604       MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
42605 In.getValueSizeInBits() /
42606 VT.getScalarSizeInBits());
42607 In = DAG.getBitcast(SubVT, In);
42608 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42609 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42610 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42611 V.getOperand(2));
42612 }
42613 }
42614
42615 return SDValue();
42616 }
42617 case X86ISD::BLENDI: {
42618 SDValue N0 = N.getOperand(0);
42619 SDValue N1 = N.getOperand(1);
42620 unsigned EltBits = VT.getScalarSizeInBits();
42621
42622 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
42623 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42624 // TODO: Handle MVT::v16i16 repeated blend mask.
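      // e.g. a v4i64 blend mask 0b0101 becomes the v8i32 mask 0b00110011.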
42625 if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42626 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42627 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42628 if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
42629 unsigned NewSize = SrcVT.getVectorNumElements();
42630 APInt BlendMask = getBLENDIBlendMask(N);
42631 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
42632 return DAG.getBitcast(
42633 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42634 N1.getOperand(0),
42635 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
42636 DL, MVT::i8)));
42637 }
42638 }
42639 // Share PSHUFB masks:
42640 // blend(pshufb(x,m1),pshufb(y,m2))
42641 // --> m3 = blend(m1,m2)
42642 // blend(pshufb(x,m3),pshufb(y,m3))
42643 if (N0.hasOneUse() && N1.hasOneUse()) {
42644 SmallVector<int> Mask, ByteMask;
42645         SmallVector<SDValue> Ops;
42646         SDValue LHS = peekThroughOneUseBitcasts(N0);
42647         SDValue RHS = peekThroughOneUseBitcasts(N1);
42648 if (LHS.getOpcode() == X86ISD::PSHUFB &&
42649 RHS.getOpcode() == X86ISD::PSHUFB &&
42650 LHS.getOperand(1) != RHS.getOperand(1) &&
42651 LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
42652 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42653 assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
42654                  RHS == peekThroughOneUseBitcasts(Ops[1]) &&
42655 "BLENDI decode mismatch");
42656 MVT ShufVT = LHS.getSimpleValueType();
42657 SDValue MaskLHS = LHS.getOperand(1);
42658 SDValue MaskRHS = RHS.getOperand(1);
42659 llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
42660           if (SDValue NewMask = combineX86ShufflesConstants(
42661 ShufVT, {MaskLHS, MaskRHS}, ByteMask,
42662 {LHS.getNode(), RHS.getNode()}, DAG, DL, Subtarget)) {
42663 SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42664 LHS.getOperand(0), NewMask);
42665 SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42666 RHS.getOperand(0), NewMask);
42667 return DAG.getNode(X86ISD::BLENDI, DL, VT,
42668 DAG.getBitcast(VT, NewLHS),
42669 DAG.getBitcast(VT, NewRHS), N.getOperand(2));
42670 }
42671 }
42672 }
42673 }
42674 return SDValue();
42675 }
42676 case X86ISD::SHUFP: {
42677 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42678 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42679 // TODO: Support types other than v4f32.
42680 if (VT == MVT::v4f32) {
42681 bool Updated = false;
42682 SmallVector<int> Mask;
42683       SmallVector<SDValue> Ops;
42684 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
42685 for (int i = 0; i != 2; ++i) {
42686 SmallVector<SDValue> SubOps;
42687 SmallVector<int> SubMask, SubScaledMask;
42688           SDValue Sub = peekThroughBitcasts(Ops[i]);
42689 // TODO: Scaling might be easier if we specify the demanded elts.
42690 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42691 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42692 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42693 int Ofs = i * 2;
42694 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42695 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42696 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42697 Updated = true;
42698 }
42699 }
42700 }
42701 if (Updated) {
42702 for (int &M : Mask)
42703 M %= 4;
42704 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42705 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42706 }
42707 }
42708 return SDValue();
42709 }
42710 case X86ISD::VPERMI: {
42711 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42712 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42713 SDValue N0 = N.getOperand(0);
42714 SDValue N1 = N.getOperand(1);
42715 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42716 if (N0.getOpcode() == ISD::BITCAST &&
42717 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42718 SDValue Src = N0.getOperand(0);
42719 EVT SrcVT = Src.getValueType();
42720 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42721 return DAG.getBitcast(VT, Res);
42722 }
42723 return SDValue();
42724 }
42725 case X86ISD::SHUF128: {
42726 // If we're permuting the upper 256-bits subvectors of a concatenation, then
42727 // see if we can peek through and access the subvector directly.
42728 if (VT.is512BitVector()) {
42729 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only
42730 // the upper subvector is used.
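      // (Mask & 0x0A) are the msbs of the two indices that read LHS and
      // (Mask & 0xA0) the msbs of the two indices that read RHS; once the
      // upper 256-bit half has been widened into the low half, those bits are
      // cleared so the indices point at it again.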
42731 SDValue LHS = peekThroughBitcasts(N->getOperand(0));
42732 SDValue RHS = peekThroughBitcasts(N->getOperand(1));
42733 uint64_t Mask = N->getConstantOperandVal(2);
42734 SmallVector<SDValue> LHSOps, RHSOps;
42735 SDValue NewLHS, NewRHS;
42736 if ((Mask & 0x0A) == 0x0A &&
42737 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
42738 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
42739 Mask &= ~0x0A;
42740 }
42741 if ((Mask & 0xA0) == 0xA0 &&
42742 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
42743 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
42744 Mask &= ~0xA0;
42745 }
42746 if (NewLHS || NewRHS)
42747 return DAG.getNode(X86ISD::SHUF128, DL, VT,
42748 DAG.getBitcast(VT, NewLHS ? NewLHS : LHS),
42749 DAG.getBitcast(VT, NewRHS ? NewRHS : RHS),
42750 DAG.getTargetConstant(Mask, DL, MVT::i8));
42751 }
42752 return SDValue();
42753 }
42754 case X86ISD::VPERM2X128: {
42755 SDValue LHS = N->getOperand(0);
42756 SDValue RHS = N->getOperand(1);
42757 unsigned Imm = N.getConstantOperandVal(2) & 255;
42758
42759 // Canonicalize unary/repeated operands to LHS.
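    // Bit 1 of each index field in the VPERM2X128 immediate picks which source
    // a 128-bit result lane comes from, so swapping the sources is undone by
    // XORing the immediate with 0x22.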
42760 if (LHS.isUndef() && !RHS.isUndef())
42761 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, RHS, LHS,
42762 DAG.getTargetConstant(Imm ^ 0x22, DL, MVT::i8));
42763 if (LHS == RHS)
42764 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, DAG.getUNDEF(VT),
42765 DAG.getTargetConstant(Imm & ~0x22, DL, MVT::i8));
42766
42767 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42768 if (LHS.getOpcode() == ISD::BITCAST &&
42769 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42770 EVT SrcVT = LHS.getOperand(0).getValueType();
42771 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42772 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42773 DAG.getBitcast(SrcVT, LHS),
42774 DAG.getBitcast(SrcVT, RHS),
42775 N->getOperand(2)));
42776 }
42777 }
42778
42779 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42780     if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42781 return Res;
42782
42783 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42784 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42785 auto FindSubVector128 = [&](unsigned Idx) {
42786 if (Idx > 3)
42787 return SDValue();
42788 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42789 SmallVector<SDValue> SubOps;
42790 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42791 return SubOps[Idx & 1];
42792 unsigned NumElts = Src.getValueType().getVectorNumElements();
42793 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42794 Src.getOperand(1).getValueSizeInBits() == 128 &&
42795 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42796 return Src.getOperand(1);
42797 }
42798 return SDValue();
42799 };
42800 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42801 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42802 MVT SubVT = VT.getHalfNumVectorElementsVT();
42803 SubLo = DAG.getBitcast(SubVT, SubLo);
42804 SubHi = DAG.getBitcast(SubVT, SubHi);
42805 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42806 }
42807 }
42808
42809 // Attempt to match VBROADCAST*128 subvector broadcast load.
42810 if (RHS.isUndef()) {
42811       SmallVector<int, 4> Mask;
42812 DecodeVPERM2X128Mask(4, Imm, Mask);
42813 if (isUndefOrInRange(Mask, 0, 4)) {
42814 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, LHS);
42815 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, LHS);
42816 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() &&
42817 X86::mayFoldLoad(LHS, Subtarget, /*AssumeSingleUse=*/true)) {
42818 MVT MemVT = VT.getHalfNumVectorElementsVT();
42819 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
42820           return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL, VT, MemVT,
42821 cast<LoadSDNode>(LHS), Ofs, DAG);
42822 }
42823 }
42824 }
42825
42826 return SDValue();
42827 }
42828 case X86ISD::PSHUFD:
42829 case X86ISD::PSHUFLW:
42830 case X86ISD::PSHUFHW: {
42831 SDValue N0 = N.getOperand(0);
42832 SDValue N1 = N.getOperand(1);
42833 if (N0->hasOneUse()) {
42834       SDValue V = peekThroughOneUseBitcasts(N0);
42835 switch (V.getOpcode()) {
42836 case X86ISD::VSHL:
42837 case X86ISD::VSRL:
42838 case X86ISD::VSRA:
42839 case X86ISD::VSHLI:
42840 case X86ISD::VSRLI:
42841 case X86ISD::VSRAI:
42842 case X86ISD::VROTLI:
42843 case X86ISD::VROTRI: {
42844 MVT InnerVT = V.getSimpleValueType();
42845 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42846 SDValue Res = DAG.getNode(Opcode, DL, VT,
42847 DAG.getBitcast(VT, V.getOperand(0)), N1);
42848 Res = DAG.getBitcast(InnerVT, Res);
42849 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42850 return DAG.getBitcast(VT, Res);
42851 }
42852 break;
42853 }
42854 }
42855 }
42856
42857 Mask = getPSHUFShuffleMask(N);
42858 assert(Mask.size() == 4);
42859 break;
42860 }
42861 case X86ISD::MOVSD:
42862 case X86ISD::MOVSH:
42863 case X86ISD::MOVSS: {
42864 SDValue N0 = N.getOperand(0);
42865 SDValue N1 = N.getOperand(1);
42866
42867 // Canonicalize scalar FPOps:
42868 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42869 // If commutable, allow OP(N1[0], N0[0]).
42870 unsigned Opcode1 = N1.getOpcode();
42871 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42872 Opcode1 == ISD::FDIV) {
42873 SDValue N10 = N1.getOperand(0);
42874 SDValue N11 = N1.getOperand(1);
42875 if (N10 == N0 ||
42876 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42877 if (N10 != N0)
42878 std::swap(N10, N11);
42879 MVT SVT = VT.getVectorElementType();
42880 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
42881 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42882 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42883 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42884 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42885 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42886 }
42887 }
42888
42889 return SDValue();
42890 }
42891 case X86ISD::INSERTPS: {
42892 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42893 SDValue Op0 = N.getOperand(0);
42894 SDValue Op1 = N.getOperand(1);
42895 unsigned InsertPSMask = N.getConstantOperandVal(2);
42896 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42897 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42898 unsigned ZeroMask = InsertPSMask & 0xF;
42899
42900 // If we zero out all elements from Op0 then we don't need to reference it.
42901 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42902 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42903 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42904
42905 // If we zero out the element from Op1 then we don't need to reference it.
42906 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42907 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42908 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42909
42910 // Attempt to merge insertps Op1 with an inner target shuffle node.
42911 SmallVector<int, 8> TargetMask1;
42912     SmallVector<SDValue, 2> Ops1;
42913 APInt KnownUndef1, KnownZero1;
42914 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42915 KnownZero1)) {
42916 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42917 // Zero/UNDEF insertion - zero out element and remove dependency.
42918 InsertPSMask |= (1u << DstIdx);
42919 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42920 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42921 }
42922 // Update insertps mask srcidx and reference the source input directly.
42923 int M = TargetMask1[SrcIdx];
42924 assert(0 <= M && M < 8 && "Shuffle index out of range");
42925 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42926 Op1 = Ops1[M < 4 ? 0 : 1];
42927 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42928 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42929 }
42930
42931 // Attempt to merge insertps Op0 with an inner target shuffle node.
42932 SmallVector<int, 8> TargetMask0;
42933     SmallVector<SDValue, 2> Ops0;
42934 APInt KnownUndef0, KnownZero0;
42935 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42936 KnownZero0)) {
42937 bool Updated = false;
42938 bool UseInput00 = false;
42939 bool UseInput01 = false;
42940 for (int i = 0; i != 4; ++i) {
42941 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42942 // No change if element is already zero or the inserted element.
42943 continue;
42944 }
42945
42946 if (KnownUndef0[i] || KnownZero0[i]) {
42947 // If the target mask is undef/zero then we must zero the element.
42948 InsertPSMask |= (1u << i);
42949 Updated = true;
42950 continue;
42951 }
42952
42953 // The input vector element must be inline.
42954 int M = TargetMask0[i];
42955 if (M != i && M != (i + 4))
42956 return SDValue();
42957
42958 // Determine which inputs of the target shuffle we're using.
42959 UseInput00 |= (0 <= M && M < 4);
42960 UseInput01 |= (4 <= M);
42961 }
42962
42963 // If we're not using both inputs of the target shuffle then use the
42964 // referenced input directly.
42965 if (UseInput00 && !UseInput01) {
42966 Updated = true;
42967 Op0 = Ops0[0];
42968 } else if (!UseInput00 && UseInput01) {
42969 Updated = true;
42970 Op0 = Ops0[1];
42971 }
42972
42973 if (Updated)
42974 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42975 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42976 }
42977
42978 // If we're inserting an element from a vbroadcast load, fold the
42979 // load into the X86insertps instruction. We need to convert the scalar
42980 // load to a vector and clear the source lane of the INSERTPS control.
42981 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42982 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42983 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42984 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42985 MemIntr->getBasePtr(),
42986 MemIntr->getMemOperand());
42987 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42988                                      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42989 Load),
42990 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42991 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42992 return Insert;
42993 }
42994 }
42995
42996 return SDValue();
42997 }
42998 case X86ISD::VPERMV: {
42999 // Combine VPERMV to VPERMV3 if the source operand can be freely split.
43000     SmallVector<int, 32> Mask;
43001 SmallVector<SDValue, 2> SrcOps, SubOps;
43002 SDValue Src = peekThroughBitcasts(N.getOperand(1));
43003 if ((Subtarget.hasVLX() || VT.is512BitVector()) &&
43004 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask) &&
43005 collectConcatOps(Src.getNode(), SubOps, DAG)) {
43006 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43007 assert(SrcOps.size() == 1 && "Unexpected shuffle ops");
43008 assert((SubOps.size() == 2 || SubOps.size() == 4) &&
43009 "Unexpected split ops");
43010 // Bail if we were permuting a widened vector.
43011 if (SubOps[1].isUndef() &&
43012 (SubOps.size() == 2 || (SubOps[2].isUndef() && SubOps[3].isUndef())))
43013 return SDValue();
43014 // Bail if any subops would have folded into the concat.
43015 if (any_of(SubOps, isShuffleFoldableLoad))
43016 return SDValue();
43017 // Concat 4x128 back to 2x256.
43018 if (SubOps.size() == 4) {
43019 SubOps[0] = concatSubVectors(SubOps[0], SubOps[1], DAG, DL);
43020 SubOps[1] = concatSubVectors(SubOps[2], SubOps[3], DAG, DL);
43021 }
43022 // Convert mask to 2 operand shuffle.
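      // After the split, element i of the original source lives in SubOps[0]
      // if i < HalfElts and in SubOps[1] otherwise; VPERMV3 indexes its second
      // operand from NumElts upwards, hence the HalfElts adjustment.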
43023 int HalfElts = NumElts / 2;
43024 for (int &M : Mask)
43025 M += M >= HalfElts ? HalfElts : 0;
43026 SDValue Lo = widenSubVector(SubOps[0], false, Subtarget, DAG, DL,
43027 VT.getSizeInBits());
43028 SDValue Hi = widenSubVector(SubOps[1], false, Subtarget, DAG, DL,
43029 VT.getSizeInBits());
43030 return lowerShuffleWithPERMV(DL, VT, Mask, DAG.getBitcast(VT, Lo),
43031 DAG.getBitcast(VT, Hi), Subtarget, DAG);
43032 }
43033 return SDValue();
43034 }
43035 case X86ISD::VPERMV3: {
43036 MVT WideVT = VT.getDoubleNumVectorElementsVT();
43037 bool CanConcat = VT.is128BitVector() ||
43038 (VT.is256BitVector() && Subtarget.useAVX512Regs());
43039     SmallVector<SDValue, 2> SrcOps;
43040     SmallVector<int, 32> Mask;
43041 if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) {
43042 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43043 SDValue V1 = peekThroughBitcasts(N.getOperand(0));
43044 SDValue V2 = peekThroughBitcasts(N.getOperand(2));
43045 // Canonicalize to VPERMV if both sources are the same.
43046 if (V1 == V2) {
43047 for (int &M : Mask)
43048 M = (M < 0 ? M : (M & (NumElts - 1)));
43049 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(0),
43050 DAG.getUNDEF(VT), Subtarget, DAG);
43051 }
43052 // If sources are half width, then concat and use VPERMV with adjusted
43053 // mask.
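      // In the VPERMV3 mask the second source starts at index NumElts, but in
      // the concatenated half-width source it starts at NumElts / 2, hence the
      // M - NumElts / 2 adjustment below.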
43054 SDValue Ops[2];
43055 MVT HalfVT = VT.getHalfNumVectorElementsVT();
43056 if (sd_match(V1,
43057                    m_InsertSubvector(m_Undef(), m_Value(Ops[0]), m_Zero())) &&
43058 sd_match(V2,
43059                    m_InsertSubvector(m_Undef(), m_Value(Ops[1]), m_Zero())) &&
43060 Ops[0].getValueType() == HalfVT && Ops[1].getValueType() == HalfVT) {
43061 if (SDValue ConcatSrc =
43062 combineConcatVectorOps(DL, VT, Ops, DAG, Subtarget)) {
43063 for (int &M : Mask)
43064 M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
43065 return lowerShuffleWithPERMV(DL, VT, Mask, ConcatSrc,
43066 DAG.getUNDEF(VT), Subtarget, DAG);
43067 }
43068 }
43069 // Commute foldable source to the RHS.
43070 if (isShuffleFoldableLoad(N.getOperand(0)) &&
43071 !isShuffleFoldableLoad(N.getOperand(2))) {
43072         ShuffleVectorSDNode::commuteMask(Mask);
43073 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(2),
43074 N.getOperand(0), Subtarget, DAG);
43075 }
43076 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43077 // freely concatenated, with a commuted shuffle mask.
43078 if (CanConcat) {
43079 if (SDValue ConcatSrc = combineConcatVectorOps(
43080 DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG,
43081 Subtarget)) {
43082           ShuffleVectorSDNode::commuteMask(Mask);
43083 Mask.append(NumElts, SM_SentinelUndef);
43084 SDValue Perm =
43085 lowerShuffleWithPERMV(DL, WideVT, Mask, ConcatSrc,
43086 DAG.getUNDEF(WideVT), Subtarget, DAG);
43087 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43088 DAG.getVectorIdxConstant(0, DL));
43089 }
43090 }
43091 }
43092 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43093 // freely concatenated.
43094 if (CanConcat) {
43095 if (SDValue ConcatSrc = combineConcatVectorOps(
43096 DL, WideVT, {N.getOperand(0), N.getOperand(2)}, DAG, Subtarget)) {
43097 SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
43098 DL, WideVT.getSizeInBits());
43099 SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
43100 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43101 DAG.getVectorIdxConstant(0, DL));
43102 }
43103 }
43104 return SDValue();
43105 }
43106 default:
43107 return SDValue();
43108 }
43109
43110 // Nuke no-op shuffles that show up after combining.
43111 if (isNoopShuffleMask(Mask))
43112 return N.getOperand(0);
43113
43114 // Look for simplifications involving one or two shuffle instructions.
43115 SDValue V = N.getOperand(0);
43116 switch (N.getOpcode()) {
43117 default:
43118 break;
43119 case X86ISD::PSHUFLW:
43120 case X86ISD::PSHUFHW:
43121 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
43122
43123 // See if this reduces to a PSHUFD which is no more expensive and can
43124 // combine with more operations. Note that it has to at least flip the
43125 // dwords as otherwise it would have been removed as a no-op.
43126 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
43127 int DMask[] = {0, 1, 2, 3};
43128 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
43129 DMask[DOffset + 0] = DOffset + 1;
43130 DMask[DOffset + 1] = DOffset + 0;
43131 MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
43132 V = DAG.getBitcast(DVT, V);
43133 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
43134 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
43135 return DAG.getBitcast(VT, V);
43136 }
43137
43138 // Look for shuffle patterns which can be implemented as a single unpack.
43139 // FIXME: This doesn't handle the location of the PSHUFD generically, and
43140 // only works when we have a PSHUFD followed by two half-shuffles.
43141 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
43142 (V.getOpcode() == X86ISD::PSHUFLW ||
43143 V.getOpcode() == X86ISD::PSHUFHW) &&
43144 V.getOpcode() != N.getOpcode() &&
43145 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
43146 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
43147 if (D.getOpcode() == X86ISD::PSHUFD) {
43148         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
43149         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
43150 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43151 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43152 int WordMask[8];
43153 for (int i = 0; i < 4; ++i) {
43154 WordMask[i + NOffset] = Mask[i] + NOffset;
43155 WordMask[i + VOffset] = VMask[i] + VOffset;
43156 }
43157 // Map the word mask through the DWord mask.
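        // Word w of the PSHUFD result is word (2 * DMask[w / 2] + w % 2) of
        // its input, so compose that with the word-level mask.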
43158 int MappedMask[8];
43159 for (int i = 0; i < 8; ++i)
43160 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
43161 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
43162 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
43163 // We can replace all three shuffles with an unpack.
43164 V = DAG.getBitcast(VT, D.getOperand(0));
43165 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
43166                                         : X86ISD::UNPCKH,
43167 DL, VT, V, V);
43168 }
43169 }
43170 }
43171
43172 break;
43173
43174 case X86ISD::PSHUFD:
43175 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
43176 return NewN;
43177
43178 break;
43179 }
43180
43181 return SDValue();
43182}
43183
43184/// Checks if the shuffle mask takes subsequent elements
43185/// alternately from two vectors.
43186/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
43187static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
43188
43189 int ParitySrc[2] = {-1, -1};
43190 unsigned Size = Mask.size();
43191 for (unsigned i = 0; i != Size; ++i) {
43192 int M = Mask[i];
43193 if (M < 0)
43194 continue;
43195
43196 // Make sure we are using the matching element from the input.
43197 if ((M % Size) != i)
43198 return false;
43199
43200 // Make sure we use the same input for all elements of the same parity.
43201 int Src = M / Size;
43202 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
43203 return false;
43204 ParitySrc[i % 2] = Src;
43205 }
43206
43207 // Make sure each input is used.
43208 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
43209 return false;
43210
43211 Op0Even = ParitySrc[0] == 0;
43212 return true;
43213}
43214
43215/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
43216/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
43217/// are written to the parameters \p Opnd0 and \p Opnd1.
43218///
43219/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
43220/// so it is easier to generically match. We also insert dummy vector shuffle
43221/// nodes for the operands which explicitly discard the lanes which are unused
43222/// by this operation to try to flow through the rest of the combiner
43223/// the fact that they're unused.
43224static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
43225 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
43226 bool &IsSubAdd, bool &HasAllowContract) {
43227
43228 EVT VT = N->getValueType(0);
43229 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43230 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
43231       !VT.getSimpleVT().isFloatingPoint())
43232 return false;
43233
43234 // We only handle target-independent shuffles.
43235 // FIXME: It would be easy and harmless to use the target shuffle mask
43236 // extraction tool to support more.
43237 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43238 return false;
43239
43240 SDValue V1 = N->getOperand(0);
43241 SDValue V2 = N->getOperand(1);
43242
43243 // Make sure we have an FADD and an FSUB.
43244 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
43245 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
43246 V1.getOpcode() == V2.getOpcode())
43247 return false;
43248
43249 // If there are other uses of these operations we can't fold them.
43250 if (!V1->hasOneUse() || !V2->hasOneUse())
43251 return false;
43252
43253 // Ensure that both operations have the same operands. Note that we can
43254 // commute the FADD operands.
43255 SDValue LHS, RHS;
43256 if (V1.getOpcode() == ISD::FSUB) {
43257 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
43258 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
43259 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
43260 return false;
43261 } else {
43262 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
43263 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
43264 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
43265 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
43266 return false;
43267 }
43268
43269 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43270 bool Op0Even;
43271 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43272 return false;
43273
43274 // It's a subadd if the vector in the even parity is an FADD.
43275 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
43276 : V2->getOpcode() == ISD::FADD;
43277 HasAllowContract =
43278       V1->getFlags().hasAllowContract() && V2->getFlags().hasAllowContract();
43279
43280 Opnd0 = LHS;
43281 Opnd1 = RHS;
43282 return true;
43283}
43284
43285/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
43286 static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL,
43287 const X86Subtarget &Subtarget,
43288 SelectionDAG &DAG) {
43289 // We only handle target-independent shuffles.
43290 // FIXME: It would be easy and harmless to use the target shuffle mask
43291 // extraction tool to support more.
43292 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43293 return SDValue();
43294
43295 MVT VT = N->getSimpleValueType(0);
43296 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43297 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
43298 return SDValue();
43299
43300 // We're trying to match (shuffle (fma a, b, c), (X86ISD::FMSUB a, b, c)).
43301 SDValue Op0 = N->getOperand(0);
43302 SDValue Op1 = N->getOperand(1);
43303 SDValue FMAdd = Op0, FMSub = Op1;
43304 if (FMSub.getOpcode() != X86ISD::FMSUB)
43305 std::swap(FMAdd, FMSub);
43306
43307 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
43308 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
43309 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
43310 FMAdd.getOperand(2) != FMSub.getOperand(2))
43311 return SDValue();
43312
43313 // Check for correct shuffle mask.
43314 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43315 bool Op0Even;
43316 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43317 return SDValue();
43318
43319 // FMAddSub takes zeroth operand from FMSub node.
43320 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
43321 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43322 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
43323 FMAdd.getOperand(2));
43324}
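// Illustrative shape of the fold above (operands chosen for exposition):
//   t0 = fma a, b, c
//   t1 = X86ISD::FMSUB a, b, c
//   shuffle t0, t1, <0, 5, 2, 7>   // even lanes from the FMA
// becomes (X86ISD::FMSUBADD a, b, c); if the even lanes come from the FMSUB
// instead, the result is (X86ISD::FMADDSUB a, b, c).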
43325
43326/// Try to combine a shuffle into a target-specific add-sub or
43327/// mul-add-sub node.
43329 const X86Subtarget &Subtarget,
43330 SelectionDAG &DAG) {
43331 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
43332 return V;
43333
43334 SDValue Opnd0, Opnd1;
43335 bool IsSubAdd;
43336 bool HasAllowContract;
43337 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd,
43338 HasAllowContract))
43339 return SDValue();
43340
43341 MVT VT = N->getSimpleValueType(0);
43342
43343 // Try to generate X86ISD::FMADDSUB node here.
43344 SDValue Opnd2;
43345 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2,
43346 HasAllowContract)) {
43347 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43348 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
43349 }
43350
43351 if (IsSubAdd)
43352 return SDValue();
43353
43354 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
43355 // the ADDSUB idiom has been successfully recognized. There are no known
43356 // X86 targets with 512-bit ADDSUB instructions!
43357 if (VT.is512BitVector())
43358 return SDValue();
43359
43360 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
43361 // the ADDSUB idiom has been successfully recognized. There are no known
43362 // X86 targets with FP16 ADDSUB instructions!
43363 if (VT.getVectorElementType() == MVT::f16)
43364 return SDValue();
43365
43366 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
43367}
43368
43369/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
43370/// low half of each source vector and does not set any high half elements in
43371/// the destination vector, narrow the shuffle to half its original size.
43373 EVT VT = Shuf->getValueType(0);
43374 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
43375 return SDValue();
43376 if (!VT.is256BitVector() && !VT.is512BitVector())
43377 return SDValue();
43378
43379 // See if we can ignore all of the high elements of the shuffle.
43380 ArrayRef<int> Mask = Shuf->getMask();
43381 if (!isUndefUpperHalf(Mask))
43382 return SDValue();
43383
43384 // Check if the shuffle mask accesses only the low half of each input vector
43385 // (half-index output is 0 or 2).
43386 int HalfIdx1, HalfIdx2;
43387 SmallVector<int, 8> HalfMask(Mask.size() / 2);
43388 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
43389 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
43390 return SDValue();
43391
43392 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
43393 // The trick is knowing that all of the insert/extract are actually free
43394 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
43395 // of narrow inputs into a narrow output, and that is always cheaper than
43396 // the wide shuffle that we started with.
43397 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
43398 Shuf->getOperand(1), HalfMask, HalfIdx1,
43399 HalfIdx2, false, DAG, /*UseConcat*/ true);
43400}
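// Example of the narrowing above (types and mask are illustrative): a v8f32
// shuffle of A, B with mask <0, 1, 8, 9, u, u, u, u> only reads the low v4f32
// halves of both inputs and leaves the upper output half undef, so it can be
// rebuilt as
//   concat (shuffle (extract A, 0), (extract B, 0), <0, 1, 4, 5>), undef
// where the extracts and the concat are free subregister operations.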
43401
43404 const X86Subtarget &Subtarget) {
43405 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
43406 if (SDValue V = narrowShuffle(Shuf, DAG))
43407 return V;
43408
43409 // If we have legalized the vector types, look for blends of FADD and FSUB
43410 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
43411 SDLoc dl(N);
43412 EVT VT = N->getValueType(0);
43413 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43414 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
43415 if (SDValue AddSub =
43416 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
43417 return AddSub;
43418
43419 // Attempt to combine into a vector load/broadcast.
43421 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
43422 return LD;
43423
43424 if (isTargetShuffle(N->getOpcode())) {
43425 SDValue Op(N, 0);
43426 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
43427 return Shuffle;
43428
43429 // Try recursively combining arbitrary sequences of x86 shuffle
43430 // instructions into higher-order shuffles. We do this after combining
43431 // specific PSHUF instruction sequences into their minimal form so that we
43432 // can evaluate how many specialized shuffle instructions are involved in
43433 // a particular chain.
43434 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43435 return Res;
43436
43437 // Simplify source operands based on shuffle mask.
43438 // TODO - merge this into combineX86ShufflesRecursively.
43439 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
43440 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
43441 return SDValue(N, 0);
43442
43443 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
43444 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
43445 // Perform this after other shuffle combines to allow inner shuffles to be
43446 // combined away first.
43447 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
43448 return BinOp;
43449 }
43450
43451 return SDValue();
43452}
43453
43454// Simplify variable target shuffle masks based on the demanded elements.
43455// TODO: Handle DemandedBits in mask indices as well?
43457 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
43458 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
43459 // If we're demanding all elements, don't bother trying to simplify the mask.
43460 unsigned NumElts = DemandedElts.getBitWidth();
43461 if (DemandedElts.isAllOnes())
43462 return false;
43463
43464 SDValue Mask = Op.getOperand(MaskIndex);
43465 if (!Mask.hasOneUse())
43466 return false;
43467
43468 // Attempt to generically simplify the variable shuffle mask.
43469 APInt MaskUndef, MaskZero;
43470 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
43471 Depth + 1))
43472 return true;
43473
43474 // Attempt to extract+simplify a (constant pool load) shuffle mask.
43475 // TODO: Support other types from getTargetShuffleMaskIndices?
43477 EVT BCVT = BC.getValueType();
43478 auto *Load = dyn_cast<LoadSDNode>(BC);
43479 if (!Load || !Load->getBasePtr().hasOneUse())
43480 return false;
43481
43482 const Constant *C = getTargetConstantFromNode(Load);
43483 if (!C)
43484 return false;
43485
43486 Type *CTy = C->getType();
43487 if (!CTy->isVectorTy() ||
43488 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
43489 return false;
43490
43491 // Handle scaling for i64 elements on 32-bit targets.
43492 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
43493 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
43494 return false;
43495 unsigned Scale = NumCstElts / NumElts;
43496
43497 // Simplify mask if we have an undemanded element that is not undef.
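  // e.g. (illustrative) if only the low half of a PSHUFB result is demanded,
  // the upper half of its constant-pool shuffle mask can be replaced with
  // undef, which may allow further simplification of the constant.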
43498 bool Simplified = false;
43499 SmallVector<Constant *, 32> ConstVecOps;
43500 for (unsigned i = 0; i != NumCstElts; ++i) {
43501 Constant *Elt = C->getAggregateElement(i);
43502 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
43503 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
43504 Simplified = true;
43505 continue;
43506 }
43507 ConstVecOps.push_back(Elt);
43508 }
43509 if (!Simplified)
43510 return false;
43511
43512 // Generate new constant pool entry + legalize immediately for the load.
43513 SDLoc DL(Op);
43514 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
43515 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
43516 SDValue NewMask = TLO.DAG.getLoad(
43517 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
43519 Load->getAlign());
43520 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
43521}
43522
43524 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
43525 TargetLoweringOpt &TLO, unsigned Depth) const {
43526 int NumElts = DemandedElts.getBitWidth();
43527 unsigned Opc = Op.getOpcode();
43528 EVT VT = Op.getValueType();
43529
43530 // Handle special case opcodes.
43531 switch (Opc) {
43532 case X86ISD::PMULDQ:
43533 case X86ISD::PMULUDQ: {
43534 APInt LHSUndef, LHSZero;
43535 APInt RHSUndef, RHSZero;
43536 SDValue LHS = Op.getOperand(0);
43537 SDValue RHS = Op.getOperand(1);
43538 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43539 Depth + 1))
43540 return true;
43541 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43542 Depth + 1))
43543 return true;
43544 // Multiply by zero.
43545 KnownZero = LHSZero | RHSZero;
43546 break;
43547 }
43548 case X86ISD::VPMADDUBSW:
43549 case X86ISD::VPMADDWD: {
43550 APInt LHSUndef, LHSZero;
43551 APInt RHSUndef, RHSZero;
43552 SDValue LHS = Op.getOperand(0);
43553 SDValue RHS = Op.getOperand(1);
43554 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
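    // e.g. (illustrative) for a v4i32 VPMADDWD with DemandedElts = 0b0001,
    // each i32 result reads a pair of adjacent i16 source elements, so the
    // scaled demand on the v8i16 inputs is 0b00000011.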
43555
43556 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
43557 Depth + 1))
43558 return true;
43559 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
43560 Depth + 1))
43561 return true;
43562
43563 // TODO: Multiply by zero.
43564
43565 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
43566 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
43567 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
43568 Depth + 1))
43569 return true;
43570 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
43571 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
43572 Depth + 1))
43573 return true;
43574 break;
43575 }
43576 case X86ISD::PSADBW: {
43577 SDValue LHS = Op.getOperand(0);
43578 SDValue RHS = Op.getOperand(1);
43579 assert(VT.getScalarType() == MVT::i64 &&
43580 LHS.getValueType() == RHS.getValueType() &&
43581 LHS.getValueType().getScalarType() == MVT::i8 &&
43582 "Unexpected PSADBW types");
43583
43584 // Aggressively peek through ops to get at the demanded elts.
43585 if (!DemandedElts.isAllOnes()) {
43586 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
43587 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
43589 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43591 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43592 if (NewLHS || NewRHS) {
43593 NewLHS = NewLHS ? NewLHS : LHS;
43594 NewRHS = NewRHS ? NewRHS : RHS;
43595 return TLO.CombineTo(
43596 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43597 }
43598 }
43599 break;
43600 }
43601 case X86ISD::VSHL:
43602 case X86ISD::VSRL:
43603 case X86ISD::VSRA: {
43604 // We only need the bottom 64-bits of the (128-bit) shift amount.
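    // e.g. (illustrative) for a v4i32 shift amount this means only elements
    // {0,1} matter, and for v2i64 only element 0 does - which is what AmtElts
    // below encodes.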
43605 SDValue Amt = Op.getOperand(1);
43606 MVT AmtVT = Amt.getSimpleValueType();
43607 assert(AmtVT.is128BitVector() && "Unexpected value type");
43608
43609 // If every use of the shift amount is as an SSE vector shift amount then we
43610 // know that only its bottom 64-bits are ever used.
43611 bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) {
43612 unsigned UseOpc = Use->getOpcode();
43613 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43614 UseOpc == X86ISD::VSRA) &&
43615 Use->getOperand(0) != Amt;
43616 });
43617
43618 APInt AmtUndef, AmtZero;
43619 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43620 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43621 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43622 Depth + 1, AssumeSingleUse))
43623 return true;
43624 [[fallthrough]];
43625 }
43626 case X86ISD::VSHLI:
43627 case X86ISD::VSRLI:
43628 case X86ISD::VSRAI: {
43629 SDValue Src = Op.getOperand(0);
43630 APInt SrcUndef;
43631 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43632 Depth + 1))
43633 return true;
43634
43635 // Fold shift(0,x) -> 0
43636 if (DemandedElts.isSubsetOf(KnownZero))
43637 return TLO.CombineTo(
43638 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43639
43640 // Aggressively peek through ops to get at the demanded elts.
43641 if (!DemandedElts.isAllOnes())
43643 Src, DemandedElts, TLO.DAG, Depth + 1))
43644 return TLO.CombineTo(
43645 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43646 break;
43647 }
43648 case X86ISD::VPSHA:
43649 case X86ISD::VPSHL:
43650 case X86ISD::VSHLV:
43651 case X86ISD::VSRLV:
43652 case X86ISD::VSRAV: {
43653 APInt LHSUndef, LHSZero;
43654 APInt RHSUndef, RHSZero;
43655 SDValue LHS = Op.getOperand(0);
43656 SDValue RHS = Op.getOperand(1);
43657 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43658 Depth + 1))
43659 return true;
43660
43661 // Fold shift(0,x) -> 0
43662 if (DemandedElts.isSubsetOf(LHSZero))
43663 return TLO.CombineTo(
43664 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43665
43666 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43667 Depth + 1))
43668 return true;
43669
43670 KnownZero = LHSZero;
43671 break;
43672 }
43673 case X86ISD::CMPM:
43674 case X86ISD::CMPP: {
43675 // Scalarize packed fp comparison if we only require element 0.
43676 if (DemandedElts == 1) {
43677 SDLoc dl(Op);
43678 MVT VT = Op.getSimpleValueType();
43679 MVT OpSVT = Op.getOperand(0).getSimpleValueType().getScalarType();
43680 SDValue LHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(0), 0);
43681 SDValue RHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(1), 0);
43682 SDValue CC = Op.getOperand(2);
43683 if (Opc == X86ISD::CMPM) {
43684 SDValue Cmp =
43685 TLO.DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, CC);
43686 return TLO.CombineTo(
43687 Op, TLO.DAG.getInsertSubvector(dl, TLO.DAG.getUNDEF(VT), Cmp, 0));
43688 }
43689 SDValue Cmp = TLO.DAG.getNode(X86ISD::FSETCC, dl, OpSVT, LHS, RHS, CC);
43690 return TLO.CombineTo(Op,
43691 TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Cmp));
43692 }
43693 break;
43694 }
43695 case X86ISD::PCMPEQ:
43696 case X86ISD::PCMPGT: {
43697 APInt LHSUndef, LHSZero;
43698 APInt RHSUndef, RHSZero;
43699 SDValue LHS = Op.getOperand(0);
43700 SDValue RHS = Op.getOperand(1);
43701 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43702 Depth + 1))
43703 return true;
43704 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43705 Depth + 1))
43706 return true;
43707 break;
43708 }
43709 case X86ISD::KSHIFTL: {
43710 SDValue Src = Op.getOperand(0);
43711 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43712 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43713 unsigned ShiftAmt = Amt->getZExtValue();
43714
43715 if (ShiftAmt == 0)
43716 return TLO.CombineTo(Op, Src);
43717
43718 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43719 // single shift. We can do this if the bottom bits (which are shifted
43720 // out) are never demanded.
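    // e.g. (illustrative) (KSHIFTL (KSHIFTR x, 2), 3) becomes (KSHIFTL x, 1)
    // when the low 3 mask bits are not demanded; if ShAmt is smaller than C1
    // the roles swap and a KSHIFTR by C1 - ShAmt is emitted instead.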
43721 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43722 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43723 unsigned C1 = Src.getConstantOperandVal(1);
43724 unsigned NewOpc = X86ISD::KSHIFTL;
43725 int Diff = ShiftAmt - C1;
43726 if (Diff < 0) {
43727 Diff = -Diff;
43728 NewOpc = X86ISD::KSHIFTR;
43729 }
43730
43731 SDLoc dl(Op);
43732 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43733 return TLO.CombineTo(
43734 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43735 }
43736 }
43737
43738 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43739 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43740 Depth + 1))
43741 return true;
43742
43743 KnownUndef <<= ShiftAmt;
43744 KnownZero <<= ShiftAmt;
43745 KnownZero.setLowBits(ShiftAmt);
43746 break;
43747 }
43748 case X86ISD::KSHIFTR: {
43749 SDValue Src = Op.getOperand(0);
43750 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43751 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43752 unsigned ShiftAmt = Amt->getZExtValue();
43753
43754 if (ShiftAmt == 0)
43755 return TLO.CombineTo(Op, Src);
43756
43757 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43758 // single shift. We can do this if the top bits (which are shifted
43759 // out) are never demanded.
43760 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43761 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43762 unsigned C1 = Src.getConstantOperandVal(1);
43763 unsigned NewOpc = X86ISD::KSHIFTR;
43764 int Diff = ShiftAmt - C1;
43765 if (Diff < 0) {
43766 Diff = -Diff;
43767 NewOpc = X86ISD::KSHIFTL;
43768 }
43769
43770 SDLoc dl(Op);
43771 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43772 return TLO.CombineTo(
43773 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43774 }
43775 }
43776
43777 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43778 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43779 Depth + 1))
43780 return true;
43781
43782 KnownUndef.lshrInPlace(ShiftAmt);
43783 KnownZero.lshrInPlace(ShiftAmt);
43784 KnownZero.setHighBits(ShiftAmt);
43785 break;
43786 }
43787 case X86ISD::ANDNP: {
43788 // ANDNP = (~LHS & RHS);
43789 SDValue LHS = Op.getOperand(0);
43790 SDValue RHS = Op.getOperand(1);
43791
43792 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43793 APInt UndefElts;
43794 SmallVector<APInt> EltBits;
43795 int NumElts = VT.getVectorNumElements();
43796 int EltSizeInBits = VT.getScalarSizeInBits();
43797 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43798 APInt OpElts = DemandedElts;
43799 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43800 EltBits)) {
43801 OpBits.clearAllBits();
43802 OpElts.clearAllBits();
43803 for (int I = 0; I != NumElts; ++I) {
43804 if (!DemandedElts[I])
43805 continue;
43806 if (UndefElts[I]) {
43807 // We can't assume an undef src element gives an undef dst - the
43808 // other src might be zero.
43809 OpBits.setAllBits();
43810 OpElts.setBit(I);
43811 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43812 (!Invert && !EltBits[I].isZero())) {
43813 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43814 OpElts.setBit(I);
43815 }
43816 }
43817 }
43818 return std::make_pair(OpBits, OpElts);
43819 };
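    // The lambda above drops demand on elements that a constant peer operand
    // already forces to zero, e.g. (illustrative) for (ANDNP L, R) an element
    // of R known to be 0 makes the matching element of L irrelevant
    // (~L & 0 == 0), and an all-ones element of L likewise makes the matching
    // element of R irrelevant.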
43820 APInt BitsLHS, EltsLHS;
43821 APInt BitsRHS, EltsRHS;
43822 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43823 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43824
43825 APInt LHSUndef, LHSZero;
43826 APInt RHSUndef, RHSZero;
43827 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43828 Depth + 1))
43829 return true;
43830 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43831 Depth + 1))
43832 return true;
43833
43834 if (!DemandedElts.isAllOnes()) {
43835 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43836 TLO.DAG, Depth + 1);
43837 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43838 TLO.DAG, Depth + 1);
43839 if (NewLHS || NewRHS) {
43840 NewLHS = NewLHS ? NewLHS : LHS;
43841 NewRHS = NewRHS ? NewRHS : RHS;
43842 return TLO.CombineTo(
43843 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43844 }
43845 }
43846 break;
43847 }
43848 case X86ISD::CVTSI2P:
43849 case X86ISD::CVTUI2P:
43850 case X86ISD::CVTPH2PS:
43851 case X86ISD::CVTPS2PH: {
43852 SDValue Src = Op.getOperand(0);
43853 EVT SrcVT = Src.getValueType();
43854 APInt SrcUndef, SrcZero;
43855 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43856 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43857 Depth + 1))
43858 return true;
43859 break;
43860 }
43861 case X86ISD::PACKSS:
43862 case X86ISD::PACKUS: {
43863 SDValue N0 = Op.getOperand(0);
43864 SDValue N1 = Op.getOperand(1);
43865
43866 APInt DemandedLHS, DemandedRHS;
43867 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43868
43869 APInt LHSUndef, LHSZero;
43870 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43871 Depth + 1))
43872 return true;
43873 APInt RHSUndef, RHSZero;
43874 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43875 Depth + 1))
43876 return true;
43877
43878 // TODO - pass on known zero/undef.
43879
43880 // Aggressively peek through ops to get at the demanded elts.
43881 // TODO - we should do this for all target/faux shuffles ops.
43882 if (!DemandedElts.isAllOnes()) {
43883 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43884 TLO.DAG, Depth + 1);
43885 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43886 TLO.DAG, Depth + 1);
43887 if (NewN0 || NewN1) {
43888 NewN0 = NewN0 ? NewN0 : N0;
43889 NewN1 = NewN1 ? NewN1 : N1;
43890 return TLO.CombineTo(Op,
43891 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43892 }
43893 }
43894 break;
43895 }
43896 case X86ISD::HADD:
43897 case X86ISD::HSUB:
43898 case X86ISD::FHADD:
43899 case X86ISD::FHSUB: {
43900 SDValue N0 = Op.getOperand(0);
43901 SDValue N1 = Op.getOperand(1);
43902
43903 APInt DemandedLHS, DemandedRHS;
43904 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43905
43906 APInt LHSUndef, LHSZero;
43907 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43908 Depth + 1))
43909 return true;
43910 APInt RHSUndef, RHSZero;
43911 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43912 Depth + 1))
43913 return true;
43914
43915 // TODO - pass on known zero/undef.
43916
43917 // Aggressively peek through ops to get at the demanded elts.
43918 // TODO: Handle repeated operands.
43919 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43920 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43921 TLO.DAG, Depth + 1);
43922 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43923 TLO.DAG, Depth + 1);
43924 if (NewN0 || NewN1) {
43925 NewN0 = NewN0 ? NewN0 : N0;
43926 NewN1 = NewN1 ? NewN1 : N1;
43927 return TLO.CombineTo(Op,
43928 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43929 }
43930 }
43931 break;
43932 }
43933 case X86ISD::VTRUNC:
43934 case X86ISD::VTRUNCS:
43935 case X86ISD::VTRUNCUS: {
43936 SDValue Src = Op.getOperand(0);
43937 MVT SrcVT = Src.getSimpleValueType();
43938 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43939 APInt SrcUndef, SrcZero;
43940 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43941 Depth + 1))
43942 return true;
43943 KnownZero = SrcZero.zextOrTrunc(NumElts);
43944 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43945 break;
43946 }
43947 case X86ISD::BLENDI: {
43948 SmallVector<int, 16> BlendMask;
43949 DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
43951 VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
43952 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
43953 return TLO.CombineTo(Op, R);
43954 break;
43955 }
43956 case X86ISD::BLENDV: {
43957 APInt SelUndef, SelZero;
43958 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43959 SelZero, TLO, Depth + 1))
43960 return true;
43961
43962 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43963 APInt LHSUndef, LHSZero;
43964 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43965 LHSZero, TLO, Depth + 1))
43966 return true;
43967
43968 APInt RHSUndef, RHSZero;
43969 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43970 RHSZero, TLO, Depth + 1))
43971 return true;
43972
43973 KnownZero = LHSZero & RHSZero;
43974 KnownUndef = LHSUndef & RHSUndef;
43975 break;
43976 }
43977 case X86ISD::VZEXT_MOVL: {
43978 // If upper demanded elements are already zero then we have nothing to do.
43979 SDValue Src = Op.getOperand(0);
43980 APInt DemandedUpperElts = DemandedElts;
43981 DemandedUpperElts.clearLowBits(1);
43982 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43983 return TLO.CombineTo(Op, Src);
43984 break;
43985 }
43986 case X86ISD::VZEXT_LOAD: {
43987 // If upper demanded elements are not demanded then simplify to a
43988 // scalar_to_vector(load()).
43990 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
43991 SDLoc DL(Op);
43992 auto *Mem = cast<MemSDNode>(Op);
43993 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
43994 Mem->getMemOperand());
43995 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
43996 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
43997 }
43998 break;
43999 }
44000 case X86ISD::VBROADCAST: {
44001 SDValue Src = Op.getOperand(0);
44002 MVT SrcVT = Src.getSimpleValueType();
44003 // Don't bother broadcasting if we just need the 0'th element.
44004 if (DemandedElts == 1) {
44005 if (!SrcVT.isVector())
44006 Src = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op), VT, Src);
44007 else if (Src.getValueType() != VT)
44008 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
44009 SDLoc(Op));
44010 return TLO.CombineTo(Op, Src);
44011 }
44012 if (!SrcVT.isVector())
44013 break;
44014 APInt SrcUndef, SrcZero;
44015 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
44016 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
44017 Depth + 1))
44018 return true;
44019 // Aggressively peek through src to get at the demanded elt.
44020 // TODO - we should do this for all target/faux shuffles ops.
44022 Src, SrcElts, TLO.DAG, Depth + 1))
44023 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44024 break;
44025 }
44026 case X86ISD::VPERMV:
44027 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
44028 Depth))
44029 return true;
44030 break;
44031 case X86ISD::PSHUFB:
44032 case X86ISD::VPERMV3:
44033 case X86ISD::VPERMILPV:
44034 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
44035 Depth))
44036 return true;
44037 break;
44038 case X86ISD::VPPERM:
44039 case X86ISD::VPERMIL2:
44040 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
44041 Depth))
44042 return true;
44043 break;
44044 }
44045
44046 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
44047 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
44048 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
44049 if ((VT.is256BitVector() || VT.is512BitVector()) &&
44050 DemandedElts.lshr(NumElts / 2) == 0) {
44051 unsigned SizeInBits = VT.getSizeInBits();
44052 unsigned ExtSizeInBits = SizeInBits / 2;
44053
44054 // See if 512-bit ops only use the bottom 128-bits.
44055 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
44056 ExtSizeInBits = SizeInBits / 4;
44057
44058 switch (Opc) {
44059 // Scalar broadcast.
44060 case X86ISD::VBROADCAST: {
44061 SDLoc DL(Op);
44062 SDValue Src = Op.getOperand(0);
44063 if (Src.getValueSizeInBits() > ExtSizeInBits)
44064 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
44065 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44066 ExtSizeInBits / VT.getScalarSizeInBits());
44067 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
44068 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44069 TLO.DAG, DL, ExtSizeInBits));
44070 }
44072 SDLoc DL(Op);
44073 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44074 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44075 ExtSizeInBits / VT.getScalarSizeInBits());
44076 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
44077 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
44078 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
44079 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
44080 MemIntr->getMemOperand());
44082 Bcst.getValue(1));
44083 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44084 TLO.DAG, DL, ExtSizeInBits));
44085 }
44086 // Subvector broadcast.
44088 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44089 EVT MemVT = MemIntr->getMemoryVT();
44090 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
44091 SDLoc DL(Op);
44092 SDValue Ld =
44093 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
44094 MemIntr->getBasePtr(), MemIntr->getMemOperand());
44096 Ld.getValue(1));
44097 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
44098 TLO.DAG, DL, ExtSizeInBits));
44099 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
44100 SDLoc DL(Op);
44101 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44102 ExtSizeInBits / VT.getScalarSizeInBits());
44103 if (SDValue BcstLd =
44104 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
44105 return TLO.CombineTo(Op,
44106 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
44107 TLO.DAG, DL, ExtSizeInBits));
44108 }
44109 break;
44110 }
44111 // Byte shifts by immediate.
44112 case X86ISD::VSHLDQ:
44113 case X86ISD::VSRLDQ:
44114 // Shift by uniform.
44115 case X86ISD::VSHL:
44116 case X86ISD::VSRL:
44117 case X86ISD::VSRA:
44118 // Shift by immediate.
44119 case X86ISD::VSHLI:
44120 case X86ISD::VSRLI:
44121 case X86ISD::VSRAI: {
44122 SDLoc DL(Op);
44123 SDValue Ext0 =
44124 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
44125 SDValue ExtOp =
44126 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
44127 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44128 SDValue Insert =
44129 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44130 return TLO.CombineTo(Op, Insert);
44131 }
44132 case X86ISD::VPERMI: {
44133 // Simplify 256-bit PERMPD/PERMQ to extract_subvector.
44134 // TODO: This should be done in shuffle combining.
44135 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
44137 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
44138 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
44139 SDLoc DL(Op);
44140 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
44141 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44142 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
44143 return TLO.CombineTo(Op, Insert);
44144 }
44145 }
44146 // Simplify 512-bit PERMPD/PERMQ to 256-bit variant on lower half.
44147 if (VT == MVT::v8f64 || VT == MVT::v8i64) {
44148 SDLoc DL(Op);
44149 SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, 256);
44150 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0,
44151 Op.getOperand(1));
44152 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44153 SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, 256);
44154 return TLO.CombineTo(Op, Insert);
44155 }
44156 break;
44157 }
44158 case X86ISD::VPERMV: {
44161 // We can always split v16i32/v16f32 AVX512 to v8i32/v8f32 AVX2 variants.
44162 if ((VT.is256BitVector() || Subtarget.hasVLX() || VT == MVT::v16i32 ||
44163 VT == MVT::v16f32) &&
44164 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44165 // For lane-crossing shuffles, only split in half in case we're still
44166 // referencing higher elements.
44167 unsigned HalfElts = NumElts / 2;
44168 unsigned HalfSize = SizeInBits / 2;
44169 Mask.resize(HalfElts);
44170 if (all_of(Mask,
44171 [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
44173 SDLoc DL(Op);
44174 SDValue Ext;
44175 SDValue M =
44176 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
44177 SDValue V =
44178 extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
44179 // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
44180 if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
44181 Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
44182 else {
44184 MVT ShufVT = HalfVT.changeVectorElementType(ShufSVT);
44185 Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, ShufVT,
44186 TLO.DAG.getBitcast(ShufVT, V), M);
44187 Ext = TLO.DAG.getBitcast(HalfVT, Ext);
44188 }
44189 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44190 Subtarget, TLO.DAG, DL, SizeInBits);
44191 return TLO.CombineTo(Op, Insert);
44192 }
44193 }
44194 break;
44195 }
44196 case X86ISD::VPERMV3: {
44199 if (Subtarget.hasVLX() &&
44200 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44201 // For lane-crossing shuffles, only split in half in case we're still
44202 // referencing higher elements.
44203 unsigned HalfElts = NumElts / 2;
44204 unsigned HalfSize = SizeInBits / 2;
44205 Mask.resize(HalfElts);
44206 if (all_of(Mask, [&](int M) {
44207 return isUndefOrInRange(M, 0, HalfElts) ||
44208 isUndefOrInRange(M, NumElts, NumElts + HalfElts);
44209 })) {
44210 // Adjust mask elements for 2nd operand to point to half width.
44211 for (int &M : Mask)
44212 M = (M < NumElts) ? M : (M - HalfElts);
44214 MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
44215 SDLoc DL(Op);
44216 SDValue Ext = TLO.DAG.getNode(
44217 Opc, DL, HalfVT,
44218 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
44219 getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
44220 extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
44221 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44222 Subtarget, TLO.DAG, DL, SizeInBits);
44223 return TLO.CombineTo(Op, Insert);
44224 }
44225 }
44226 break;
44227 }
44228 case X86ISD::VPERM2X128: {
44229 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
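    // e.g. (illustrative) an immediate whose low nibble is 0x1 selects the
    // high 128-bit half of operand 0, so the node becomes an extract of that
    // half; a low nibble with bit 3 set (the zeroing bit) folds straight to a
    // zero vector.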
44230 SDLoc DL(Op);
44231 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
44232 if (LoMask & 0x8)
44233 return TLO.CombineTo(
44234 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
44235 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
44236 unsigned SrcIdx = (LoMask & 0x2) >> 1;
44237 SDValue ExtOp =
44238 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
44239 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44240 SDValue Insert =
44241 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44242 return TLO.CombineTo(Op, Insert);
44243 }
44244 // Conversions.
44245 // TODO: Add more CVT opcodes when we have test coverage.
44246 case X86ISD::CVTTP2UI: {
44247 if (!Subtarget.hasVLX())
44248 break;
44249 [[fallthrough]];
44250 }
44251 case X86ISD::CVTTP2SI: {
44252 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 &&
44253 !Subtarget.hasVLX())
44254 break;
44255 [[fallthrough]];
44256 }
44257 case X86ISD::CVTPH2PS: {
44258 SDLoc DL(Op);
44259 unsigned Scale = SizeInBits / ExtSizeInBits;
44260 SDValue SrcOp = Op.getOperand(0);
44261 MVT SrcVT = SrcOp.getSimpleValueType();
44262 unsigned SrcExtSize =
44263 std::max<unsigned>(SrcVT.getSizeInBits() / Scale, 128);
44265 ExtSizeInBits / VT.getScalarSizeInBits());
44266 SDValue ExtOp = TLO.DAG.getNode(
44267 Opc, DL, ExtVT, extractSubVector(SrcOp, 0, TLO.DAG, DL, SrcExtSize));
44268 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44269 SDValue Insert =
44270 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44271 return TLO.CombineTo(Op, Insert);
44272 }
44273 // Zero upper elements.
44274 case X86ISD::VZEXT_MOVL:
44275 // Variable blend.
44276 case X86ISD::BLENDV:
44277 // Target unary shuffles:
44278 case X86ISD::MOVDDUP:
44279 // Target unary shuffles by immediate:
44280 case X86ISD::PSHUFD:
44281 case X86ISD::PSHUFLW:
44282 case X86ISD::PSHUFHW:
44283 case X86ISD::VPERMILPI:
44284 // (Non-Lane Crossing) Target Shuffles.
44285 case X86ISD::VPERMILPV:
44286 case X86ISD::VPERMIL2:
44287 case X86ISD::PSHUFB:
44288 case X86ISD::UNPCKL:
44289 case X86ISD::UNPCKH:
44290 case X86ISD::BLENDI:
44291 // Integer ops.
44292 case X86ISD::PACKSS:
44293 case X86ISD::PACKUS:
44294 case X86ISD::PCMPEQ:
44295 case X86ISD::PCMPGT:
44296 case X86ISD::PMULUDQ:
44297 case X86ISD::PMULDQ:
44298 case X86ISD::VSHLV:
44299 case X86ISD::VSRLV:
44300 case X86ISD::VSRAV:
44301 // Float ops.
44302 case X86ISD::FMAX:
44303 case X86ISD::FMIN:
44304 case X86ISD::FMAXC:
44305 case X86ISD::FMINC:
44306 case X86ISD::FRSQRT:
44307 case X86ISD::FRCP:
44308 // Horizontal Ops.
44309 case X86ISD::HADD:
44310 case X86ISD::HSUB:
44311 case X86ISD::FHADD:
44312 case X86ISD::FHSUB: {
44313 SDLoc DL(Op);
44315 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
44316 SDValue SrcOp = Op.getOperand(i);
44317 EVT SrcVT = SrcOp.getValueType();
44318 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
44319 "Unsupported vector size");
44320 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
44321 ExtSizeInBits)
44322 : SrcOp);
44323 }
44324 MVT ExtVT = VT.getSimpleVT();
44325 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
44326 ExtSizeInBits / ExtVT.getScalarSizeInBits());
44327 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
44328 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44329 SDValue Insert =
44330 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44331 return TLO.CombineTo(Op, Insert);
44332 }
44333 }
44334 }
44335
44336 // For splats, unless we *only* demand the 0'th element, stop attempting
44337 // simplification here - we aren't going to improve things, and the splat is
44338 // better than any potential shuffle we might combine to.
44339 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
44340 return false;
44341
44342 // Get target/faux shuffle mask.
44343 APInt OpUndef, OpZero;
44344 SmallVector<int, 64> OpMask;
44345 SmallVector<SDValue, 2> OpInputs;
44346 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
44347 OpZero, TLO.DAG, Depth, false))
44348 return false;
44349
44350 // Shuffle inputs must be the same size as the result.
44351 if (OpMask.size() != (unsigned)NumElts ||
44352 llvm::any_of(OpInputs, [VT](SDValue V) {
44353 return VT.getSizeInBits() != V.getValueSizeInBits() ||
44354 !V.getValueType().isVector();
44355 }))
44356 return false;
44357
44358 KnownZero = OpZero;
44359 KnownUndef = OpUndef;
44360
44361 // Check if shuffle mask can be simplified to undef/zero/identity.
44362 int NumSrcs = OpInputs.size();
44363 for (int i = 0; i != NumElts; ++i)
44364 if (!DemandedElts[i])
44365 OpMask[i] = SM_SentinelUndef;
44366
44367 if (isUndefInRange(OpMask, 0, NumElts)) {
44368 KnownUndef.setAllBits();
44369 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
44370 }
44371 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
44372 KnownZero.setAllBits();
44373 return TLO.CombineTo(
44374 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
44375 }
44376 for (int Src = 0; Src != NumSrcs; ++Src)
44377 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
44378 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
44379
44380 // Attempt to simplify inputs.
44381 for (int Src = 0; Src != NumSrcs; ++Src) {
44382 // TODO: Support inputs of different types.
44383 if (OpInputs[Src].getValueType() != VT)
44384 continue;
44385
44386 int Lo = Src * NumElts;
44387 APInt SrcElts = APInt::getZero(NumElts);
44388 for (int i = 0; i != NumElts; ++i)
44389 if (DemandedElts[i]) {
44390 int M = OpMask[i] - Lo;
44391 if (0 <= M && M < NumElts)
44392 SrcElts.setBit(M);
44393 }
44394
44395 // TODO - Propagate input undef/zero elts.
44396 APInt SrcUndef, SrcZero;
44397 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
44398 TLO, Depth + 1))
44399 return true;
44400 }
44401
44402 // If we don't demand all elements, then attempt to combine to a simpler
44403 // shuffle.
44404 // We need to convert the depth to something combineX86ShufflesRecursively
44405 // can handle - so pretend it's Depth == 0 again, and reduce the max depth
44406 // to match. This prevents combineX86ShuffleChain from returning a
44407 // combined shuffle that's the same as the original root, causing an
44408 // infinite loop.
44409 if (!DemandedElts.isAllOnes()) {
44410 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
44411
44412 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
44413 for (int i = 0; i != NumElts; ++i)
44414 if (DemandedElts[i])
44415 DemandedMask[i] = i;
44416
44418 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), DemandedMask, {}, 0,
44420 /*AllowVariableCrossLaneMask=*/true,
44421 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget),
44422 TLO.DAG, SDLoc(Op), Subtarget);
44423 if (NewShuffle)
44424 return TLO.CombineTo(Op, NewShuffle);
44425 }
44426
44427 return false;
44428}
44429
44431 SDValue Op, const APInt &OriginalDemandedBits,
44432 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
44433 unsigned Depth) const {
44434 EVT VT = Op.getValueType();
44435 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
44436 unsigned Opc = Op.getOpcode();
44437 switch (Opc) {
44438 case X86ISD::VTRUNC: {
44439 KnownBits KnownOp;
44440 SDValue Src = Op.getOperand(0);
44441 MVT SrcVT = Src.getSimpleValueType();
44442
44443 // Simplify the input, using demanded bit information.
44444 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
44445 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
44446 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
44447 return true;
44448 break;
44449 }
44450 case X86ISD::PMULDQ:
44451 case X86ISD::PMULUDQ: {
44452 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
44453 KnownBits KnownLHS, KnownRHS;
44454 SDValue LHS = Op.getOperand(0);
44455 SDValue RHS = Op.getOperand(1);
44456
44457 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
44458 // FIXME: Can we bound this better?
44459 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
44460 APInt DemandedMaskLHS = APInt::getAllOnes(64);
44461 APInt DemandedMaskRHS = APInt::getAllOnes(64);
44462
44463 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
44464 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
44465 DemandedMaskLHS = DemandedMask;
44466 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
44467 DemandedMaskRHS = DemandedMask;
44468
44469 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
44470 KnownLHS, TLO, Depth + 1))
44471 return true;
44472 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
44473 KnownRHS, TLO, Depth + 1))
44474 return true;
44475
44476 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
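    // e.g. (illustrative) each 64-bit element of (PMULUDQ x, splat(1)) is just
    // the zero-extended low 32 bits of x, i.e. (AND x, 0x00000000FFFFFFFF).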
44477 KnownRHS = KnownRHS.trunc(32);
44478 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
44479 KnownRHS.getConstant().isOne()) {
44480 SDLoc DL(Op);
44481 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
44482 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
44483 }
44484
44485 // Aggressively peek through ops to get at the demanded low bits.
44487 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44489 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44490 if (DemandedLHS || DemandedRHS) {
44491 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
44492 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
44493 return TLO.CombineTo(
44494 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
44495 }
44496 break;
44497 }
44498 case X86ISD::ANDNP: {
44499 KnownBits Known2;
44500 SDValue Op0 = Op.getOperand(0);
44501 SDValue Op1 = Op.getOperand(1);
44502
44503 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
44504 Known, TLO, Depth + 1))
44505 return true;
44506
44507 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
44508 OriginalDemandedElts, Known2, TLO, Depth + 1))
44509 return true;
44510
44511 // If the RHS is a constant, see if we can simplify it.
44512 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
44513 OriginalDemandedElts, TLO))
44514 return true;
44515
44516 // ANDNP = (~Op0 & Op1);
44517 Known.One &= Known2.Zero;
44518 Known.Zero |= Known2.One;
44519 break;
44520 }
44521 case X86ISD::VSHLI: {
44522 SDValue Op0 = Op.getOperand(0);
44523 SDValue Op1 = Op.getOperand(1);
44524
44525 unsigned ShAmt = Op1->getAsZExtVal();
44526 if (ShAmt >= BitWidth)
44527 break;
44528
44529 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
44530
44531 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
44532 // single shift. We can do this if the bottom bits (which are shifted
44533 // out) are never demanded.
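    // e.g. (illustrative) (VSHLI (VSRLI x, 2), 5) with the low 5 bits not
    // demanded becomes (VSHLI x, 3); with equal shift amounts the pair
    // collapses to x entirely.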
44534 if (Op0.getOpcode() == X86ISD::VSRLI &&
44535 OriginalDemandedBits.countr_zero() >= ShAmt) {
44536 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
44537 if (Shift2Amt < BitWidth) {
44538 int Diff = ShAmt - Shift2Amt;
44539 if (Diff == 0)
44540 return TLO.CombineTo(Op, Op0.getOperand(0));
44541
44542 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
44543 SDValue NewShift = TLO.DAG.getNode(
44544 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
44545 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
44546 return TLO.CombineTo(Op, NewShift);
44547 }
44548 }
44549
44550 // If we only demand sign bits then we can use the shift source directly.
44551 unsigned NumSignBits =
44552 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
44553 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
44554 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44555 return TLO.CombineTo(Op, Op0);
44556
44557 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44558 TLO, Depth + 1))
44559 return true;
44560
44561 Known <<= ShAmt;
44562
44563 // Low bits known zero.
44564 Known.Zero.setLowBits(ShAmt);
44565
44566 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44567 // Attempt to avoid multi-use ops if we don't need anything from them.
44568 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44569 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44570 SDValue NewOp =
44571 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44572 return TLO.CombineTo(Op, NewOp);
44573 }
44574 }
44575 return false;
44576 }
44577 case X86ISD::VSRLI: {
44578 SDValue Op0 = Op.getOperand(0);
44579 SDValue Op1 = Op.getOperand(1);
44580
44581 unsigned ShAmt = Op1->getAsZExtVal();
44582 if (ShAmt >= BitWidth)
44583 break;
44584
44585 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44586
44587 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44588 TLO, Depth + 1))
44589 return true;
44590
44591 Known >>= ShAmt;
44592
44593 // High bits known zero.
44594 Known.Zero.setHighBits(ShAmt);
44595
44596 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44597 // Attempt to avoid multi-use ops if we don't need anything from them.
44598 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44599 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44600 SDValue NewOp =
44601 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44602 return TLO.CombineTo(Op, NewOp);
44603 }
44604 }
44605 return false;
44606 }
44607 case X86ISD::VSRAI: {
44608 SDValue Op0 = Op.getOperand(0);
44609 SDValue Op1 = Op.getOperand(1);
44610
44611 unsigned ShAmt = Op1->getAsZExtVal();
44612 if (ShAmt >= BitWidth)
44613 break;
44614
44615 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44616
44617 // If we only want bits that already match the sign bit then we don't need
44618 // to shift.
44619 unsigned NumHiDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
44620 if (TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1) >=
44621 NumHiDemandedBits)
44622 return TLO.CombineTo(Op, Op0);
44623
44624 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
44625 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
44626 SDValue Op00 = Op0.getOperand(0);
44627 unsigned NumSignBits =
44628 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
44629 if (ShAmt < NumSignBits)
44630 return TLO.CombineTo(Op, Op00);
44631 }
44632
44633 // If any of the demanded bits are produced by the sign extension, we also
44634 // demand the input sign bit.
44635 if (OriginalDemandedBits.countl_zero() < ShAmt)
44636 DemandedMask.setSignBit();
44637
44638 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44639 TLO, Depth + 1))
44640 return true;
44641
44642 Known >>= ShAmt;
44643
44644 // If the input sign bit is known to be zero, or if none of the top bits
44645 // are demanded, turn this into an unsigned shift right.
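    // e.g. (illustrative) a v4i32 (VSRAI x, 24) whose users only read bits
    // 0-7 of each element never exposes the sign-extended bits, so it can be
    // rewritten as (VSRLI x, 24).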
44646 if (Known.Zero[BitWidth - ShAmt - 1] ||
44647 OriginalDemandedBits.countl_zero() >= ShAmt)
44648 return TLO.CombineTo(
44649 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
44650
44651 // High bits are known one.
44652 if (Known.One[BitWidth - ShAmt - 1])
44653 Known.One.setHighBits(ShAmt);
44654
44655 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44656 // Attempt to avoid multi-use ops if we don't need anything from them.
44657 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44658 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44659 SDValue NewOp =
44660 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44661 return TLO.CombineTo(Op, NewOp);
44662 }
44663 }
44664 return false;
44665 }
44666 case X86ISD::BLENDI: {
44667 SDValue LHS = Op.getOperand(0);
44668 SDValue RHS = Op.getOperand(1);
44669 APInt Mask = getBLENDIBlendMask(Op);
44670
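    // e.g. (illustrative) a v4i32 BLENDI with blend mask 0b0110 takes output
    // elements 1 and 2 from the RHS and elements 0 and 3 from the LHS, so the
    // demanded elements are partitioned between the operands accordingly.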
44671 APInt DemandedEltsLHS = OriginalDemandedElts & ~Mask;
44672 if (SimplifyDemandedBits(LHS, OriginalDemandedBits, DemandedEltsLHS, Known,
44673 TLO, Depth + 1))
44674 return true;
44675
44676 APInt DemandedEltsRHS = OriginalDemandedElts & Mask;
44677 if (SimplifyDemandedBits(RHS, OriginalDemandedBits, DemandedEltsRHS, Known,
44678 TLO, Depth + 1))
44679 return true;
44680
44681 // Attempt to avoid multi-use ops if we don't need anything from them.
44683 LHS, OriginalDemandedBits, DemandedEltsLHS, TLO.DAG, Depth + 1);
44685 RHS, OriginalDemandedBits, DemandedEltsRHS, TLO.DAG, Depth + 1);
44686 if (NewLHS || NewRHS) {
44687 NewLHS = NewLHS ? NewLHS : LHS;
44688 NewRHS = NewRHS ? NewRHS : RHS;
44689 return TLO.CombineTo(Op,
44690 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT,
44691 NewLHS, NewRHS, Op.getOperand(2)));
44692 }
44693 break;
44694 }
44695 case X86ISD::BLENDV: {
44696 SDValue Sel = Op.getOperand(0);
44697 SDValue LHS = Op.getOperand(1);
44698 SDValue RHS = Op.getOperand(2);
44699
44700 APInt SignMask = APInt::getSignMask(BitWidth);
44702 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44704 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44706 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44707
44708 if (NewSel || NewLHS || NewRHS) {
44709 NewSel = NewSel ? NewSel : Sel;
44710 NewLHS = NewLHS ? NewLHS : LHS;
44711 NewRHS = NewRHS ? NewRHS : RHS;
44712 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
44713 NewSel, NewLHS, NewRHS));
44714 }
44715 break;
44716 }
44717 case X86ISD::PEXTRB:
44718 case X86ISD::PEXTRW: {
44719 SDValue Vec = Op.getOperand(0);
44720 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
44721 MVT VecVT = Vec.getSimpleValueType();
44722 unsigned NumVecElts = VecVT.getVectorNumElements();
44723
44724 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44725 unsigned Idx = CIdx->getZExtValue();
44726 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
44727
44728 // If we demand no bits from the vector then we must have demanded
44729 // bits from the implicit zext - simplify to zero.
44730 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
44731 if (DemandedVecBits == 0)
44732 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44733
44734 APInt KnownUndef, KnownZero;
44735 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
44736 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
44737 KnownZero, TLO, Depth + 1))
44738 return true;
44739
44740 KnownBits KnownVec;
44741 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
44742 KnownVec, TLO, Depth + 1))
44743 return true;
44744
44746 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44747 return TLO.CombineTo(
44748 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
44749
44750 Known = KnownVec.zext(BitWidth);
44751 return false;
44752 }
44753 break;
44754 }
44755 case X86ISD::PINSRB:
44756 case X86ISD::PINSRW: {
44757 SDValue Vec = Op.getOperand(0);
44758 SDValue Scl = Op.getOperand(1);
44759 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44760 MVT VecVT = Vec.getSimpleValueType();
44761
44762 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44763 unsigned Idx = CIdx->getZExtValue();
44764 if (!OriginalDemandedElts[Idx])
44765 return TLO.CombineTo(Op, Vec);
44766
44767 KnownBits KnownVec;
44768 APInt DemandedVecElts(OriginalDemandedElts);
44769 DemandedVecElts.clearBit(Idx);
44770 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
44771 KnownVec, TLO, Depth + 1))
44772 return true;
44773
44774 KnownBits KnownScl;
44775 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
44776 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
44777 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44778 return true;
44779
44780 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
44781 Known = KnownVec.intersectWith(KnownScl);
44782 return false;
44783 }
44784 break;
44785 }
44786 case X86ISD::PACKSS:
44787 // PACKSS saturates to MIN/MAX integer values. So if we just want the
44788 // sign bit then we can just ask for the source operands' sign bits.
44789 // TODO - add known bits handling.
44790 if (OriginalDemandedBits.isSignMask()) {
44791 APInt DemandedLHS, DemandedRHS;
44792 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
44793
44794 KnownBits KnownLHS, KnownRHS;
44795 APInt SignMask = APInt::getSignMask(BitWidth * 2);
44796 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
44797 KnownLHS, TLO, Depth + 1))
44798 return true;
44799 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
44800 KnownRHS, TLO, Depth + 1))
44801 return true;
44802
44803 // Attempt to avoid multi-use ops if we don't need anything from them.
44805 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44807 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44808 if (DemandedOp0 || DemandedOp1) {
44809 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
44810 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
44811 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
44812 }
44813 }
44814 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44815 break;
44816 case X86ISD::VBROADCAST: {
44817 SDValue Src = Op.getOperand(0);
44818 MVT SrcVT = Src.getSimpleValueType();
44819 APInt DemandedElts = APInt::getOneBitSet(
44820 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
44821 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
44822 TLO, Depth + 1))
44823 return true;
44824 // If we don't need the upper bits, attempt to narrow the broadcast source.
44825 // Don't attempt this on AVX512 as it might affect broadcast folding.
44826 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
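    // e.g. (illustrative) (v2i64 VBROADCAST i64:x) with only the low 32 bits
    // of each element demanded becomes a bitcast of
    // (v4i32 VBROADCAST (trunc x to i32)).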
44827 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
44828 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
44829 Src->hasOneUse()) {
44830 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
44831 SDValue NewSrc =
44832 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
44833 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
44834 SDValue NewBcst =
44835 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
44836 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
44837 }
44838 break;
44839 }
44840 case X86ISD::PCMPGT:
44841 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44842 // iff we only need the sign bit then we can use R directly.
44843 if (OriginalDemandedBits.isSignMask() &&
44844 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44845 return TLO.CombineTo(Op, Op.getOperand(1));
44846 break;
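  // MOVMSK copies the sign bit of each source element into the low NumElts
  // bits of the scalar result and zeroes all of the upper bits, e.g. a v4i32
  // source can only ever produce a value in the range [0, 15].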
44847 case X86ISD::MOVMSK: {
44848 SDValue Src = Op.getOperand(0);
44849 MVT SrcVT = Src.getSimpleValueType();
44850 unsigned SrcBits = SrcVT.getScalarSizeInBits();
44851 unsigned NumElts = SrcVT.getVectorNumElements();
44852
44853 // If we don't need the sign bits at all just return zero.
44854 if (OriginalDemandedBits.countr_zero() >= NumElts)
44855 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44856
44857 // See if we only demand bits from the lower 128-bit vector.
44858 if (SrcVT.is256BitVector() &&
44859 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
44860 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
44861 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44862 }
44863
44864 // Only demand the vector elements of the sign bits we need.
44865 APInt KnownUndef, KnownZero;
44866 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
44867 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
44868 TLO, Depth + 1))
44869 return true;
44870
44871 Known.Zero = KnownZero.zext(BitWidth);
44872 Known.Zero.setHighBits(BitWidth - NumElts);
44873
44874 // MOVMSK only uses the MSB from each vector element.
44875 KnownBits KnownSrc;
44876 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44877 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44878 Depth + 1))
44879 return true;
44880
44881 if (KnownSrc.One[SrcBits - 1])
44882 Known.One.setLowBits(NumElts);
44883 else if (KnownSrc.Zero[SrcBits - 1])
44884 Known.Zero.setLowBits(NumElts);
44885
44886    // Attempt to avoid multi-use ops if we don't need anything from it.
44887    if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
44888            Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44889 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44890 return false;
44891 }
44892 case X86ISD::TESTP: {
44893 SDValue Op0 = Op.getOperand(0);
44894 SDValue Op1 = Op.getOperand(1);
44895 MVT OpVT = Op0.getSimpleValueType();
44896 assert((OpVT.getVectorElementType() == MVT::f32 ||
44897 OpVT.getVectorElementType() == MVT::f64) &&
44898 "Illegal vector type for X86ISD::TESTP");
44899
44900 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
44901 KnownBits KnownSrc;
44902 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44903 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44904 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44905 AssumeSingleUse) ||
44906 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44907 AssumeSingleUse);
44908 }
44909 case X86ISD::CMOV: {
44910 KnownBits Known2;
44911 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
44912 OriginalDemandedElts, Known2, TLO, Depth + 1))
44913 return true;
44914 if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
44915 OriginalDemandedElts, Known, TLO, Depth + 1))
44916 return true;
44917
44918 // Only known if known in both the LHS and RHS.
44919 Known = Known.intersectWith(Known2);
44920 return false;
44921 }
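  // The BEXTR/BEXTRI control operand encodes the start bit in bits [7:0] and
  // the field length in bits [15:8]; e.g. a control value of 0x0604 extracts 6
  // bits starting at bit 4. Any control bits above bit 15 are ignored.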
44922 case X86ISD::BEXTR:
44923 case X86ISD::BEXTRI: {
44924 SDValue Op0 = Op.getOperand(0);
44925 SDValue Op1 = Op.getOperand(1);
44926
44927 // Only bottom 16-bits of the control bits are required.
44928 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44929 // NOTE: SimplifyDemandedBits won't do this for constants.
44930 uint64_t Val1 = Cst1->getZExtValue();
44931 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44932 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44933 SDLoc DL(Op);
44934 return TLO.CombineTo(
44935 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44936 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44937 }
44938
44939 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44940 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44941
44942 // If the length is 0, the result is 0.
44943 if (Length == 0) {
44944 Known.setAllZero();
44945 return false;
44946 }
44947
44948 if ((Shift + Length) <= BitWidth) {
44949 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44950 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44951 return true;
44952
44953 Known = Known.extractBits(Length, Shift);
44954 Known = Known.zextOrTrunc(BitWidth);
44955 return false;
44956 }
44957 } else {
44958 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44959 KnownBits Known1;
44960 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44961 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44962 return true;
44963
44964 // If the length is 0, replace with 0.
44965 KnownBits LengthBits = Known1.extractBits(8, 8);
44966 if (LengthBits.isZero())
44967 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44968 }
44969
44970 break;
44971 }
44972 case X86ISD::PDEP: {
44973 SDValue Op0 = Op.getOperand(0);
44974 SDValue Op1 = Op.getOperand(1);
44975
44976 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44977 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44978
44979    // If the demanded bits have leading zeroes, we don't demand those from
44980    // the mask.
44981 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44982 return true;
44983
44984 // The number of possible 1s in the mask determines the number of LSBs of
44985 // operand 0 used. Undemanded bits from the mask don't matter so filter
44986 // them before counting.
44987 KnownBits Known2;
44988 uint64_t Count = (~Known.Zero & LoMask).popcount();
44989 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
44990 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44991 return true;
44992
44993 // Zeroes are retained from the mask, but not ones.
44994 Known.One.clearAllBits();
44995 // The result will have at least as many trailing zeros as the non-mask
44996 // operand since bits can only map to the same or higher bit position.
44997 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
44998 return false;
44999 }
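  // VPMADD52L/VPMADD52H multiply the low 52 bits of operands 0 and 1 to form a
  // 104-bit product and add its low (L) or high (H) 52 bits to the 64-bit
  // accumulator elements in operand 2.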
45000 case X86ISD::VPMADD52L:
45001 case X86ISD::VPMADD52H: {
45002 KnownBits KnownOp0, KnownOp1, KnownOp2;
45003 SDValue Op0 = Op.getOperand(0);
45004 SDValue Op1 = Op.getOperand(1);
45005 SDValue Op2 = Op.getOperand(2);
45006 // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of
45007 // operand 2).
45008 APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52);
45009 if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0,
45010 TLO, Depth + 1))
45011 return true;
45012
45013 if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1,
45014 TLO, Depth + 1))
45015 return true;
45016
45017 if (SimplifyDemandedBits(Op2, APInt::getAllOnes(64), OriginalDemandedElts,
45018 KnownOp2, TLO, Depth + 1))
45019 return true;
45020
45021 KnownBits KnownMul;
45022 KnownOp0 = KnownOp0.trunc(52);
45023 KnownOp1 = KnownOp1.trunc(52);
45024 KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1)
45025 : KnownBits::mulhu(KnownOp0, KnownOp1);
45026 KnownMul = KnownMul.zext(64);
45027
45028 // lo/hi(X * Y) + Z --> C + Z
45029 if (KnownMul.isConstant()) {
45030 SDLoc DL(Op);
45031 SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT);
45032 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2));
45033 }
45034
45035 Known = KnownBits::add(KnownMul, KnownOp2);
45036 return false;
45037 }
45038 }
45039
45040  return TargetLowering::SimplifyDemandedBitsForTargetNode(
45041      Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
45042}
45043
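// For each node below, try to return an existing operand that already provides
// every demanded bit/element so that no new nodes need to be created; e.g. a
// VSRAI can be bypassed entirely when only the sign bit is demanded.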
45044SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45045    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
45046 SelectionDAG &DAG, unsigned Depth) const {
45047 int NumElts = DemandedElts.getBitWidth();
45048 unsigned Opc = Op.getOpcode();
45049 EVT VT = Op.getValueType();
45050
45051 switch (Opc) {
45052 case X86ISD::PINSRB:
45053 case X86ISD::PINSRW: {
45054 // If we don't demand the inserted element, return the base vector.
45055 SDValue Vec = Op.getOperand(0);
45056 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
45057 MVT VecVT = Vec.getSimpleValueType();
45058 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
45059 !DemandedElts[CIdx->getZExtValue()])
45060 return Vec;
45061 break;
45062 }
45063 case X86ISD::VSHLI: {
45064 // If we are only demanding sign bits then we can use the shift source
45065 // directly.
45066 SDValue Op0 = Op.getOperand(0);
45067 unsigned ShAmt = Op.getConstantOperandVal(1);
45068 unsigned BitWidth = DemandedBits.getBitWidth();
45069 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
45070 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
45071 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
45072 return Op0;
45073 break;
45074 }
45075 case X86ISD::VSRAI:
45076 // iff we only need the sign bit then we can use the source directly.
45077 // TODO: generalize where we only demand extended signbits.
45078 if (DemandedBits.isSignMask())
45079 return Op.getOperand(0);
45080 break;
45081 case X86ISD::PCMPGT:
45082 // icmp sgt(0, R) == ashr(R, BitWidth-1).
45083 // iff we only need the sign bit then we can use R directly.
45084 if (DemandedBits.isSignMask() &&
45085 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
45086 return Op.getOperand(1);
45087 break;
45088 case X86ISD::BLENDV: {
45089 // BLENDV: Cond (MSB) ? LHS : RHS
45090 SDValue Cond = Op.getOperand(0);
45091 SDValue LHS = Op.getOperand(1);
45092 SDValue RHS = Op.getOperand(2);
45093
45094 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
45095 if (CondKnown.isNegative())
45096 return LHS;
45097 if (CondKnown.isNonNegative())
45098 return RHS;
45099 break;
45100 }
45101 case X86ISD::ANDNP: {
45102 // ANDNP = (~LHS & RHS);
45103 SDValue LHS = Op.getOperand(0);
45104 SDValue RHS = Op.getOperand(1);
45105
45106 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
45107 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
45108
45109 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
45110 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
45111 // this context, so return RHS.
45112 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
45113 return RHS;
45114 break;
45115 }
45116 }
45117
45118 APInt ShuffleUndef, ShuffleZero;
45119 SmallVector<int, 16> ShuffleMask;
45120  SmallVector<SDValue, 16> ShuffleOps;
45121  if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
45122 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
45123 // If all the demanded elts are from one operand and are inline,
45124 // then we can use the operand directly.
45125 int NumOps = ShuffleOps.size();
45126 if (ShuffleMask.size() == (unsigned)NumElts &&
45127        llvm::all_of(ShuffleOps, [VT](SDValue V) {
45128          return VT.getSizeInBits() == V.getValueSizeInBits();
45129 })) {
45130
45131 if (DemandedElts.isSubsetOf(ShuffleUndef))
45132 return DAG.getUNDEF(VT);
45133 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
45134 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
45135
45136 // Bitmask that indicates which ops have only been accessed 'inline'.
45137 APInt IdentityOp = APInt::getAllOnes(NumOps);
45138 for (int i = 0; i != NumElts; ++i) {
45139 int M = ShuffleMask[i];
45140 if (!DemandedElts[i] || ShuffleUndef[i])
45141 continue;
45142 int OpIdx = M / NumElts;
45143 int EltIdx = M % NumElts;
45144 if (M < 0 || EltIdx != i) {
45145 IdentityOp.clearAllBits();
45146 break;
45147 }
45148 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
45149 if (IdentityOp == 0)
45150 break;
45151 }
45152 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
45153 "Multiple identity shuffles detected");
45154
45155 if (IdentityOp != 0)
45156 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
45157 }
45158 }
45159
45160  return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45161      Op, DemandedBits, DemandedElts, DAG, Depth);
45162}
45163
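// A target shuffle result is only guaranteed not to be undef/poison if every
// demanded lane comes from a source element that is itself known not to be
// undef/poison, which is what the shuffle cases below check.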
45164bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45165    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45166 bool PoisonOnly, unsigned Depth) const {
45167 unsigned NumElts = DemandedElts.getBitWidth();
45168
45169 switch (Op.getOpcode()) {
45170  case X86ISD::GlobalBaseReg:
45171  case X86ISD::Wrapper:
45172 case X86ISD::WrapperRIP:
45173 return true;
45174 case X86ISD::PACKSS:
45175 case X86ISD::PACKUS: {
45176 APInt DemandedLHS, DemandedRHS;
45177 getPackDemandedElts(Op.getSimpleValueType(), DemandedElts, DemandedLHS,
45178 DemandedRHS);
45179 return (!DemandedLHS ||
45180 DAG.isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), DemandedLHS,
45181 PoisonOnly, Depth + 1)) &&
45182 (!DemandedRHS ||
45183 DAG.isGuaranteedNotToBeUndefOrPoison(Op.getOperand(1), DemandedRHS,
45184 PoisonOnly, Depth + 1));
45185 }
45186 case X86ISD::INSERTPS:
45187 case X86ISD::BLENDI:
45188 case X86ISD::PSHUFB:
45189 case X86ISD::PSHUFD:
45190 case X86ISD::UNPCKL:
45191 case X86ISD::UNPCKH:
45192 case X86ISD::VPERMILPV:
45193 case X86ISD::VPERMILPI:
45194 case X86ISD::VPERMV:
45195 case X86ISD::VPERMV3: {
45196    SmallVector<SDValue, 2> Ops;
45197    SmallVector<int, 16> Mask;
45198    if (getTargetShuffleMask(Op, true, Ops, Mask)) {
45199 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
45200 APInt::getZero(NumElts));
45201 for (auto M : enumerate(Mask)) {
45202 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
45203 continue;
45204 if (M.value() == SM_SentinelUndef)
45205 return false;
45206 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
45207 "Shuffle mask index out of range");
45208 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
45209 }
45210 for (auto Op : enumerate(Ops))
45211 if (!DemandedSrcElts[Op.index()].isZero() &&
45212            !DAG.isGuaranteedNotToBeUndefOrPoison(
45213                Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
45214 return false;
45215 return true;
45216 }
45217 break;
45218 }
45219 }
45220  return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45221      Op, DemandedElts, DAG, PoisonOnly, Depth);
45222}
45223
45224bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
45225    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45226 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
45227
45228 switch (Op.getOpcode()) {
45229 // SSE bit logic.
45230 case X86ISD::FAND:
45231 case X86ISD::FOR:
45232 case X86ISD::FXOR:
45233 case X86ISD::FANDN:
45234 case X86ISD::ANDNP:
45235 case X86ISD::VPTERNLOG:
45236 return false;
45237 // SSE vector insert/extracts use modulo indices.
45238 case X86ISD::PINSRB:
45239 case X86ISD::PINSRW:
45240 case X86ISD::PEXTRB:
45241 case X86ISD::PEXTRW:
45242 return false;
45243 // SSE vector multiplies are either inbounds or saturate.
45244 case X86ISD::VPMADDUBSW:
45245 case X86ISD::VPMADDWD:
45246 return false;
45247 // SSE vector shifts handle out of bounds shift amounts.
45248 case X86ISD::VSHLI:
45249 case X86ISD::VSRLI:
45250 case X86ISD::VSRAI:
45251 return false;
45252 // SSE blends.
45253 case X86ISD::BLENDI:
45254 case X86ISD::BLENDV:
45255 return false;
45256 // SSE packs.
45257 case X86ISD::PACKSS:
45258 case X86ISD::PACKUS:
45259 return false;
45260 // SSE target shuffles.
45261 case X86ISD::INSERTPS:
45262 case X86ISD::PSHUFB:
45263 case X86ISD::PSHUFD:
45264 case X86ISD::UNPCKL:
45265 case X86ISD::UNPCKH:
45266 case X86ISD::VPERMILPV:
45267 case X86ISD::VPERMILPI:
45268 case X86ISD::VPERMV:
45269 case X86ISD::VPERMV3:
45270 return false;
45271 // SSE comparisons handle all icmp/fcmp cases.
45272 // TODO: Add CMPM/MM with test coverage.
45273 case X86ISD::CMPP:
45274 case X86ISD::PCMPEQ:
45275 case X86ISD::PCMPGT:
45276 return false;
45277 // SSE signbit extraction.
45278 case X86ISD::MOVMSK:
45279 return false;
45280 // GFNI instructions.
45281  case X86ISD::GF2P8AFFINEINVQB:
45282  case X86ISD::GF2P8AFFINEQB:
45283  case X86ISD::GF2P8MULB:
45284 return false;
45285  case ISD::INTRINSIC_WO_CHAIN:
45286    switch (Op->getConstantOperandVal(0)) {
45287 case Intrinsic::x86_sse2_pmadd_wd:
45288 case Intrinsic::x86_avx2_pmadd_wd:
45289 case Intrinsic::x86_avx512_pmaddw_d_512:
45290 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
45291 case Intrinsic::x86_avx2_pmadd_ub_sw:
45292 case Intrinsic::x86_avx512_pmaddubs_w_512:
45293 return false;
45294 case Intrinsic::x86_avx512_vpermi2var_d_128:
45295 case Intrinsic::x86_avx512_vpermi2var_d_256:
45296 case Intrinsic::x86_avx512_vpermi2var_d_512:
45297 case Intrinsic::x86_avx512_vpermi2var_hi_128:
45298 case Intrinsic::x86_avx512_vpermi2var_hi_256:
45299 case Intrinsic::x86_avx512_vpermi2var_hi_512:
45300 case Intrinsic::x86_avx512_vpermi2var_pd_128:
45301 case Intrinsic::x86_avx512_vpermi2var_pd_256:
45302 case Intrinsic::x86_avx512_vpermi2var_pd_512:
45303 case Intrinsic::x86_avx512_vpermi2var_ps_128:
45304 case Intrinsic::x86_avx512_vpermi2var_ps_256:
45305 case Intrinsic::x86_avx512_vpermi2var_ps_512:
45306 case Intrinsic::x86_avx512_vpermi2var_q_128:
45307 case Intrinsic::x86_avx512_vpermi2var_q_256:
45308 case Intrinsic::x86_avx512_vpermi2var_q_512:
45309 case Intrinsic::x86_avx512_vpermi2var_qi_128:
45310 case Intrinsic::x86_avx512_vpermi2var_qi_256:
45311 case Intrinsic::x86_avx512_vpermi2var_qi_512:
45312 return false;
45313 }
45314 }
45315  return TargetLowering::canCreateUndefOrPoisonForTargetNode(
45316      Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
45317}
45318
45319bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
45320                                                  const APInt &DemandedElts,
45321 APInt &UndefElts,
45322 const SelectionDAG &DAG,
45323 unsigned Depth) const {
45324 unsigned NumElts = DemandedElts.getBitWidth();
45325 unsigned Opc = Op.getOpcode();
45326
45327 switch (Opc) {
45328 case X86ISD::VBROADCAST:
45329  case X86ISD::VBROADCAST_LOAD:
45330    UndefElts = APInt::getZero(NumElts);
45331 return true;
45332 }
45333
45334 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
45335 DAG, Depth);
45336}
45337
45338// Helper to peek through bitops/trunc/setcc to determine size of source vector.
45339// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
45340static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
45341 bool AllowTruncate, unsigned Depth) {
45342 // Limit recursion.
45343  if (Depth >= SelectionDAG::MaxRecursionDepth)
45344    return false;
45345 switch (Src.getOpcode()) {
45346 case ISD::TRUNCATE:
45347 if (!AllowTruncate)
45348 return false;
45349 [[fallthrough]];
45350 case ISD::SETCC:
45351 return Src.getOperand(0).getValueSizeInBits() == Size;
45352 case ISD::FREEZE:
45353 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45354 Depth + 1);
45355 case ISD::AND:
45356 case ISD::XOR:
45357 case ISD::OR:
45358 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45359 Depth + 1) &&
45360 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45361 Depth + 1);
45362 case ISD::SELECT:
45363 case ISD::VSELECT:
45364 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
45365 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45366 Depth + 1) &&
45367 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate,
45368 Depth + 1);
45369 case ISD::BUILD_VECTOR:
45370 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
45371 ISD::isBuildVectorAllOnes(Src.getNode());
45372 }
45373 return false;
45374}
45375
45376// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
45377static unsigned getAltBitOpcode(unsigned Opcode) {
45378 switch(Opcode) {
45379 // clang-format off
45380 case ISD::AND: return X86ISD::FAND;
45381 case ISD::OR: return X86ISD::FOR;
45382 case ISD::XOR: return X86ISD::FXOR;
45383 case X86ISD::ANDNP: return X86ISD::FANDN;
45384 // clang-format on
45385 }
45386 llvm_unreachable("Unknown bitwise opcode");
45387}
45388
45389// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
45390static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
45391                                          const SDLoc &DL) {
45392 EVT SrcVT = Src.getValueType();
45393 if (SrcVT != MVT::v4i1)
45394 return SDValue();
45395
45396 switch (Src.getOpcode()) {
45397 case ISD::SETCC:
45398 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
45399 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
45400 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
45401 SDValue Op0 = Src.getOperand(0);
45402 if (ISD::isNormalLoad(Op0.getNode()))
45403 return DAG.getBitcast(MVT::v4f32, Op0);
45404 if (Op0.getOpcode() == ISD::BITCAST &&
45405 Op0.getOperand(0).getValueType() == MVT::v4f32)
45406 return Op0.getOperand(0);
45407 }
45408 break;
45409 case ISD::AND:
45410 case ISD::XOR:
45411 case ISD::OR: {
45412 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
45413 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
45414 if (Op0 && Op1)
45415 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
45416 Op1);
45417 break;
45418 }
45419 }
45420 return SDValue();
45421}
45422
45423// Helper to push sign extension of vXi1 SETCC result through bitops.
45424static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
45425                                          SDValue Src, const SDLoc &DL) {
45426 switch (Src.getOpcode()) {
45427 case ISD::SETCC:
45428 case ISD::FREEZE:
45429 case ISD::TRUNCATE:
45430 case ISD::BUILD_VECTOR:
45431 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45432 case ISD::AND:
45433 case ISD::XOR:
45434 case ISD::OR:
45435 return DAG.getNode(
45436 Src.getOpcode(), DL, SExtVT,
45437 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
45438 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
45439 case ISD::SELECT:
45440 case ISD::VSELECT:
45441 return DAG.getSelect(
45442 DL, SExtVT, Src.getOperand(0),
45443 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
45444 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
45445 }
45446 llvm_unreachable("Unexpected node type for vXi1 sign extension");
45447}
45448
45449// Try to match patterns such as
45450// (i16 bitcast (v16i1 x))
45451// ->
45452// (i16 movmsk (16i8 sext (v16i1 x)))
45453// before the illegal vector is scalarized on subtargets that don't have legal
45454// vxi1 types.
45455static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
45456                                  const SDLoc &DL,
45457 const X86Subtarget &Subtarget) {
45458 EVT SrcVT = Src.getValueType();
45459 if (Subtarget.useSoftFloat() || !SrcVT.isSimple() ||
45460 SrcVT.getScalarType() != MVT::i1)
45461 return SDValue();
45462
45463 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
45464 // legalization destroys the v4i32 type.
45465 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
45466 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
45467 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
45468 DAG.getBitcast(MVT::v4f32, V));
45469 return DAG.getZExtOrTrunc(V, DL, VT);
45470 }
45471 }
45472
45473 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
45474 // movmskb even with avx512. This will be better than truncating to vXi1 and
45475 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
45476 // vpcmpeqb/vpcmpgtb.
45477 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
45478 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
45479 Src.getOperand(0).getValueType() == MVT::v32i8 ||
45480 Src.getOperand(0).getValueType() == MVT::v64i8);
45481
45482 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
45483 // directly with vpmovmskb/vmovmskps/vmovmskpd.
45484 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
45485 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
45486 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
45487 EVT CmpVT = Src.getOperand(0).getValueType();
45488 EVT EltVT = CmpVT.getVectorElementType();
45489 if (CmpVT.getSizeInBits() <= 256 &&
45490 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
45491 PreferMovMsk = true;
45492 }
45493
45494 // With AVX512 vxi1 types are legal and we prefer using k-regs.
45495 // MOVMSK is supported in SSE2 or later.
45496 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
45497 return SDValue();
45498
45499 // If the upper ops of a concatenation are undef, then try to bitcast the
45500 // lower op and extend.
45501 SmallVector<SDValue, 4> SubSrcOps;
45502 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
45503 SubSrcOps.size() >= 2) {
45504 SDValue LowerOp = SubSrcOps[0];
45505 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
45506 if (LowerOp.getOpcode() == ISD::SETCC &&
45507 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
45508 EVT SubVT = VT.getIntegerVT(
45509 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
45510 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
45511 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
45512 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
45513 }
45514 }
45515 }
45516
45517 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
45518 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
45519 // v8i16 and v16i16.
45520 // For these two cases, we can shuffle the upper element bytes to a
45521 // consecutive sequence at the start of the vector and treat the results as
45522 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
45523 // for v16i16 this is not the case, because the shuffle is expensive, so we
45524 // avoid sign-extending to this type entirely.
45525 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
45526 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
45527 MVT SExtVT;
45528 bool PropagateSExt = false;
45529 switch (SrcVT.getSimpleVT().SimpleTy) {
45530 default:
45531 return SDValue();
45532 case MVT::v2i1:
45533 SExtVT = MVT::v2i64;
45534 break;
45535 case MVT::v4i1:
45536 SExtVT = MVT::v4i32;
45537 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
45538 // sign-extend to a 256-bit operation to avoid truncation.
45539 if (Subtarget.hasAVX() &&
45540 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2(), 0)) {
45541 SExtVT = MVT::v4i64;
45542 PropagateSExt = true;
45543 }
45544 break;
45545 case MVT::v8i1:
45546 SExtVT = MVT::v8i16;
45547 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
45548 // sign-extend to a 256-bit operation to match the compare.
45549 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
45550 // 256-bit because the shuffle is cheaper than sign extending the result of
45551 // the compare.
45552 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true, 0) ||
45553 checkBitcastSrcVectorSize(Src, 512, true, 0))) {
45554 SExtVT = MVT::v8i32;
45555 PropagateSExt = true;
45556 }
45557 break;
45558 case MVT::v16i1:
45559 SExtVT = MVT::v16i8;
45560 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
45561 // it is not profitable to sign-extend to 256-bit because this will
45562 // require an extra cross-lane shuffle which is more expensive than
45563 // truncating the result of the compare to 128-bits.
45564 break;
45565 case MVT::v32i1:
45566 SExtVT = MVT::v32i8;
45567 break;
45568 case MVT::v64i1:
45569    // If we have AVX512F, but not AVX512BW, and the input is truncated from
45570    // v64i8 (checked earlier), then split the input and make two pmovmskbs.
45571 if (Subtarget.hasAVX512()) {
45572 if (Subtarget.hasBWI())
45573 return SDValue();
45574 SExtVT = MVT::v64i8;
45575 break;
45576 }
45577 // Split if this is a <64 x i8> comparison result.
45578 if (checkBitcastSrcVectorSize(Src, 512, false, 0)) {
45579 SExtVT = MVT::v64i8;
45580 break;
45581 }
45582 return SDValue();
45583 };
45584
45585 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
45586 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45587
45588 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
45589 V = getPMOVMSKB(DL, V, DAG, Subtarget);
45590 } else {
45591 if (SExtVT == MVT::v8i16) {
45592 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
45593 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
45594 }
45595 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
45596 }
45597
45598 EVT IntVT =
45599      EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
45600  V = DAG.getZExtOrTrunc(V, DL, IntVT);
45601 return DAG.getBitcast(VT, V);
45602}
45603
45604// Convert a vXi1 constant build vector to the same width scalar integer.
45605static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
45606  EVT SrcVT = Op.getValueType();
45607 assert(SrcVT.getVectorElementType() == MVT::i1 &&
45608 "Expected a vXi1 vector");
45609  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
45610         "Expected a constant build vector");
45611
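  // e.g. <4 x i1> <1, 0, 1, 1> becomes the i4 constant 0b1101 (element 0 maps
  // to bit 0 of the result).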
45612 APInt Imm(SrcVT.getVectorNumElements(), 0);
45613 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
45614 SDValue In = Op.getOperand(Idx);
45615 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
45616 Imm.setBit(Idx);
45617 }
45618 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
45619 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
45620}
45621
45622static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
45623                                           TargetLowering::DAGCombinerInfo &DCI,
45624                                           const X86Subtarget &Subtarget) {
45625 using namespace SDPatternMatch;
45626 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
45627
45628 if (!DCI.isBeforeLegalizeOps())
45629 return SDValue();
45630
45631 // Only do this if we have k-registers.
45632 if (!Subtarget.hasAVX512())
45633 return SDValue();
45634
45635 EVT DstVT = N->getValueType(0);
45636 SDValue Op = N->getOperand(0);
45637 EVT SrcVT = Op.getValueType();
45638
45639 // Make sure we have a bitcast between mask registers and a scalar type.
45640 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45641 DstVT.isScalarInteger()) &&
45642 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
45643 SrcVT.isScalarInteger()))
45644 return SDValue();
45645
45646 SDValue LHS, RHS;
45647
45648 // Look for logic ops.
45650 return SDValue();
45651
45652 // If either operand was bitcast from DstVT, then perform logic with DstVT (at
45653 // least one of the getBitcast() will fold away).
45654 if (sd_match(LHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))) ||
45655      sd_match(RHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))))
45656    return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45657 DAG.getBitcast(DstVT, LHS), DAG.getBitcast(DstVT, RHS));
45658
45659 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
45660 // Most of these have to move a constant from the scalar domain anyway.
45661  if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
45662    RHS = combinevXi1ConstantToInteger(RHS, DAG);
45663    return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45664 DAG.getBitcast(DstVT, LHS), RHS);
45665 }
45666
45667 return SDValue();
45668}
45669
45670static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
45671                                    const X86Subtarget &Subtarget) {
45672 SDLoc DL(BV);
45673 unsigned NumElts = BV->getNumOperands();
45674 SDValue Splat = BV->getSplatValue();
45675
45676 // Build MMX element from integer GPR or SSE float values.
45677 auto CreateMMXElement = [&](SDValue V) {
45678 if (V.isUndef())
45679 return DAG.getUNDEF(MVT::x86mmx);
45680 if (V.getValueType().isFloatingPoint()) {
45681 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
45682 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
45683 V = DAG.getBitcast(MVT::v2i64, V);
45684 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
45685 }
45686 V = DAG.getBitcast(MVT::i32, V);
45687 } else {
45688 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
45689 }
45690 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
45691 };
45692
45693 // Convert build vector ops to MMX data in the bottom elements.
45694  SmallVector<SDValue, 8> Ops;
45695
45696 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45697
45698 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
45699 if (Splat) {
45700 if (Splat.isUndef())
45701 return DAG.getUNDEF(MVT::x86mmx);
45702
45703 Splat = CreateMMXElement(Splat);
45704
45705 if (Subtarget.hasSSE1()) {
45706 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
45707 if (NumElts == 8)
45708 Splat = DAG.getNode(
45709 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45710 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
45711 TLI.getPointerTy(DAG.getDataLayout())),
45712 Splat, Splat);
45713
45714 // Use PSHUFW to repeat 16-bit elements.
45715 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
45716 return DAG.getNode(
45717 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45718 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
45719 TLI.getPointerTy(DAG.getDataLayout())),
45720 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
45721 }
45722 Ops.append(NumElts, Splat);
45723 } else {
45724 for (unsigned i = 0; i != NumElts; ++i)
45725 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
45726 }
45727
45728 // Use tree of PUNPCKLs to build up general MMX vector.
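  // e.g. a v8i8 build vector is combined 8 -> 4 -> 2 -> 1 using punpcklbw,
  // then punpcklwd, then punpckldq.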
45729 while (Ops.size() > 1) {
45730 unsigned NumOps = Ops.size();
45731 unsigned IntrinOp =
45732 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
45733 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
45734 : Intrinsic::x86_mmx_punpcklbw));
45735 SDValue Intrin = DAG.getTargetConstant(
45736 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
45737 for (unsigned i = 0; i != NumOps; i += 2)
45738 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
45739 Ops[i], Ops[i + 1]);
45740 Ops.resize(NumOps / 2);
45741 }
45742
45743 return Ops[0];
45744}
45745
45746// Recursive function that attempts to find if a bool vector node was originally
45747// a vector/float/double that got truncated/extended/bitcast to/from a scalar
45748// integer. If so, replace the scalar ops with bool vector equivalents back down
45749// the chain.
45750static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
45751                                          SelectionDAG &DAG,
45752 const X86Subtarget &Subtarget,
45753 unsigned Depth = 0) {
45754  if (Depth >= SelectionDAG::MaxRecursionDepth)
45755    return SDValue(); // Limit search depth.
45756
45757 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45758 unsigned Opc = V.getOpcode();
45759 switch (Opc) {
45760 case ISD::BITCAST: {
45761 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
45762 SDValue Src = V.getOperand(0);
45763 EVT SrcVT = Src.getValueType();
45764 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
45765 return DAG.getBitcast(VT, Src);
45766 break;
45767 }
45768 case ISD::Constant: {
45769 auto *C = cast<ConstantSDNode>(V);
45770 if (C->isZero())
45771 return DAG.getConstant(0, DL, VT);
45772 if (C->isAllOnes())
45773 return DAG.getAllOnesConstant(DL, VT);
45774 break;
45775 }
45776 case ISD::TRUNCATE: {
45777 // If we find a suitable source, a truncated scalar becomes a subvector.
45778 SDValue Src = V.getOperand(0);
45779 EVT NewSrcVT =
45780 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
45781 if (TLI.isTypeLegal(NewSrcVT))
45782 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45783 Subtarget, Depth + 1))
45784 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
45785 DAG.getVectorIdxConstant(0, DL));
45786 break;
45787 }
45788 case ISD::ANY_EXTEND:
45789 case ISD::ZERO_EXTEND: {
45790 // If we find a suitable source, an extended scalar becomes a subvector.
45791 SDValue Src = V.getOperand(0);
45792 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
45793 Src.getScalarValueSizeInBits());
45794 if (TLI.isTypeLegal(NewSrcVT))
45795 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45796 Subtarget, Depth + 1))
45797 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
45798 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
45799 : DAG.getConstant(0, DL, VT),
45800 N0, DAG.getVectorIdxConstant(0, DL));
45801 break;
45802 }
45803 case ISD::OR:
45804 case ISD::XOR: {
45805 // If we find suitable sources, we can just move the op to the vector
45806 // domain.
45807 if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
45808 Subtarget, Depth + 1))
45809 if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
45810 Subtarget, Depth + 1))
45811 return DAG.getNode(Opc, DL, VT, N0, N1);
45812 break;
45813 }
45814 case ISD::SHL: {
45815 // If we find a suitable source, a SHL becomes a KSHIFTL.
45816 SDValue Src0 = V.getOperand(0);
45817 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
45818 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
45819 break;
45820
45821 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
45822 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
45823 Depth + 1))
45824 return DAG.getNode(
45825 X86ISD::KSHIFTL, DL, VT, N0,
45826 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
45827 break;
45828 }
45829 }
45830
45831 // Does the inner bitcast already exist?
45832 if (Depth > 0)
45833 if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
45834 return SDValue(Alt, 0);
45835
45836 return SDValue();
45837}
45838
45839static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
45840                              TargetLowering::DAGCombinerInfo &DCI,
45841                              const X86Subtarget &Subtarget) {
45842 SDValue N0 = N->getOperand(0);
45843 EVT VT = N->getValueType(0);
45844 EVT SrcVT = N0.getValueType();
45845 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45846
45847 // Try to match patterns such as
45848 // (i16 bitcast (v16i1 x))
45849 // ->
45850 // (i16 movmsk (16i8 sext (v16i1 x)))
45851 // before the setcc result is scalarized on subtargets that don't have legal
45852 // vxi1 types.
45853 if (DCI.isBeforeLegalize()) {
45854 SDLoc dl(N);
45855 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
45856 return V;
45857
45858 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45859 // type, widen both sides to avoid a trip through memory.
45860 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
45861 Subtarget.hasAVX512()) {
45862 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
45863 N0 = DAG.getBitcast(MVT::v8i1, N0);
45864 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
45865 DAG.getVectorIdxConstant(0, dl));
45866 }
45867
45868 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45869 // type, widen both sides to avoid a trip through memory.
45870 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
45871 Subtarget.hasAVX512()) {
45872 // Use zeros for the widening if we already have some zeroes. This can
45873 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
45874 // stream of this.
45875 // FIXME: It might make sense to detect a concat_vectors with a mix of
45876 // zeroes and undef and turn it into insert_subvector for i1 vectors as
45877 // a separate combine. What we can't do is canonicalize the operands of
45878 // such a concat or we'll get into a loop with SimplifyDemandedBits.
45879 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
45880 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
45881 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
45882 SrcVT = LastOp.getValueType();
45883 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45884          SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
45885          Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
45886 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45887 N0 = DAG.getBitcast(MVT::i8, N0);
45888 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45889 }
45890 }
45891
45892 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45893 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
45894 Ops[0] = N0;
45895 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45896 N0 = DAG.getBitcast(MVT::i8, N0);
45897 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45898 }
45899 } else if (DCI.isAfterLegalizeDAG()) {
45900 // If we're bitcasting from iX to vXi1, see if the integer originally
45901 // began as a vXi1 and whether we can remove the bitcast entirely.
45902 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
45903 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
45904 if (SDValue V =
45905 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
45906 return V;
45907 }
45908 }
45909
45910 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
45911 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
45912 // due to insert_subvector legalization on KNL. By promoting the copy to i16
45913 // we can help with known bits propagation from the vXi1 domain to the
45914 // scalar domain.
45915 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
45916 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45917 N0.getOperand(0).getValueType() == MVT::v16i1 &&
45918      isNullConstant(N0.getOperand(1)))
45919    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
45920 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
45921
45922 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
45923 // and the vbroadcast_load are both integer or both fp. In some cases this
45924 // will remove the bitcast entirely.
45925 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
45926 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
45927 auto *BCast = cast<MemIntrinsicSDNode>(N0);
45928 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
45929 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45930    // Don't swap i8/i16 since we don't have fp types of that size.
45931 if (MemSize >= 32) {
45932 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
45933 : MVT::getIntegerVT(MemSize);
45934 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
45935 : MVT::getIntegerVT(SrcVTSize);
45936 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
45937
45938 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
45939 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45940 SDValue ResNode =
45941          DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
45942                                  MemVT, BCast->getMemOperand());
45943 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
45944 return DAG.getBitcast(VT, ResNode);
45945 }
45946 }
45947
45948 // Attempt to peek through f16 bitcasted extractions hidden by truncation.
45949 if (VT == MVT::f16 && SrcVT == MVT::i16) {
45950 SDValue Src = peekThroughTruncates(N0);
45951 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45952 Src.getOperand(0).getValueSizeInBits() == 128 &&
45953 isNullConstant(Src.getOperand(1))) {
45954 SDLoc DL(N);
45955 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45956 DAG.getBitcast(MVT::v8f16, Src.getOperand(0)),
45957 DAG.getVectorIdxConstant(0, DL));
45958 }
45959 }
45960
45961 // Since MMX types are special and don't usually play with other vector types,
45962 // it's better to handle them early to be sure we emit efficient code by
45963 // avoiding store-load conversions.
45964 if (VT == MVT::x86mmx) {
45965 // Detect MMX constant vectors.
45966 APInt UndefElts;
45967 SmallVector<APInt, 1> EltBits;
45968 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
45969 /*AllowWholeUndefs*/ true,
45970 /*AllowPartialUndefs*/ true)) {
45971 SDLoc DL(N0);
45972 // Handle zero-extension of i32 with MOVD.
45973 if (EltBits[0].countl_zero() >= 32)
45974 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
45975 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
45976 // Else, bitcast to a double.
45977 // TODO - investigate supporting sext 32-bit immediates on x86_64.
45978 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
45979 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
45980 }
45981
45982 // Detect bitcasts to x86mmx low word.
45983 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45984 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
45985 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
45986 bool LowUndef = true, AllUndefOrZero = true;
45987 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
45988 SDValue Op = N0.getOperand(i);
45989 LowUndef &= Op.isUndef() || (i >= e/2);
45990 AllUndefOrZero &= isNullConstantOrUndef(Op);
45991 }
45992 if (AllUndefOrZero) {
45993 SDValue N00 = N0.getOperand(0);
45994 SDLoc dl(N00);
45995 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
45996 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
45997 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
45998 }
45999 }
46000
46001 // Detect bitcasts of 64-bit build vectors and convert to a
46002 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
46003 // lowest element.
46004 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
46005 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
46006 SrcVT == MVT::v8i8))
46007 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
46008
46009 // Detect bitcasts between element or subvector extraction to x86mmx.
46010 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
46011         N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
46012        isNullConstant(N0.getOperand(1))) {
46013 SDValue N00 = N0.getOperand(0);
46014 if (N00.getValueType().is128BitVector())
46015 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
46016 DAG.getBitcast(MVT::v2i64, N00));
46017 }
46018
46019 // Detect bitcasts from FP_TO_SINT to x86mmx.
46020 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
46021 SDLoc DL(N0);
46022 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
46023 DAG.getUNDEF(MVT::v2i32));
46024 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
46025 DAG.getBitcast(MVT::v2i64, Res));
46026 }
46027 }
46028
46029 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
46030 // most of these to scalar anyway.
46031 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
46032 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
46033      ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
46034    return combinevXi1ConstantToInteger(N0, DAG);
46035 }
46036
46037 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
46038 VT.getVectorElementType() == MVT::i1) {
46039 if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
46040 if (C->isAllOnes())
46041 return DAG.getConstant(1, SDLoc(N0), VT);
46042 if (C->isZero())
46043 return DAG.getConstant(0, SDLoc(N0), VT);
46044 }
46045 }
46046
46047 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
46048 // Turn it into a sign bit compare that produces a k-register. This avoids
46049 // a trip through a GPR.
46050 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
46051 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
46052      isPowerOf2_32(VT.getVectorNumElements())) {
46053    unsigned NumElts = VT.getVectorNumElements();
46054 SDValue Src = N0;
46055
46056 // Peek through truncate.
46057 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
46058 Src = N0.getOperand(0);
46059
46060 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
46061 SDValue MovmskIn = Src.getOperand(0);
46062 MVT MovmskVT = MovmskIn.getSimpleValueType();
46063 unsigned MovMskElts = MovmskVT.getVectorNumElements();
46064
46065 // We allow extra bits of the movmsk to be used since they are known zero.
46066 // We can't convert a VPMOVMSKB without avx512bw.
46067 if (MovMskElts <= NumElts &&
46068 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
46069 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
46070 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
46071 SDLoc dl(N);
46072 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
46073 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
46074 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
46075 if (EVT(CmpVT) == VT)
46076 return Cmp;
46077
46078 // Pad with zeroes up to original VT to replace the zeroes that were
46079 // being used from the MOVMSK.
46080 unsigned NumConcats = NumElts / MovMskElts;
46081 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
46082 Ops[0] = Cmp;
46083 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
46084 }
46085 }
46086 }
46087
46088 // Try to remove bitcasts from input and output of mask arithmetic to
46089 // remove GPR<->K-register crossings.
46090 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
46091 return V;
46092
46093 // bitcast(v1Ty insert_vector_elt(X, Y, 0)) --> Y
46094 if (N0.getOpcode() == ISD::INSERT_VECTOR_ELT && SrcVT.getScalarType() == VT &&
46095 SrcVT.getVectorNumElements() == 1)
46096 return N0.getOperand(1);
46097
46098 // Convert a bitcasted integer logic operation that has one bitcasted
46099 // floating-point operand into a floating-point logic operation. This may
46100 // create a load of a constant, but that is cheaper than materializing the
46101 // constant in an integer register and transferring it to an SSE register or
46102 // transferring the SSE operand to integer register and back.
46103 unsigned FPOpcode;
46104 switch (N0.getOpcode()) {
46105 // clang-format off
46106 case ISD::AND: FPOpcode = X86ISD::FAND; break;
46107 case ISD::OR: FPOpcode = X86ISD::FOR; break;
46108 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
46109 default: return SDValue();
46110 // clang-format on
46111 }
46112
46113 // Check if we have a bitcast from another integer type as well.
46114 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
46115 (Subtarget.hasSSE2() && VT == MVT::f64) ||
46116 (Subtarget.hasFP16() && VT == MVT::f16) ||
46117 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
46118 TLI.isTypeLegal(VT))))
46119 return SDValue();
46120
46121 SDValue LogicOp0 = N0.getOperand(0);
46122 SDValue LogicOp1 = N0.getOperand(1);
46123 SDLoc DL0(N0);
46124
46125 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
46126 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
46127 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
46128 LogicOp0.getOperand(0).getValueType() == VT &&
46129 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
46130 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
46131 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46132 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
46133 }
46134 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
46135 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
46136 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
46137 LogicOp1.getOperand(0).getValueType() == VT &&
46138 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
46139 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
46140 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46141 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
46142 }
46143
46144 return SDValue();
46145}
46146
46147// (mul (zext a), (sext b))
46148static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
46149 SDValue &Op1) {
46150 Op0 = Mul.getOperand(0);
46151 Op1 = Mul.getOperand(1);
46152
46153  // Canonicalize so that the sign-extended operand (if any) is Op1.
46154 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
46155 std::swap(Op0, Op1);
46156
46157 auto IsFreeTruncation = [](SDValue &Op) -> bool {
46158 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
46159 Op.getOpcode() == ISD::SIGN_EXTEND) &&
46160 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
46161 return true;
46162
46163 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
46164 return (BV && BV->isConstant());
46165 };
46166
46167  // (dpbusd (zext a), (sext b)). Since the first operand should be an
46168  // unsigned value, we check that Op0 is a zero-extended value. Op1 should
46169  // be a signed value, so we just check its sign bits.
46170 if ((IsFreeTruncation(Op0) &&
46171 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
46172 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
46173 return true;
46174
46175 return false;
46176}
46177
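// VPDPBUSD multiplies each unsigned i8 element of its second source with the
// corresponding signed i8 element of its third source and accumulates every
// group of four adjacent products into the matching i32 element of the first
// (accumulator) source.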
46178static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
46179                              unsigned &LogBias, const SDLoc &DL,
46180 const X86Subtarget &Subtarget) {
46181 // Extend or truncate to MVT::i8 first.
46182 MVT Vi8VT =
46183 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
46184 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
46185 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
46186
46187 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
46188 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
46189 // The src A, B element type is i8, but the dst C element type is i32.
46190  // When we calculate the reduction stages we use the src vector type vXi8,
46191  // but VPDPBUSD already reduces groups of 4, so LogBias = 2 skips 2 stages.
46192 LogBias = 2;
46193
46194 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
46195 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
46196 RegSize = std::max(512u, RegSize);
46197
46198 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
46199 // fill in the missing vector elements with 0.
46200 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
46201 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
46202 Ops[0] = LHS;
46203 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46204 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46205 Ops[0] = RHS;
46206 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46207
46208 // Actually build the DotProduct, split as 256/512 bits for
46209 // AVXVNNI/AVX512VNNI.
46210 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46211                      ArrayRef<SDValue> Ops) {
46212    MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
46213 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
46214 };
46215 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
46216 SDValue Zero = DAG.getConstant(0, DL, DpVT);
46217
46218 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
46219 DpBuilder, /*CheckBWI=*/false, Subtarget.hasVNNI());
46220}
46221
46222// Create a PSADBW given two sources representable as zexts of vXi8.
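// PSADBW computes, for each 64-bit lane, the sum of absolute differences of
// the eight byte pairs, zero-extended to 64 bits; with an all-zero second
// operand it therefore reduces eight u8 values to their sum.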
46223static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1,
46224                            const SDLoc &DL, const X86Subtarget &Subtarget) {
46225 // Find the appropriate width for the PSADBW.
46226 EVT DstVT = N0.getValueType();
46227 EVT SrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8,
46228 DstVT.getVectorElementCount());
46229 unsigned RegSize = std::max(128u, (unsigned)SrcVT.getSizeInBits());
46230
46231 // Widen the vXi8 vectors, padding with zero vector elements.
46232 unsigned NumConcat = RegSize / SrcVT.getSizeInBits();
46233 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, SrcVT));
46234 Ops[0] = DAG.getZExtOrTrunc(N0, DL, SrcVT);
46235 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46236 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46237 Ops[0] = DAG.getZExtOrTrunc(N1, DL, SrcVT);
46238 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46239
46240 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
46241 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46242                          ArrayRef<SDValue> Ops) {
46243    MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
46244 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
46245 };
46246 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
46247 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {SadOp0, SadOp1},
46248 PSADBWBuilder);
46249}
46250
46251// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
46252// PHMINPOSUW.
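// PHMINPOSUW returns the minimum unsigned i16 element of its v8i16 source in
// bits [15:0] of the result (and that element's index in bits [18:16]), so
// SMIN/SMAX/UMAX reductions are handled by flipping the values with a mask
// before and after the instruction.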
46253static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
46254                                      const X86Subtarget &Subtarget) {
46255 // Bail without SSE41.
46256 if (!Subtarget.hasSSE41())
46257 return SDValue();
46258
46259 EVT ExtractVT = Extract->getValueType(0);
46260 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
46261 return SDValue();
46262
46263 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
46264 ISD::NodeType BinOp;
46265 SDValue Src = DAG.matchBinOpReduction(
46266 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
46267 if (!Src)
46268 return SDValue();
46269
46270 EVT SrcVT = Src.getValueType();
46271 EVT SrcSVT = SrcVT.getScalarType();
46272 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
46273 return SDValue();
46274
46275 SDLoc DL(Extract);
46276 SDValue MinPos = Src;
46277
46278 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
46279 while (SrcVT.getSizeInBits() > 128) {
46280 SDValue Lo, Hi;
46281 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
46282 SrcVT = Lo.getValueType();
46283 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
46284 }
46285 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
46286 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
46287 "Unexpected value type");
46288
46289 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
46290 // to flip the value accordingly.
46291 SDValue Mask;
46292 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
46293 if (BinOp == ISD::SMAX)
46294 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
46295 else if (BinOp == ISD::SMIN)
46296 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
46297 else if (BinOp == ISD::UMAX)
46298 Mask = DAG.getAllOnesConstant(DL, SrcVT);
46299
46300 if (Mask)
46301 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46302
46303 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
46304 // shuffling each upper element down and insert zeros. This means that the
46305 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
46306 // ready for the PHMINPOS.
46307 if (ExtractVT == MVT::i8) {
46308    SDValue Upper = DAG.getVectorShuffle(
46309        SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
46310 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
46311 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
46312 }
46313
46314  // Perform the PHMINPOS on a v8i16 vector.
46315 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
46316 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
46317 MinPos = DAG.getBitcast(SrcVT, MinPos);
46318
46319 if (Mask)
46320 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46321
46322 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
46323 DAG.getVectorIdxConstant(0, DL));
46324}
46325
46326// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
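// For example (illustrative): an AND reduction of a v4i32 compare result
// becomes (MOVMSKPS == 0xf), an OR reduction becomes (MOVMSKPS != 0), and an
// XOR (parity) reduction becomes PARITY(MOVMSKPS).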
46327static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
46328 const X86Subtarget &Subtarget) {
46329 // Bail without SSE2.
46330 if (!Subtarget.hasSSE2())
46331 return SDValue();
46332
46333 EVT ExtractVT = Extract->getValueType(0);
46334 unsigned BitWidth = ExtractVT.getSizeInBits();
46335 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
46336 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
46337 return SDValue();
46338
46339 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
46340 ISD::NodeType BinOp;
46341 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
46342 if (!Match && ExtractVT == MVT::i1)
46343 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
46344 if (!Match)
46345 return SDValue();
46346
46347 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
46348 // which we can't support here for now.
46349 if (Match.getScalarValueSizeInBits() != BitWidth)
46350 return SDValue();
46351
46352 SDValue Movmsk;
46353 SDLoc DL(Extract);
46354 EVT MatchVT = Match.getValueType();
46355 unsigned NumElts = MatchVT.getVectorNumElements();
46356 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
46357 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46358 LLVMContext &Ctx = *DAG.getContext();
46359
46360 if (ExtractVT == MVT::i1) {
46361 // Special case for (pre-legalization) vXi1 reductions.
46362 if (NumElts > 64 || !isPowerOf2_32(NumElts))
46363 return SDValue();
46364 if (Match.getOpcode() == ISD::SETCC) {
46365 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
46366 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
46367 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
46368 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
46369 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
46370 X86::CondCode X86CC;
46371 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
46372 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
46373 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
46374 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
46375 DAG, X86CC))
46376 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
46377 getSETCC(X86CC, V, DL, DAG));
46378 }
46379 }
46380 if (TLI.isTypeLegal(MatchVT)) {
46381 // If this is a legal AVX512 predicate type then we can just bitcast.
46382 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46383 Movmsk = DAG.getBitcast(MovmskVT, Match);
46384 } else {
46385 // Use combineBitcastvxi1 to create the MOVMSK.
46386 while (NumElts > MaxElts) {
46387 SDValue Lo, Hi;
46388 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46389 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46390 NumElts /= 2;
46391 }
46392 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46393 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
46394 }
46395 if (!Movmsk)
46396 return SDValue();
46397 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
46398 } else {
46399 // FIXME: Better handling of k-registers or 512-bit vectors?
46400 unsigned MatchSizeInBits = Match.getValueSizeInBits();
46401 if (!(MatchSizeInBits == 128 ||
46402 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
46403 return SDValue();
46404
46405 // Make sure this isn't a vector of 1 element. The perf win from using
46406 // MOVMSK diminishes with fewer elements in the reduction, but it is
46407 // generally better to get the comparison over to the GPRs as soon as
46408 // possible to reduce the number of vector ops.
46409 if (Match.getValueType().getVectorNumElements() < 2)
46410 return SDValue();
46411
46412 // Check that we are extracting a reduction of all sign bits.
46413 if (DAG.ComputeNumSignBits(Match) != BitWidth)
46414 return SDValue();
46415
46416 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
46417 SDValue Lo, Hi;
46418 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46419 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46420 MatchSizeInBits = Match.getValueSizeInBits();
46421 }
46422
46423 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
46424 MVT MaskSrcVT;
46425 if (64 == BitWidth || 32 == BitWidth)
46426 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
46427 MatchSizeInBits / BitWidth);
46428 else
46429 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
46430
46431 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
46432 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
46433 NumElts = MaskSrcVT.getVectorNumElements();
46434 }
46435 assert((NumElts <= 32 || NumElts == 64) &&
46436 "Not expecting more than 64 elements");
46437
46438 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
46439 if (BinOp == ISD::XOR) {
46440 // parity -> (PARITY(MOVMSK X))
46441 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
46442 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
46443 }
46444
46445 SDValue CmpC;
46446 ISD::CondCode CondCode;
46447 if (BinOp == ISD::OR) {
46448 // any_of -> MOVMSK != 0
46449 CmpC = DAG.getConstant(0, DL, CmpVT);
46450 CondCode = ISD::CondCode::SETNE;
46451 } else {
46452 // all_of -> MOVMSK == ((1 << NumElts) - 1)
46453 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
46454 DL, CmpVT);
46455 CondCode = ISD::CondCode::SETEQ;
46456 }
46457
46458 // The setcc produces an i8 of 0/1, so extend that to the result width and
46459 // negate to get the final 0/-1 mask value.
46460 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
46461 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
46462 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
46463 return DAG.getNegative(Zext, DL, ExtractVT);
46464}
46465
46466static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
46467 const X86Subtarget &Subtarget) {
46468 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
46469 return SDValue();
46470
46471 EVT ExtractVT = Extract->getValueType(0);
46472 // Verify the type we're extracting is i32, as the output element type of
46473 // vpdpbusd is i32.
46474 if (ExtractVT != MVT::i32)
46475 return SDValue();
46476
46477 EVT VT = Extract->getOperand(0).getValueType();
46478 if (!isPowerOf2_32(VT.getVectorNumElements()))
46479 return SDValue();
46480
46481 // Match shuffle + add pyramid.
46482 ISD::NodeType BinOp;
46483 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46484
46485 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
46486 // done by vpdpbusd compute a signed 16-bit product that will be sign extended
46487 // before adding into the accumulator.
46488 // TODO:
46489 // We also need to verify that the multiply has at least 2x the number of bits
46490 // of the input. We shouldn't match
46491 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
46492 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
46493 // Root = Root.getOperand(0);
46494
46495 // If there was a match, we want Root to be a mul.
46496 if (!Root || Root.getOpcode() != ISD::MUL)
46497 return SDValue();
46498
46499 // Check whether we have an extend and mul pattern
46500 SDValue LHS, RHS;
46501 if (!detectExtMul(DAG, Root, LHS, RHS))
46502 return SDValue();
46503
46504 // Create the dot product instruction.
46505 SDLoc DL(Extract);
46506 unsigned StageBias;
46507 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
46508
46509 // If the original vector was wider than 4 elements, sum over the results
46510 // in the DP vector.
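  // For example (illustrative, assuming a v16i8 input and a v4i32 DP vector):
  // Stages == 4 and two shuffle+add steps remain, first folding DP lanes
  // [2,3] into [0,1] and then lane [1] into [0], leaving the full sum in
  // element 0.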
46511 unsigned Stages = Log2_32(VT.getVectorNumElements());
46512 EVT DpVT = DP.getValueType();
46513
46514 if (Stages > StageBias) {
46515 unsigned DpElems = DpVT.getVectorNumElements();
46516
46517 for (unsigned i = Stages - StageBias; i > 0; --i) {
46518 SmallVector<int, 16> Mask(DpElems, -1);
46519 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46520 Mask[j] = MaskEnd + j;
46521
46522 SDValue Shuffle =
46523 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
46524 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
46525 }
46526 }
46527
46528 // Return the lowest ExtractSizeInBits bits.
46529 EVT ResVT =
46530 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46531 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
46532 DP = DAG.getBitcast(ResVT, DP);
46533 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
46534 Extract->getOperand(1));
46535}
46536
46537static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
46538 const X86Subtarget &Subtarget) {
46539 using namespace SDPatternMatch;
46540
46541 // PSADBW is only supported on SSE2 and up.
46542 if (!Subtarget.hasSSE2())
46543 return SDValue();
46544
46545 EVT ExtractVT = Extract->getValueType(0);
46546 if (ExtractVT != MVT::i8 && ExtractVT != MVT::i16 && ExtractVT != MVT::i32 &&
46547 ExtractVT != MVT::i64)
46548 return SDValue();
46549
46550 EVT VT = Extract->getOperand(0).getValueType();
46551 if (!isPowerOf2_32(VT.getVectorNumElements()))
46552 return SDValue();
46553
46554 // Match shuffle + add pyramid.
46555 ISD::NodeType BinOp;
46556 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46557 if (!Root)
46558 return SDValue();
46559
46560 // The operand is expected to be zero extended from i8.
46561 // In order to convert to i64 and above, additional any/zero/sign
46562 // extend is expected.
46563 // The zero extend from 32 bit has no mathematical effect on the result.
46564 // Also the sign extend is basically a zero extend
46565 // (it extends the sign bit, which is zero).
46566 // So it is correct to skip the sign/zero extend instruction.
46567 if (Root.getOpcode() == ISD::SIGN_EXTEND ||
46568 Root.getOpcode() == ISD::ZERO_EXTEND ||
46569 Root.getOpcode() == ISD::ANY_EXTEND)
46570 Root = Root.getOperand(0);
46571
46572 // Check whether we have a vXi8 abdu pattern.
46573 // TODO: Just match ISD::ABDU once the DAG is topological sorted.
46574 SDValue Src0, Src1;
46575 if (!sd_match(
46576 Root,
46577 m_AnyOf(
46578 m_SpecificVectorElementVT(
46579 MVT::i8, m_c_BinOp(ISD::ABDU, m_Value(Src0), m_Value(Src1))),
46580 m_SpecificVectorElementVT(
46581 MVT::i8, m_Sub(m_UMax(m_Value(Src0), m_Value(Src1)),
46582 m_UMin(m_Deferred(Src0), m_Deferred(Src1)))),
46583 m_Abs(
46584 m_Sub(m_AllOf(m_Value(Src0),
46585 m_ZExt(m_SpecificVectorElementVT(MVT::i8))),
46586 m_AllOf(m_Value(Src1),
46587 m_ZExt(m_SpecificVectorElementVT(MVT::i8))))))))
46588 return SDValue();
46589
46590 // Create the SAD instruction.
46591 SDLoc DL(Extract);
46592 SDValue SAD = createPSADBW(DAG, Src0, Src1, DL, Subtarget);
46593
46594 // If the original vector was wider than 8 elements, sum over the results
46595 // in the SAD vector.
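  // For example (illustrative): a v32i8 absolute-difference input on an AVX2
  // target yields a v4i64 SAD (one partial sum per 8-byte group); Stages == 5,
  // so the loop below folds SAD lanes [2,3] into [0,1] and then [1] into [0]
  // before the low bits are extracted.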
46596 unsigned Stages = Log2_32(VT.getVectorNumElements());
46597 EVT SadVT = SAD.getValueType();
46598 if (Stages > 3) {
46599 unsigned SadElems = SadVT.getVectorNumElements();
46600
46601 for (unsigned i = Stages - 3; i > 0; --i) {
46602 SmallVector<int, 16> Mask(SadElems, -1);
46603 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46604 Mask[j] = MaskEnd + j;
46605
46606 SDValue Shuffle =
46607 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
46608 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
46609 }
46610 }
46611
46612 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
46613 // Return the lowest ExtractSizeInBits bits.
46614 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46615 SadVT.getSizeInBits() / ExtractSizeInBits);
46616 SAD = DAG.getBitcast(ResVT, SAD);
46617 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
46618 Extract->getOperand(1));
46619}
46620
46621// If this extract is from a loaded vector value and will be used as an
46622// integer, that requires a potentially expensive XMM -> GPR transfer.
46623// Additionally, if we can convert to a scalar integer load, that will likely
46624// be folded into a subsequent integer op.
46625// Note: SrcVec might not have a VecVT type, but it must be the same size.
46626// Note: Unlike the related fold for this in DAGCombiner, this is not limited
46627// to a single-use of the loaded vector. For the reasons above, we
46628// expect this to be profitable even if it creates an extra load.
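// For example (illustrative): extracting element 2 of a loaded <4 x i32> as an
// integer becomes a scalar i32 load at offset 8 from the vector's base
// pointer, chained so it keeps the original load's memory ordering.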
46629static SDValue
46630combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
46631 const SDLoc &dl, SelectionDAG &DAG,
46632 TargetLowering::DAGCombinerInfo &DCI) {
46633 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46634 "Only EXTRACT_VECTOR_ELT supported so far");
46635
46636 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46637 EVT VT = N->getValueType(0);
46638
46639 bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) {
46640 return Use->getOpcode() == ISD::STORE ||
46641 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
46642 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
46643 });
46644
46645 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
46646 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
46647 VecVT.getVectorElementType() == VT &&
46648 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
46649 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
46650 SDValue NewPtr = TLI.getVectorElementPointer(
46651 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
46652 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
46653 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
46654 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
46655 SDValue Load =
46656 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
46657 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
46658 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
46659 return Load;
46660 }
46661
46662 return SDValue();
46663}
46664
46665// Attempt to peek through a target shuffle and extract the scalar from the
46666// source.
46667static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
46668 TargetLowering::DAGCombinerInfo &DCI,
46669 const X86Subtarget &Subtarget) {
46670 if (DCI.isBeforeLegalizeOps())
46671 return SDValue();
46672
46673 SDLoc dl(N);
46674 SDValue Src = N->getOperand(0);
46675 SDValue Idx = N->getOperand(1);
46676
46677 EVT VT = N->getValueType(0);
46678 EVT SrcVT = Src.getValueType();
46679 EVT SrcSVT = SrcVT.getVectorElementType();
46680 unsigned SrcEltBits = SrcSVT.getSizeInBits();
46681 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46682
46683 // Don't attempt this for boolean mask vectors or unknown extraction indices.
46684 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
46685 return SDValue();
46686
46687 const APInt &IdxC = N->getConstantOperandAPInt(1);
46688 if (IdxC.uge(NumSrcElts))
46689 return SDValue();
46690
46691 SDValue SrcBC = peekThroughBitcasts(Src);
46692
46693 // Handle extract(bitcast(broadcast(scalar_value))).
46694 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
46695 SDValue SrcOp = SrcBC.getOperand(0);
46696 EVT SrcOpVT = SrcOp.getValueType();
46697 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
46698 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
46699 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
46700 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
46701 // TODO support non-zero offsets.
46702 if (Offset == 0) {
46703 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
46704 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
46705 return SrcOp;
46706 }
46707 }
46708 }
46709
46710 // If we're extracting a single element from a broadcast load and there are
46711 // no other users, just create a single load.
46712 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD &&
46713 SrcBC.hasOneUse()) {
46714 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
46715 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
46716 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
46717 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
46718 SDValue Load =
46719 DAG.getLoad(VT, dl, MemIntr->getChain(), MemIntr->getBasePtr(),
46720 MemIntr->getPointerInfo(), MemIntr->getBaseAlign(),
46721 MemIntr->getMemOperand()->getFlags());
46722 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
46723 return Load;
46724 }
46725 }
46726
46727 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
46728 // TODO: Move to DAGCombine?
46729 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
46730 SrcBC.getValueType().isInteger() &&
46731 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
46732 SrcBC.getScalarValueSizeInBits() ==
46733 SrcBC.getOperand(0).getValueSizeInBits()) {
46734 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
46735 if (IdxC.ult(Scale)) {
46736 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
46737 SDValue Scl = SrcBC.getOperand(0);
46738 EVT SclVT = Scl.getValueType();
46739 if (Offset) {
46740 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
46741 DAG.getShiftAmountConstant(Offset, SclVT, dl));
46742 }
46743 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
46744 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
46745 return Scl;
46746 }
46747 }
46748
46749 // Handle extract(truncate(x)) for 0'th index.
46750 // TODO: Treat this as a faux shuffle?
46751 // TODO: When can we use this for general indices?
46752 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
46753 (SrcVT.getSizeInBits() % 128) == 0) {
46754 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
46755 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
46756 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
46757 Idx);
46758 }
46759
46760 // We can only legally extract other elements from 128-bit vectors and in
46761 // certain circumstances, depending on SSE-level.
46762 // TODO: Investigate float/double extraction if it will be just stored.
46763 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
46764 unsigned Idx) {
46765 EVT VecSVT = VecVT.getScalarType();
46766 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
46767 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
46768 VecSVT == MVT::i64)) {
46769 unsigned EltSizeInBits = VecSVT.getSizeInBits();
46770 unsigned NumEltsPerLane = 128 / EltSizeInBits;
46771 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
46772 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
46773 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
46774 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
46775 Idx &= (NumEltsPerLane - 1);
46776 }
46777 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
46778 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
46779 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
46780 DAG.getBitcast(VecVT, Vec),
46781 DAG.getVectorIdxConstant(Idx, dl));
46782 }
46783 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
46784 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
46785 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
46786 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
46787 DAG.getTargetConstant(Idx, dl, MVT::i8));
46788 }
46789 return SDValue();
46790 };
46791
46792 // Resolve the target shuffle inputs and mask.
46793 SmallVector<int, 16> Mask;
46794 SmallVector<SDValue, 2> Ops;
46795 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
46796 return SDValue();
46797
46798 // Shuffle inputs must be the same size as the result.
46799 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
46800 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
46801 }))
46802 return SDValue();
46803
46804 // Attempt to narrow/widen the shuffle mask to the correct size.
46805 if (Mask.size() != NumSrcElts) {
46806 if ((NumSrcElts % Mask.size()) == 0) {
46807 SmallVector<int, 16> ScaledMask;
46808 int Scale = NumSrcElts / Mask.size();
46809 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
46810 Mask = std::move(ScaledMask);
46811 } else if ((Mask.size() % NumSrcElts) == 0) {
46812 // Simplify Mask based on demanded element.
46813 int ExtractIdx = (int)IdxC.getZExtValue();
46814 int Scale = Mask.size() / NumSrcElts;
46815 int Lo = Scale * ExtractIdx;
46816 int Hi = Scale * (ExtractIdx + 1);
46817 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
46818 if (i < Lo || Hi <= i)
46819 Mask[i] = SM_SentinelUndef;
46820
46821 SmallVector<int, 16> WidenedMask;
46822 while (Mask.size() > NumSrcElts &&
46823 canWidenShuffleElements(Mask, WidenedMask))
46824 Mask = std::move(WidenedMask);
46825 }
46826 }
46827
46828 // If narrowing/widening failed, see if we can extract+zero-extend.
46829 int ExtractIdx;
46830 EVT ExtractVT;
46831 if (Mask.size() == NumSrcElts) {
46832 ExtractIdx = Mask[IdxC.getZExtValue()];
46833 ExtractVT = SrcVT;
46834 } else {
46835 unsigned Scale = Mask.size() / NumSrcElts;
46836 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
46837 return SDValue();
46838 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
46839 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
46840 return SDValue();
46841 ExtractIdx = Mask[ScaledIdx];
46842 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
46843 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
46844 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
46845 "Failed to widen vector type");
46846 }
46847
46848 // If the shuffle source element is undef/zero then we can just accept it.
46849 if (ExtractIdx == SM_SentinelUndef)
46850 return DAG.getUNDEF(VT);
46851
46852 if (ExtractIdx == SM_SentinelZero)
46853 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
46854 : DAG.getConstant(0, dl, VT);
46855
46856 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
46857 ExtractIdx = ExtractIdx % Mask.size();
46858 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
46859 return DAG.getZExtOrTrunc(V, dl, VT);
46860
46861 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
46862 if (SDValue V = combineExtractFromVectorLoad(
46863 N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
46864 return V;
46865
46866 return SDValue();
46867}
46868
46869/// Extracting a scalar FP value from vector element 0 is free, so extract each
46870/// operand first, then perform the math as a scalar op.
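/// For example (illustrative):
///   (f32 (extractelt (fadd v4f32 X, Y), 0))
///     --> (fadd (extractelt X, 0), (extractelt Y, 0))
/// which can then select a scalar ADDSS instead of a full-width ADDPS.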
46871static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
46872 const X86Subtarget &Subtarget,
46873 TargetLowering::DAGCombinerInfo &DCI) {
46874 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
46875 SDValue Vec = ExtElt->getOperand(0);
46876 SDValue Index = ExtElt->getOperand(1);
46877 EVT VT = ExtElt->getValueType(0);
46878 EVT VecVT = Vec.getValueType();
46879
46880 // TODO: If this is a unary/expensive/expand op, allow extraction from a
46881 // non-zero element because the shuffle+scalar op will be cheaper?
46882 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
46883 return SDValue();
46884
46885 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
46886 // extract, the condition code), so deal with those as a special-case.
46887 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
46888 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
46889 if (OpVT != MVT::f32 && OpVT != MVT::f64)
46890 return SDValue();
46891
46892 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
46893 SDLoc DL(ExtElt);
46894 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46895 Vec.getOperand(0), Index);
46896 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46897 Vec.getOperand(1), Index);
46898 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
46899 }
46900
46901 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
46902 VT != MVT::f64)
46903 return SDValue();
46904
46905 // Vector FP selects don't fit the pattern of FP math ops (because the
46906 // condition has a different type and we have to change the opcode), so deal
46907 // with those here.
46908 // FIXME: This is restricted to pre type legalization. If we loosen this we
46909 // need to convert vector bool to a scalar bool.
46910 if (DCI.isBeforeLegalize() && Vec.getOpcode() == ISD::VSELECT &&
46911 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
46912 Vec.getOperand(0).getOperand(0).getValueType() == VecVT &&
46913 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1) {
46914 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
46915 SDLoc DL(ExtElt);
46916 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
46917 Vec.getOperand(0).getValueType().getScalarType(),
46918 Vec.getOperand(0), Index);
46919 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46920 Vec.getOperand(1), Index);
46921 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46922 Vec.getOperand(2), Index);
46923 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
46924 }
46925
46926 // TODO: This switch could include FNEG and the x86-specific FP logic ops
46927 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
46928 // missed load folding and fma+fneg combining.
46929 switch (Vec.getOpcode()) {
46930 case ISD::FMA: // Begin 3 operands
46931 case ISD::FMAD:
46932 case ISD::FADD: // Begin 2 operands
46933 case ISD::FSUB:
46934 case ISD::FMUL:
46935 case ISD::FDIV:
46936 case ISD::FREM:
46937 case ISD::FCOPYSIGN:
46938 case ISD::FMINNUM:
46939 case ISD::FMAXNUM:
46940 case ISD::FMINNUM_IEEE:
46941 case ISD::FMAXNUM_IEEE:
46942 case ISD::FMAXIMUM:
46943 case ISD::FMINIMUM:
46944 case ISD::FMAXIMUMNUM:
46945 case ISD::FMINIMUMNUM:
46946 case X86ISD::FMAX:
46947 case X86ISD::FMIN:
46948 case ISD::FABS: // Begin 1 operand
46949 case ISD::FSQRT:
46950 case ISD::FRINT:
46951 case ISD::FCEIL:
46952 case ISD::FTRUNC:
46953 case ISD::FNEARBYINT:
46954 case ISD::FROUNDEVEN:
46955 case ISD::FROUND:
46956 case ISD::FFLOOR:
46957 case X86ISD::FRCP:
46958 case X86ISD::FRSQRT: {
46959 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46960 SDLoc DL(ExtElt);
46961 SmallVector<SDValue, 4> ExtOps;
46962 for (SDValue Op : Vec->ops())
46963 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
46964 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
46965 }
46966 default:
46967 return SDValue();
46968 }
46969 llvm_unreachable("All opcodes should return within switch");
46970}
46971
46972/// Try to convert a vector reduction sequence composed of binops and shuffles
46973/// into horizontal ops.
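/// For example (illustrative): with fast horizontal ops, a v4f32 fadd
/// reduction ending in (extractelt ..., 0) becomes two back-to-back HADDPS
/// ops followed by a free extract of element 0.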
46974static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
46975 const X86Subtarget &Subtarget) {
46976 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
46977
46978 // We need at least SSE2 to do anything here.
46979 if (!Subtarget.hasSSE2())
46980 return SDValue();
46981
46982 ISD::NodeType Opc;
46983 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
46984 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
46985 if (!Rdx)
46986 return SDValue();
46987
46988 SDValue Index = ExtElt->getOperand(1);
46989 assert(isNullConstant(Index) &&
46990 "Reduction doesn't end in an extract from index 0");
46991
46992 EVT VT = ExtElt->getValueType(0);
46993 EVT VecVT = Rdx.getValueType();
46994 if (VecVT.getScalarType() != VT)
46995 return SDValue();
46996
46997 SDLoc DL(ExtElt);
46998 unsigned NumElts = VecVT.getVectorNumElements();
46999 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
47000
47001 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
47002 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
47003 if (V.getValueType() == MVT::v4i8) {
47004 if (ZeroExtend && Subtarget.hasSSE41()) {
47005 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
47006 DAG.getConstant(0, DL, MVT::v4i32),
47007 DAG.getBitcast(MVT::i32, V),
47008 DAG.getVectorIdxConstant(0, DL));
47009 return DAG.getBitcast(MVT::v16i8, V);
47010 }
47011 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
47012 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
47013 : DAG.getUNDEF(MVT::v4i8));
47014 }
47015 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
47016 DAG.getUNDEF(MVT::v8i8));
47017 };
47018
47019 // vXi8 mul reduction - promote to vXi16 mul reduction.
47020 if (Opc == ISD::MUL) {
47021 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
47022 return SDValue();
47023 if (VecVT.getSizeInBits() >= 128) {
47024 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
47025 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47026 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47027 Lo = DAG.getBitcast(WideVT, Lo);
47028 Hi = DAG.getBitcast(WideVT, Hi);
47029 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
47030 while (Rdx.getValueSizeInBits() > 128) {
47031 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47032 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
47033 }
47034 } else {
47035 Rdx = WidenToV16I8(Rdx, false);
47036 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
47037 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
47038 }
47039 if (NumElts >= 8)
47040 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47041 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47042 {4, 5, 6, 7, -1, -1, -1, -1}));
47043 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47044 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47045 {2, 3, -1, -1, -1, -1, -1, -1}));
47046 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47047 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47048 {1, -1, -1, -1, -1, -1, -1, -1}));
47049 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47050 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47051 }
47052
47053 // vXi8 add reduction - sub 128-bit vector.
47054 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
47055 Rdx = WidenToV16I8(Rdx, true);
47056 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47057 DAG.getConstant(0, DL, MVT::v16i8));
47058 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47059 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47060 }
47061
47062 // Must be a >=128-bit vector with pow2 elements.
47063 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
47064 return SDValue();
47065
47066 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
47067 if (VT == MVT::i8) {
47068 while (Rdx.getValueSizeInBits() > 128) {
47069 SDValue Lo, Hi;
47070 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47071 VecVT = Lo.getValueType();
47072 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47073 }
47074 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
47075
47076 SDValue Hi = DAG.getVectorShuffle(
47077 MVT::v16i8, DL, Rdx, Rdx,
47078 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
47079 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
47080 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47081 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
47082 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47083 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47084 }
47085
47086 // See if we can use vXi8 PSADBW add reduction for larger zext types.
47087 // If the source vector values are 0-255, then we can use PSADBW to
47088 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
47089 // TODO: See if its worth avoiding vXi16/i32 truncations?
47090 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
47091 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
47092 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
47093 Subtarget.hasAVX512())) {
47094 if (Rdx.getValueType() == MVT::v8i16) {
47095 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
47096 DAG.getUNDEF(MVT::v8i16));
47097 } else {
47098 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
47099 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
47100 if (ByteVT.getSizeInBits() < 128)
47101 Rdx = WidenToV16I8(Rdx, true);
47102 }
47103
47104 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
47105 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47106 ArrayRef<SDValue> Ops) {
47107 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
47108 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
47109 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
47110 };
47111 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
47112 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
47113
47114 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
47115 while (Rdx.getValueSizeInBits() > 128) {
47116 SDValue Lo, Hi;
47117 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47118 VecVT = Lo.getValueType();
47119 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47120 }
47121 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
47122
47123 if (NumElts > 8) {
47124 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
47125 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
47126 }
47127
47128 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
47129 Rdx = DAG.getBitcast(VecVT, Rdx);
47130 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47131 }
47132
47133 // Only use (F)HADD opcodes if they aren't microcoded or when minimizing codesize.
47134 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
47135 return SDValue();
47136
47137 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
47138
47139 // 256-bit horizontal instructions operate on 128-bit chunks rather than
47140 // across the whole vector, so we need an extract + hop preliminary stage.
47141 // This is the only step where the operands of the hop are not the same value.
47142 // TODO: We could extend this to handle 512-bit or even longer vectors.
47143 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
47144 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
47145 unsigned NumElts = VecVT.getVectorNumElements();
47146 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
47147 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
47148 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
47149 VecVT = Rdx.getValueType();
47150 }
47151 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
47152 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
47153 return SDValue();
47154
47155 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
47156 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
47157 for (unsigned i = 0; i != ReductionSteps; ++i)
47158 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
47159
47160 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47161}
47162
47163/// Detect vector gather/scatter index generation and convert it from being a
47164/// bunch of shuffles and extracts into a somewhat faster sequence.
47165/// For i686, the best sequence is apparently storing the value and loading
47166/// scalars back, while for x64 we should use 64-bit extracts and shifts.
47167static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
47168 TargetLowering::DAGCombinerInfo &DCI,
47169 const X86Subtarget &Subtarget) {
47170 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
47171 return NewOp;
47172
47173 SDValue InputVector = N->getOperand(0);
47174 SDValue EltIdx = N->getOperand(1);
47175 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
47176
47177 EVT SrcVT = InputVector.getValueType();
47178 EVT VT = N->getValueType(0);
47179 SDLoc dl(InputVector);
47180 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
47181 unsigned NumSrcElts = SrcVT.getVectorNumElements();
47182 unsigned NumEltBits = VT.getScalarSizeInBits();
47183 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47184
47185 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
47186 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47187
47188 // Integer Constant Folding.
47189 if (CIdx && VT.isInteger()) {
47190 APInt UndefVecElts;
47191 SmallVector<APInt, 16> EltBits;
47192 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
47193 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
47194 EltBits, /*AllowWholeUndefs*/ true,
47195 /*AllowPartialUndefs*/ false)) {
47196 uint64_t Idx = CIdx->getZExtValue();
47197 if (UndefVecElts[Idx])
47198 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47199 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
47200 }
47201
47202 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
47203 // Improves lowering of bool masks in Rust, which splits them into a byte array.
47204 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
47205 SDValue Src = peekThroughBitcasts(InputVector);
47206 if (Src.getValueType().getScalarType() == MVT::i1 &&
47207 TLI.isTypeLegal(Src.getValueType())) {
47208 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
47209 SDValue Sub = DAG.getNode(
47210 ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
47211 DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl));
47212 return DAG.getBitcast(VT, Sub);
47213 }
47214 }
47215 }
47216
47217 if (IsPextr) {
47218 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
47219 DCI))
47220 return SDValue(N, 0);
47221
47222 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
47223 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
47224 InputVector.getOpcode() == X86ISD::PINSRW) &&
47225 InputVector.getOperand(2) == EltIdx) {
47226 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
47227 "Vector type mismatch");
47228 SDValue Scl = InputVector.getOperand(1);
47229 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
47230 return DAG.getZExtOrTrunc(Scl, dl, VT);
47231 }
47232
47233 // TODO - Remove this once we can handle the implicit zero-extension of
47234 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
47235 // combineBasicSADPattern.
47236 return SDValue();
47237 }
47238
47239 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
47240 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
47241 InputVector.getOpcode() == ISD::BITCAST &&
47242 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47243 isNullConstant(EltIdx) && InputVector.hasOneUse())
47244 return DAG.getBitcast(VT, InputVector);
47245
47246 // Detect mmx to i32 conversion through a v2i32 elt extract.
47247 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
47248 InputVector.getOpcode() == ISD::BITCAST &&
47249 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47250 isNullConstant(EltIdx) && InputVector.hasOneUse())
47251 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
47252 InputVector.getOperand(0));
47253
47254 // Check whether this extract is the root of a sum of absolute differences
47255 // pattern. This has to be done here because we really want it to happen
47256 // pre-legalization.
47257 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
47258 return SAD;
47259
47260 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
47261 return VPDPBUSD;
47262
47263 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
47264 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
47265 return Cmp;
47266
47267 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
47268 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
47269 return MinMax;
47270
47271 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
47272 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
47273 return V;
47274
47275 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
47276 return V;
47277
47278 if (CIdx)
47279 if (SDValue V = combineExtractFromVectorLoad(
47280 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
47281 dl, DAG, DCI))
47282 return V;
47283
47284 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
47285 // and then testing the relevant element.
47286 //
47287 // Note that we only combine extracts on the *same* result number, i.e.
47288 // t0 = merge_values a0, a1, a2, a3
47289 // i1 = extract_vector_elt t0, Constant:i64<2>
47290 // i1 = extract_vector_elt t0, Constant:i64<3>
47291 // but not
47292 // i1 = extract_vector_elt t0:1, Constant:i64<2>
47293 // since the latter would need its own MOVMSK.
47294 if (SrcVT.getScalarType() == MVT::i1) {
47295 bool IsVar = !CIdx;
47296 SmallVector<SDNode *, 16> BoolExtracts;
47297 unsigned ResNo = InputVector.getResNo();
47298 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
47299 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
47300 Use->getOperand(0).getResNo() == ResNo &&
47301 Use->getValueType(0) == MVT::i1) {
47302 BoolExtracts.push_back(Use);
47303 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
47304 return true;
47305 }
47306 return false;
47307 };
47308 // TODO: Can we drop the oneuse check for constant extracts?
47309 if (all_of(InputVector->users(), IsBoolExtract) &&
47310 (IsVar || BoolExtracts.size() > 1)) {
47311 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
47312 if (SDValue BC =
47313 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
47314 for (SDNode *Use : BoolExtracts) {
47315 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
47316 // Mask = 1 << MaskIdx
47317 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
47318 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
47319 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
47320 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
47321 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
47322 DCI.CombineTo(Use, Res);
47323 }
47324 return SDValue(N, 0);
47325 }
47326 }
47327 }
47328
47329 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
47330 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
47331 SDValue TruncSrc = InputVector.getOperand(0);
47332 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
47333 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
47334 SDValue NewExt =
47335 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
47336 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
47337 }
47338 }
47339
47340 return SDValue();
47341}
47342
47343// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
47344// This is more or less the reverse of combineBitcastvxi1.
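// For example (illustrative): (v16i8 (sext (v16i1 (bitcast i16 X)))) is
// lowered by broadcasting the two bytes of X across the 16 lanes, ANDing lane
// i with the single-bit mask (1 << (i % 8)), and comparing the result against
// that same mask to produce 0/-1 per lane.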
47345static SDValue combineToExtendBoolVectorInReg(
47346 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
47347 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
47348 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
47349 Opcode != ISD::ANY_EXTEND)
47350 return SDValue();
47351 if (!DCI.isBeforeLegalizeOps())
47352 return SDValue();
47353 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
47354 return SDValue();
47355
47356 EVT SVT = VT.getScalarType();
47357 EVT InSVT = N0.getValueType().getScalarType();
47358 unsigned EltSizeInBits = SVT.getSizeInBits();
47359
47360 // Input type must be extending a bool vector (bit-casted from a scalar
47361 // integer) to legal integer types.
47362 if (!VT.isVector())
47363 return SDValue();
47364 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
47365 return SDValue();
47366 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
47367 return SDValue();
47368
47369 SDValue N00 = N0.getOperand(0);
47370 EVT SclVT = N00.getValueType();
47371 if (!SclVT.isScalarInteger())
47372 return SDValue();
47373
47374 SDValue Vec;
47375 SmallVector<int> ShuffleMask;
47376 unsigned NumElts = VT.getVectorNumElements();
47377 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
47378
47379 // Broadcast the scalar integer to the vector elements.
47380 if (NumElts > EltSizeInBits) {
47381 // If the scalar integer is greater than the vector element size, then we
47382 // must split it down into sub-sections for broadcasting. For example:
47383 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
47384 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
47385 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
47386 unsigned Scale = NumElts / EltSizeInBits;
47387 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
47388 bool UseBroadcast = Subtarget.hasInt256() &&
47389 (!BroadcastVT.is128BitVector() || isa<LoadSDNode>(N00));
47390 Vec = UseBroadcast
47391 ? DAG.getSplat(BroadcastVT, DL, N00)
47392 : DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
47393 Vec = DAG.getBitcast(VT, Vec);
47394
47395 for (unsigned i = 0; i != Scale; ++i) {
47396 int Offset = UseBroadcast ? (i * EltSizeInBits) : 0;
47397 ShuffleMask.append(EltSizeInBits, i + Offset);
47398 }
47399 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
47400 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
47401 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
47402 // If we have register broadcast instructions, use the scalar size as the
47403 // element type for the shuffle. Then cast to the wider element type. The
47404 // widened bits won't be used, and this might allow the use of a broadcast
47405 // load.
47406 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
47407 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT,
47408 (NumElts * EltSizeInBits) / NumElts);
47409 Vec = DAG.getBitcast(VT, DAG.getSplat(BroadcastVT, DL, N00));
47410 } else {
47411 // For smaller scalar integers, we can simply any-extend it to the vector
47412 // element size (we don't care about the upper bits) and broadcast it to all
47413 // elements.
47414 Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
47415 }
47416
47417 // Now, mask the relevant bit in each element.
47418 SmallVector<SDValue, 32> Bits;
47419 for (unsigned i = 0; i != NumElts; ++i) {
47420 int BitIdx = (i % EltSizeInBits);
47421 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
47422 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
47423 }
47424 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
47425 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
47426
47427 // Compare against the bitmask and extend the result.
47428 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
47429 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
47430 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
47431
47432 // For SEXT, this is now done, otherwise shift the result down for
47433 // zero-extension.
47434 if (Opcode == ISD::SIGN_EXTEND)
47435 return Vec;
47436 return DAG.getNode(ISD::SRL, DL, VT, Vec,
47437 DAG.getConstant(EltSizeInBits - 1, DL, VT));
47438}
47439
47440/// If both arms of a vector select are concatenated vectors, split the select,
47441/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
47442/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
47443/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
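/// For example (illustrative): a v8f32 vselect whose true and false arms are
/// each a concat of two v4f32 halves is split into two v4f32 blends, so no
/// 256-bit select is needed.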
47444static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47445 const X86Subtarget &Subtarget) {
47446 unsigned Opcode = N->getOpcode();
47447 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
47448 return SDValue();
47449
47450 // TODO: Split 512-bit vectors too?
47451 EVT VT = N->getValueType(0);
47452 if (!VT.is256BitVector())
47453 return SDValue();
47454
47455 // TODO: Split as long as any 2 of the 3 operands are concatenated?
47456 SDValue Cond = N->getOperand(0);
47457 SDValue TVal = N->getOperand(1);
47458 SDValue FVal = N->getOperand(2);
47459 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
47460 !isFreeToSplitVector(TVal, DAG) || !isFreeToSplitVector(FVal, DAG))
47461 return SDValue();
47462
47463 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
47464 ArrayRef<SDValue> Ops) {
47465 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
47466 };
47467 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend,
47468 /*CheckBWI*/ false);
47469}
47470
47471static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG,
47472 const SDLoc &DL) {
47473 SDValue Cond = N->getOperand(0);
47474 SDValue LHS = N->getOperand(1);
47475 SDValue RHS = N->getOperand(2);
47476
47477 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
47478 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
47479 if (!TrueC || !FalseC)
47480 return SDValue();
47481
47482 // Don't do this for crazy integer types.
47483 EVT VT = N->getValueType(0);
47484 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
47485 return SDValue();
47486
47487 // We're going to use the condition bit in math or logic ops. We could allow
47488 // this with a wider condition value (post-legalization it becomes an i8),
47489 // but if nothing is creating selects that late, it doesn't matter.
47490 if (Cond.getValueType() != MVT::i1)
47491 return SDValue();
47492
47493 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
47494 // 3, 5, or 9 with i32/i64, so those get transformed too.
47495 // TODO: For constants that overflow or do not differ by power-of-2 or small
47496 // multiplier, convert to 'and' + 'add'.
47497 const APInt &TrueVal = TrueC->getAPIntValue();
47498 const APInt &FalseVal = FalseC->getAPIntValue();
47499
47500 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
47501 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
47502 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
47503 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47504 if (CC == ISD::SETEQ || CC == ISD::SETNE)
47505 return SDValue();
47506 }
47507
47508 bool OV;
47509 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
47510 if (OV)
47511 return SDValue();
47512
47513 APInt AbsDiff = Diff.abs();
47514 if (AbsDiff.isPowerOf2() ||
47515 ((VT == MVT::i32 || VT == MVT::i64) &&
47516 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
47517
47518 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
47519 // of the condition can usually be folded into a compare predicate, but even
47520 // without that, the sequence should be cheaper than a CMOV alternative.
47521 if (TrueVal.slt(FalseVal)) {
47522 Cond = DAG.getNOT(DL, Cond, MVT::i1);
47523 std::swap(TrueC, FalseC);
47524 }
47525
47526 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
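    // For example (illustrative): select Cond, 7, 3 --> (zext(Cond) * 4) + 3,
    // where the multiply by 4 is just a shift (or folds into an LEA).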
47527 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
47528
47529 // Multiply condition by the difference if non-one.
47530 if (!AbsDiff.isOne())
47531 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
47532
47533 // Add the base if non-zero.
47534 if (!FalseC->isZero())
47535 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
47536
47537 return R;
47538 }
47539
47540 return SDValue();
47541}
47542
47543/// If this is a *dynamic* select (non-constant condition) and we can match
47544/// this node with one of the variable blend instructions, restructure the
47545/// condition so that blends can use the high (sign) bit of each element.
47546/// This function will also call SimplifyDemandedBits on already created
47547/// BLENDV to perform additional simplifications.
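/// For example (illustrative): a v16i8 VSELECT lowered as PBLENDVB only tests
/// bit 7 of each condition byte, so only the sign bit of the condition needs
/// to be computed and SimplifyDemandedBits can drop everything else.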
47548static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
47549 const SDLoc &DL,
47550 TargetLowering::DAGCombinerInfo &DCI,
47551 const X86Subtarget &Subtarget) {
47552 SDValue Cond = N->getOperand(0);
47553 if ((N->getOpcode() != ISD::VSELECT &&
47554 N->getOpcode() != X86ISD::BLENDV) ||
47555 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
47556 return SDValue();
47557
47558 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47559 unsigned BitWidth = Cond.getScalarValueSizeInBits();
47560 EVT VT = N->getValueType(0);
47561
47562 // We can only handle the cases where VSELECT is directly legal on the
47563 // subtarget. We custom lower VSELECT nodes with constant conditions and
47564 // this makes it hard to see whether a dynamic VSELECT will correctly
47565 // lower, so we both check the operation's status and explicitly handle the
47566 // cases where a *dynamic* blend will fail even though a constant-condition
47567 // blend could be custom lowered.
47568 // FIXME: We should find a better way to handle this class of problems.
47569 // Potentially, we should combine constant-condition vselect nodes
47570 // pre-legalization into shuffles and not mark as many types as custom
47571 // lowered.
47572 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
47573 return SDValue();
47574 // FIXME: We don't support i16-element blends currently. We could and
47575 // should support them by making *all* the bits in the condition be set
47576 // rather than just the high bit and using an i8-element blend.
47577 if (VT.getVectorElementType() == MVT::i16)
47578 return SDValue();
47579 // Dynamic blending was only available from SSE4.1 onward.
47580 if (VT.is128BitVector() && !Subtarget.hasSSE41())
47581 return SDValue();
47582 // Byte blends are only available in AVX2.
47583 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
47584 return SDValue();
47585 // There are no 512-bit blend instructions that use sign bits.
47586 if (VT.is512BitVector())
47587 return SDValue();
47588
47589 // Don't optimize before the condition has been transformed to a legal type
47590 // and don't ever optimize vector selects that map to AVX512 mask-registers.
47591 if (BitWidth < 8 || BitWidth > 64)
47592 return SDValue();
47593
47594 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
47595 for (SDUse &Use : Cond->uses())
47596 if ((Use.getUser()->getOpcode() != ISD::VSELECT &&
47597 Use.getUser()->getOpcode() != X86ISD::BLENDV) ||
47598 Use.getOperandNo() != 0)
47599 return false;
47600
47601 return true;
47602 };
47603
47604 APInt DemandedBits(APInt::getSignMask(BitWidth));
47605
47606 if (OnlyUsedAsSelectCond(Cond)) {
47607 KnownBits Known;
47608 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
47609 !DCI.isBeforeLegalizeOps());
47610 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
47611 return SDValue();
47612
47613 // If we changed the computation somewhere in the DAG, this change will
47614 // affect all users of Cond. Update all the nodes so that we do not use
47615 // the generic VSELECT anymore. Otherwise, we may perform wrong
47616 // optimizations as we messed with the actual expectation for the vector
47617 // boolean values.
47618 for (SDNode *U : Cond->users()) {
47619 if (U->getOpcode() == X86ISD::BLENDV)
47620 continue;
47621
47622 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
47623 Cond, U->getOperand(1), U->getOperand(2));
47624 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
47625 DCI.AddToWorklist(U);
47626 }
47627 DCI.CommitTargetLoweringOpt(TLO);
47628 return SDValue(N, 0);
47629 }
47630
47631 // Otherwise we can still at least try to simplify multiple use bits.
47632 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
47633 return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
47634 N->getOperand(1), N->getOperand(2));
47635
47636 return SDValue();
47637}
47638
47639// Try to match:
47640// (or (and (M, (sub 0, X)), (pandn M, X)))
47641// which is a special case of:
47642// (select M, (sub 0, X), X)
47643// Per:
47644// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
47645// We know that, if fNegate is 0 or 1:
47646// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
47647//
47648// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
47649// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
47650// ( M ? -X : X) == ((X ^ M ) + (M & 1))
47651// This lets us transform our vselect to:
47652// (add (xor X, M), (and M, 1))
47653// And further to:
47654// (sub (xor X, M), M)
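// Sanity check of the identity (illustrative): with M == all-ones,
// (X ^ M) - M == ~X + 1 == -X; with M == 0, (X ^ M) - M == X.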
47655static SDValue combineLogicBlendIntoConditionalNegate(
47656 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
47657 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
47658 using namespace SDPatternMatch;
47659 EVT MaskVT = Mask.getValueType();
47660 assert(MaskVT.isInteger() &&
47661 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
47662 "Mask must be zero/all-bits");
47663
47664 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT ||
47665 !DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
47666 return SDValue();
47667
47668 SDValue V;
47669 if (!sd_match(Y, m_Neg(m_AllOf(m_Specific(X), m_Value(V)))) &&
47670 !sd_match(X, m_Neg(m_AllOf(m_Specific(Y), m_Value(V)))))
47671 return SDValue();
47672
47673 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
47674 SDValue SubOp2 = Mask;
47675
47676 // If the negate was on the false side of the select, then
47677 // the operands of the SUB need to be swapped. PR 27251.
47678 // This is because the pattern being matched above is
47679 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
47680 // but if the pattern matched was
47681 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
47682 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
47683 // pattern also needs to be a negation of the replacement pattern above.
47684 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
47685 // sub accomplishes the negation of the replacement pattern.
47686 if (V == Y)
47687 std::swap(SubOp1, SubOp2);
47688
47689 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
47690 return DAG.getBitcast(VT, Res);
47691}
47692
47693static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47694 const X86Subtarget &Subtarget) {
47695 using namespace SDPatternMatch;
47696 if (!Subtarget.hasAVX512())
47697 return SDValue();
47698
47699 ISD::CondCode CC;
47700 SDValue Cond, X, Y, LHS, RHS;
47701 if (!sd_match(N, m_VSelect(m_AllOf(m_Value(Cond),
47702 m_OneUse(m_SetCC(m_Value(X), m_Value(Y),
47703 m_CondCode(CC)))),
47704 m_Value(LHS), m_Value(RHS))))
47705 return SDValue();
47706
47707 if (canCombineAsMaskOperation(LHS, Subtarget) ||
47708 !canCombineAsMaskOperation(RHS, Subtarget))
47709 return SDValue();
47710
47711 // Commute LHS and RHS to create opportunity to select mask instruction.
47712 // (vselect M, L, R) -> (vselect ~M, R, L)
47713 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, X.getValueType());
47714 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), X, Y, NewCC);
47715 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
47716}
47717
47718/// Do target-specific dag combines on SELECT and VSELECT nodes.
47719static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
47720 TargetLowering::DAGCombinerInfo &DCI,
47721 const X86Subtarget &Subtarget) {
47722 SDLoc DL(N);
47723 SDValue Cond = N->getOperand(0);
47724 SDValue LHS = N->getOperand(1);
47725 SDValue RHS = N->getOperand(2);
47726
47727 // Try simplification again because we use this function to optimize
47728 // BLENDV nodes that are not handled by the generic combiner.
47729 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
47730 return V;
47731
47732 // When avx512 is available the lhs operand of select instruction can be
47733 // folded with mask instruction, while the rhs operand can't. Commute the
47734 // lhs and rhs of the select instruction to create the opportunity of
47735 // folding.
47736 if (SDValue V = commuteSelect(N, DAG, DL, Subtarget))
47737 return V;
47738
47739 EVT VT = LHS.getValueType();
47740 EVT CondVT = Cond.getValueType();
47741 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47742 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
47743
47744 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
47745 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
47746 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
47747 if (CondVT.isVector() && CondVT.isInteger() &&
47748 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
47749 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
47750 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
47751 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, LHS, RHS,
47752 DL, DAG, Subtarget))
47753 return V;
47754
47755 if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
47756 SmallVector<int, 64> CondMask;
47757 if (createShuffleMaskFromVSELECT(CondMask, Cond,
47758 N->getOpcode() == X86ISD::BLENDV)) {
47759 // Convert vselects with constant condition into shuffles.
47760 if (DCI.isBeforeLegalizeOps())
47761 return DAG.getVectorShuffle(VT, DL, LHS, RHS, CondMask);
47762
47763 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
47764 // by forcing the unselected elements to zero.
47765 // TODO: Can we handle more shuffles with this?
47766 if (LHS.hasOneUse() && RHS.hasOneUse()) {
47767 SmallVector<SDValue, 1> LHSOps, RHSOps;
47768 SmallVector<int, 64> LHSMask, RHSMask, ByteMask;
47769 SDValue LHSShuf = peekThroughOneUseBitcasts(LHS);
47770 SDValue RHSShuf = peekThroughOneUseBitcasts(RHS);
47771 if (LHSShuf.getOpcode() == X86ISD::PSHUFB &&
47772 RHSShuf.getOpcode() == X86ISD::PSHUFB &&
47773 scaleShuffleMaskElts(VT.getSizeInBits() / 8, CondMask, ByteMask) &&
47774 getTargetShuffleMask(LHSShuf, true, LHSOps, LHSMask) &&
47775 getTargetShuffleMask(RHSShuf, true, RHSOps, RHSMask)) {
47776 assert(ByteMask.size() == LHSMask.size() &&
47777 ByteMask.size() == RHSMask.size() && "Shuffle mask mismatch");
47778 for (auto [I, M] : enumerate(ByteMask)) {
47779 // getConstVector sets negative shuffle mask values as undef, so
47780 // ensure we hardcode SM_SentinelZero values to zero (0x80).
47781 if (M < (int)ByteMask.size()) {
47782 LHSMask[I] = isUndefOrZero(LHSMask[I]) ? 0x80 : LHSMask[I];
47783 RHSMask[I] = 0x80;
47784 } else {
47785 LHSMask[I] = 0x80;
47786 RHSMask[I] = isUndefOrZero(RHSMask[I]) ? 0x80 : RHSMask[I];
47787 }
47788 }
47789 MVT ByteVT = LHSShuf.getSimpleValueType();
47790 LHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, LHSOps[0],
47791 getConstVector(LHSMask, ByteVT, DAG, DL, true));
47792 RHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, RHSOps[0],
47793 getConstVector(RHSMask, ByteVT, DAG, DL, true));
47794 return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, ByteVT, LHS, RHS));
47795 }
47796 }
47797
47798 // Attempt to combine as shuffle.
47799 SDValue Op(N, 0);
47800 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47801 return Res;
47802 }
47803 }
47804
47805 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
47806 // instructions match the semantics of the common C idiom x<y?x:y but not
47807 // x<=y?x:y, because of how they handle negative zero (which can be
47808 // ignored in unsafe-math mode).
47809 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
47810 if ((Cond.getOpcode() == ISD::SETCC ||
47811 Cond.getOpcode() == ISD::STRICT_FSETCCS) &&
47812 VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 &&
47813 !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
47814 ((VT != MVT::v8f16 && VT != MVT::v16f16) || Subtarget.hasVLX()) &&
47815 (Subtarget.hasSSE2() ||
47816 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
47817 bool IsStrict = Cond->isStrictFPOpcode();
47818 ISD::CondCode CC =
47819 cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
47820 SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0);
47821 SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1);
47822
47823 unsigned Opcode = 0;
47824 // Check for x CC y ? x : y.
47825 if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
47826 switch (CC) {
47827 default: break;
47828 case ISD::SETULT:
47829 // Converting this to a min would handle NaNs incorrectly, and swapping
47830 // the operands would cause it to handle comparisons between positive
47831 // and negative zero incorrectly.
47832 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47833 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47834 !(DAG.isKnownNeverZeroFloat(LHS) ||
47835 DAG.isKnownNeverZeroFloat(RHS)))
47836 break;
47837 std::swap(LHS, RHS);
47838 }
47839 Opcode = X86ISD::FMIN;
47840 break;
47841 case ISD::SETOLE:
47842 // Converting this to a min would handle comparisons between positive
47843 // and negative zero incorrectly.
47844 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47845 !(DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS)))
47846 break;
47847 Opcode = X86ISD::FMIN;
47848 break;
47849 case ISD::SETULE:
47850 // Converting this to a min would handle both negative zeros and NaNs
47851 // incorrectly, but we can swap the operands to fix both.
47852 std::swap(LHS, RHS);
47853 [[fallthrough]];
47854 case ISD::SETOLT:
47855 case ISD::SETLT:
47856 case ISD::SETLE:
47857 Opcode = X86ISD::FMIN;
47858 break;
47859
47860 case ISD::SETOGE:
47861 // Converting this to a max would handle comparisons between positive
47862 // and negative zero incorrectly.
47863 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47864 !(DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS)))
47865 break;
47866 Opcode = X86ISD::FMAX;
47867 break;
47868 case ISD::SETUGT:
47869 // Converting this to a max would handle NaNs incorrectly, and swapping
47870 // the operands would cause it to handle comparisons between positive
47871 // and negative zero incorrectly.
47872 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47873 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47874 !(DAG.isKnownNeverZeroFloat(LHS) ||
47875 DAG.isKnownNeverZeroFloat(RHS)))
47876 break;
47877 std::swap(LHS, RHS);
47878 }
47879 Opcode = X86ISD::FMAX;
47880 break;
47881 case ISD::SETUGE:
47882 // Converting this to a max would handle both negative zeros and NaNs
47883 // incorrectly, but we can swap the operands to fix both.
47884 std::swap(LHS, RHS);
47885 [[fallthrough]];
47886 case ISD::SETOGT:
47887 case ISD::SETGT:
47888 case ISD::SETGE:
47889 Opcode = X86ISD::FMAX;
47890 break;
47891 }
47892 // Check for x CC y ? y : x -- a min/max with reversed arms.
47893 } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
47894 switch (CC) {
47895 default: break;
47896 case ISD::SETOGE:
47897 // Converting this to a min would handle comparisons between positive
47898 // and negative zero incorrectly, and swapping the operands would
47899 // cause it to handle NaNs incorrectly.
47900 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47901 !(DAG.isKnownNeverZeroFloat(LHS) ||
47902 DAG.isKnownNeverZeroFloat(RHS))) {
47903 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47904 break;
47905 std::swap(LHS, RHS);
47906 }
47907 Opcode = X86ISD::FMIN;
47908 break;
47909 case ISD::SETUGT:
47910 // Converting this to a min would handle NaNs incorrectly.
47911 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47912 break;
47913 Opcode = X86ISD::FMIN;
47914 break;
47915 case ISD::SETUGE:
47916 // Converting this to a min would handle both negative zeros and NaNs
47917 // incorrectly, but we can swap the operands to fix both.
47918 std::swap(LHS, RHS);
47919 [[fallthrough]];
47920 case ISD::SETOGT:
47921 case ISD::SETGT:
47922 case ISD::SETGE:
47923 Opcode = X86ISD::FMIN;
47924 break;
47925
47926 case ISD::SETULT:
47927 // Converting this to a max would handle NaNs incorrectly.
47928 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47929 break;
47930 Opcode = X86ISD::FMAX;
47931 break;
47932 case ISD::SETOLE:
47933 // Converting this to a max would handle comparisons between positive
47934 // and negative zero incorrectly, and swapping the operands would
47935 // cause it to handle NaNs incorrectly.
47936 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47937 !DAG.isKnownNeverZeroFloat(LHS) &&
47938 !DAG.isKnownNeverZeroFloat(RHS)) {
47939 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47940 break;
47941 std::swap(LHS, RHS);
47942 }
47943 Opcode = X86ISD::FMAX;
47944 break;
47945 case ISD::SETULE:
47946 // Converting this to a max would handle both negative zeros and NaNs
47947 // incorrectly, but we can swap the operands to fix both.
47948 std::swap(LHS, RHS);
47949 [[fallthrough]];
47950 case ISD::SETOLT:
47951 case ISD::SETLT:
47952 case ISD::SETLE:
47953 Opcode = X86ISD::FMAX;
47954 break;
47955 }
47956 }
47957
47958 if (Opcode) {
47959 if (IsStrict) {
47960 SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? X86ISD::STRICT_FMIN
47961 : X86ISD::STRICT_FMAX,
47962 DL, {N->getValueType(0), MVT::Other},
47963 {Cond.getOperand(0), LHS, RHS});
47964 DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1));
47965 return Ret;
47966 }
47967 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47968 }
47969 }
47970
47971 // Some mask scalar intrinsics rely on checking if only one bit is set
47972 // and implement it in C code like this:
47973 // A[0] = (U & 1) ? A[0] : W[0];
47974 // This creates some redundant instructions that break pattern matching.
47975 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
47976 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47977 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
47978 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47979 SDValue AndNode = Cond.getOperand(0);
47980 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
47981 isNullConstant(Cond.getOperand(1)) &&
47982 isOneConstant(AndNode.getOperand(1))) {
47983 // LHS and RHS swapped due to
47984 // setcc outputting 1 when AND resulted in 0 and vice versa.
47985 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
47986 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
47987 }
47988 }
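// Illustrative compile-time check (not part of the upstream file) for the
// swap above: selecting (Y, Z) under "(U & 1) == 0" is the same as selecting
// (Z, Y) under "(U & 1)" directly.
static_assert((((6 & 1) == 0) ? 10 : 20) == ((6 & 1) ? 20 : 10), "even case");
static_assert((((7 & 1) == 0) ? 10 : 20) == ((7 & 1) ? 20 : 10), "odd case");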
47989
47990 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
47991 // lowering on KNL. In this case we convert it to
47992 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
47993 // The same applies to all vectors of i8 and i16 without BWI.
47994 // Make sure we extend these even before type legalization gets a chance to
47995 // split wide vectors.
47996 // Since SKX these selects have a proper lowering.
47997 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
47998 CondVT.getVectorElementType() == MVT::i1 &&
47999 (VT.getVectorElementType() == MVT::i8 ||
48000 VT.getVectorElementType() == MVT::i16)) {
48001 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
48002 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
48003 }
48004
48005 // AVX512 - Extend select to merge with target shuffle.
48006 // select(mask, extract_subvector(shuffle(x)), y) -->
48007 // extract_subvector(select(widen(mask), shuffle(x), widen(y)))
48008 // TODO - support non target shuffles as well with canCombineAsMaskOperation.
48009 if (Subtarget.hasAVX512() && CondVT.isVector() &&
48010 CondVT.getVectorElementType() == MVT::i1) {
48011 auto SelectableOp = [&TLI](SDValue Op, SDValue Alt) {
48012 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
48013 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
48014 isNullConstant(Op.getOperand(1)) &&
48015 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
48016 Op.hasOneUse() && Op.getOperand(0).hasOneUse() &&
48017 (Op.getOperand(0).getOpcode() != X86ISD::VPERMV3 ||
48018 ISD::isBuildVectorAllZeros(Alt.getNode()));
48019 };
48020
48021 bool SelectableLHS = SelectableOp(LHS, RHS);
48022 bool SelectableRHS = SelectableOp(RHS, LHS);
48023 if (SelectableLHS || SelectableRHS) {
48024 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
48025 : RHS.getOperand(0).getValueType();
48026 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
48027 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
48028 VT.getSizeInBits());
48029 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
48030 VT.getSizeInBits());
48031 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
48032 DAG.getUNDEF(SrcCondVT), Cond,
48033 DAG.getVectorIdxConstant(0, DL));
48034 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
48035 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
48036 }
48037 }
48038
48039 if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL))
48040 return V;
48041
48042 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
48043 Cond.hasOneUse()) {
48044 EVT CondVT = Cond.getValueType();
48045 SDValue Cond0 = Cond.getOperand(0);
48046 SDValue Cond1 = Cond.getOperand(1);
48047 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
48048
48049 // Canonicalize min/max:
48050 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
48051 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
48052 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
48053 // the need for an extra compare against zero. e.g.
48054 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
48055 // subl %esi, %edi
48056 // testl %edi, %edi
48057 // movl $0, %eax
48058 // cmovgl %edi, %eax
48059 // =>
48060 // xorl %eax, %eax
48061 // subl %esi, %edi
48062 // cmovsl %eax, %edi
48063 //
48064 // We can also canonicalize
48065 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
48066 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
48067 // This allows the use of a test instruction for the compare.
48068 if (LHS == Cond0 && RHS == Cond1) {
48069 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
48070 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
48071 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
48072 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48073 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48074 }
48075 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
48076 ISD::CondCode NewCC = ISD::SETUGE;
48077 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48078 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48079 }
48080 }
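// Illustrative compile-time check (not part of the upstream file) of the
// "(x u> 1) ? x : 1 -> (x != 0) ? x : 1" canonicalization above; the two
// predicates only disagree at x == 1, where both arms already agree, which is
// why the rewrite is safe. The lambda names below are hypothetical.
constexpr auto ClampToOneModel = [](unsigned X) { return X > 1u ? X : 1u; };
constexpr auto ClampToOneCanon = [](unsigned X) { return X != 0u ? X : 1u; };
static_assert(ClampToOneModel(0u) == ClampToOneCanon(0u) &&
ClampToOneModel(1u) == ClampToOneCanon(1u) &&
ClampToOneModel(7u) == ClampToOneCanon(7u),
"u> 1 select canonicalizes to != 0");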
48081
48082 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
48083 // fold eq + gt/lt nested selects into ge/le selects
48084 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
48085 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
48086 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
48087 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
48088 // .. etc ..
48089 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
48090 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
48091 SDValue InnerSetCC = RHS.getOperand(0);
48092 ISD::CondCode InnerCC =
48093 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
48094 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
48095 Cond0 == InnerSetCC.getOperand(0) &&
48096 Cond1 == InnerSetCC.getOperand(1)) {
48097 ISD::CondCode NewCC;
48098 switch (CC == ISD::SETEQ ? InnerCC : CC) {
48099 // clang-format off
48100 case ISD::SETGT: NewCC = ISD::SETGE; break;
48101 case ISD::SETLT: NewCC = ISD::SETLE; break;
48102 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
48103 case ISD::SETULT: NewCC = ISD::SETULE; break;
48104 default: NewCC = ISD::SETCC_INVALID; break;
48105 // clang-format on
48106 }
48107 if (NewCC != ISD::SETCC_INVALID) {
48108 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
48109 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
48110 }
48111 }
48112 }
48113 }
48114
48115 // Check if the first operand is all zeros and Cond type is vXi1.
48116 // If this an avx512 target we can improve the use of zero masking by
48117 // swapping the operands and inverting the condition.
48118 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
48119 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
48120 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
48121 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
48122 // Invert the cond to not(cond) : xor(op,allones)=not(op)
48123 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
48124 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
48125 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
48126 }
48127
48128 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
48129 // get split by legalization.
48130 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
48131 CondVT.getVectorElementType() == MVT::i1 &&
48132 TLI.isTypeLegal(VT.getScalarType())) {
48133 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
48134 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
48135 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
48136 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
48137 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
48138 }
48139 }
48140
48141 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
48142 // with out-of-bounds clamping.
48143
48144 // Unlike the generic SHL/SRL nodes, AVX2's VSHLV/VSRLV are well defined
48145 // for shift amounts that exceed the element bitwidth: any lane whose
48146 // amount is >= the bitwidth produces zero. That matches the zero arm of
48147 // the selects below, so the out-of-bounds clamp can be folded into the
48148 // shift instruction itself (VSRAV, by contrast, clamps to bitwidth-1).
48149 if (N->getOpcode() == ISD::VSELECT) {
48150 using namespace llvm::SDPatternMatch;
48151 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
48152 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
48153 if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
48154 supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) &&
48155 ISD::isConstantSplatVectorAllZeros(RHS.getNode()) &&
48156 sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)),
48157 m_SpecificInt(VT.getScalarSizeInBits()),
48158 m_SpecificCondCode(ISD::SETULT)))) {
48159 return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48160 : X86ISD::VSHLV,
48161 DL, VT, LHS.getOperand(0), LHS.getOperand(1));
48162 }
48163 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
48164 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
48165 if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) &&
48166 supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) &&
48167 ISD::isConstantSplatVectorAllZeros(LHS.getNode()) &&
48168 sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)),
48169 m_SpecificInt(VT.getScalarSizeInBits()),
48170 m_SpecificCondCode(ISD::SETUGE)))) {
48171 return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48172 : X86ISD::VSHLV,
48173 DL, VT, RHS.getOperand(0), RHS.getOperand(1));
48174 }
48175 }
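// Illustrative scalar model (not part of the upstream file) of the per-lane
// behaviour the folds above rely on: VPSLLV/VPSRLV yield zero for
// out-of-bounds amounts, exactly matching the zero arm of the selects. The
// lambda name is hypothetical.
constexpr auto ShlvLaneModel = [](unsigned X, unsigned Amt) {
return Amt < 32u ? X << Amt : 0u;
};
static_assert(ShlvLaneModel(1u, 4u) == 16u && ShlvLaneModel(1u, 40u) == 0u,
"out-of-bounds shift amounts produce zero");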
48176
48177 // Early exit check
48178 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
48179 return SDValue();
48180
48181 if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget))
48182 return V;
48183
48184 if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget))
48185 return V;
48186
48187 // select(~Cond, X, Y) -> select(Cond, Y, X)
48188 if (CondVT.getScalarType() != MVT::i1) {
48189 if (SDValue CondNot = IsNOT(Cond, DAG))
48190 return DAG.getNode(N->getOpcode(), DL, VT,
48191 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
48192
48193 // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
48194 if (Cond.getOpcode() == X86ISD::PCMPEQ &&
48195 Cond.getOperand(0).getOpcode() == ISD::AND &&
48196 ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
48197 isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
48198 Cond.getScalarValueSizeInBits(),
48199 /*AllowUndefs=*/true) &&
48200 Cond.hasOneUse()) {
48201 Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
48202 Cond.getOperand(0).getOperand(1));
48203 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48204 }
48205
48206 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
48207 // signbit.
48208 if (Cond.getOpcode() == X86ISD::PCMPGT &&
48209 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
48210 Cond.hasOneUse()) {
48211 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
48212 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
48213 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48214 }
48215 }
48216
48217 // Try to optimize vXi1 selects if both operands are either all constants or
48218 // bitcasts from scalar integer type. In that case we can convert the operands
48219 // to integer and use an integer select which will be converted to a CMOV.
48220 // We need to take a little bit of care to avoid creating an i64 type after
48221 // type legalization.
48222 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
48223 VT.getVectorElementType() == MVT::i1 &&
48224 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
48225 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
48226 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
48227 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
48228 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
48229
48230 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
48231 LHS.getOperand(0).getValueType() == IntVT)) &&
48232 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
48233 RHS.getOperand(0).getValueType() == IntVT))) {
48234 if (LHSIsConst)
48235 LHS = combinevXi1ConstantToInteger(LHS, DAG);
48236 else
48237 LHS = LHS.getOperand(0);
48238
48239 if (RHSIsConst)
48240 RHS = combinevXi1ConstantToInteger(RHS, DAG);
48241 else
48242 RHS = RHS.getOperand(0);
48243
48244 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
48245 return DAG.getBitcast(VT, Select);
48246 }
48247 }
48248 }
48249
48250 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
48251 // single bits, then invert the predicate and swap the select operands.
48252 // This can lower using a vector shift bit-hack rather than mask and compare.
48253 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
48254 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
48255 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
48256 Cond.getOperand(0).getOpcode() == ISD::AND &&
48257 isNullOrNullSplat(Cond.getOperand(1)) &&
48258 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
48259 Cond.getOperand(0).getValueType() == VT) {
48260 // The 'and' mask must be composed of power-of-2 constants.
48261 SDValue And = Cond.getOperand(0);
48262 auto *C = isConstOrConstSplat(And.getOperand(1));
48263 if (C && C->getAPIntValue().isPowerOf2()) {
48264 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
48265 SDValue NotCond =
48266 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
48267 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
48268 }
48269
48270 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
48271 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
48272 // 16-bit lacks a proper blendv.
48273 unsigned EltBitWidth = VT.getScalarSizeInBits();
48274 bool CanShiftBlend =
48275 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
48276 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
48277 (Subtarget.hasXOP()));
48278 if (CanShiftBlend &&
48279 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
48280 return C->getAPIntValue().isPowerOf2();
48281 })) {
48282 // Create a left-shift constant to get the mask bits over to the sign-bit.
48283 SDValue Mask = And.getOperand(1);
48284 SmallVector<int, 32> ShlVals;
48285 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
48286 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
48287 ShlVals.push_back(EltBitWidth - 1 -
48288 MaskVal->getAPIntValue().exactLogBase2());
48289 }
48290 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
48291 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
48292 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
48293 SDValue NewCond =
48294 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
48295 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
48296 }
48297 }
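// Illustrative compile-time check (not part of the upstream file): with the
// power-of-2 mask constant 4 (bit 2) and 32-bit elements, shifting left by
// 31 - 2 == 29 moves the tested bit into the sign bit, so "(X & 4) != 0"
// becomes a sign test on the shifted value.
static_assert((((5u << 29) >> 31) != 0u) == ((5u & 4u) != 0u), "bit set");
static_assert((((2u << 29) >> 31) != 0u) == ((2u & 4u) != 0u), "bit clear");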
48298
48299 return SDValue();
48300}
48301
48302/// Combine:
48303/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
48304/// to:
48305/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
48306/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
48307/// Note that this is only legal for some op/cc combinations.
48308 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
48309 SelectionDAG &DAG,
48310 const X86Subtarget &Subtarget) {
48311 // This combine only operates on CMP-like nodes.
48312 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48313 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48314 return SDValue();
48315
48316 // Can't replace the cmp if it has more uses than the one we're looking at.
48317 // FIXME: We would like to be able to handle this, but would need to make sure
48318 // all uses were updated.
48319 if (!Cmp.hasOneUse())
48320 return SDValue();
48321
48322 // This only applies to variations of the common case:
48323 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
48324 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
48325 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
48326 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
48327 // Using the proper condcodes (see below), overflow is checked for.
48328
48329 // FIXME: We can generalize both constraints:
48330 // - XOR/OR/AND (if they were made to survive AtomicExpand)
48331 // - LHS != 1
48332 // if the result is compared.
48333
48334 SDValue CmpLHS = Cmp.getOperand(0);
48335 SDValue CmpRHS = Cmp.getOperand(1);
48336 EVT CmpVT = CmpLHS.getValueType();
48337
48338 if (!CmpLHS.hasOneUse())
48339 return SDValue();
48340
48341 unsigned Opc = CmpLHS.getOpcode();
48342 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
48343 return SDValue();
48344
48345 SDValue OpRHS = CmpLHS.getOperand(2);
48346 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
48347 if (!OpRHSC)
48348 return SDValue();
48349
48350 APInt Addend = OpRHSC->getAPIntValue();
48351 if (Opc == ISD::ATOMIC_LOAD_SUB)
48352 Addend = -Addend;
48353
48354 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
48355 if (!CmpRHSC)
48356 return SDValue();
48357
48358 APInt Comparison = CmpRHSC->getAPIntValue();
48359 APInt NegAddend = -Addend;
48360
48361 // See if we can adjust the CC to make the comparison match the negated
48362 // addend.
48363 if (Comparison != NegAddend) {
48364 APInt IncComparison = Comparison + 1;
48365 if (IncComparison == NegAddend) {
48366 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
48367 Comparison = IncComparison;
48368 CC = X86::COND_AE;
48369 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
48370 Comparison = IncComparison;
48371 CC = X86::COND_L;
48372 }
48373 }
48374 APInt DecComparison = Comparison - 1;
48375 if (DecComparison == NegAddend) {
48376 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
48377 Comparison = DecComparison;
48378 CC = X86::COND_A;
48379 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
48380 Comparison = DecComparison;
48381 CC = X86::COND_LE;
48382 }
48383 }
48384 }
48385
48386 // If the addend is the negation of the comparison value, then we can do
48387 // a full comparison by emitting the atomic arithmetic as a locked sub.
48388 if (Comparison == NegAddend) {
48389 // The CC is fine, but we need to rewrite the LHS of the comparison as an
48390 // atomic sub.
48391 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
48392 auto AtomicSub = DAG.getAtomic(
48393 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
48394 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
48395 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
48396 AN->getMemOperand());
48397 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
48398 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48399 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48400 return LockOp;
48401 }
48402
48403 // We can handle comparisons with zero in a number of cases by manipulating
48404 // the CC used.
48405 if (!Comparison.isZero())
48406 return SDValue();
48407
48408 if (CC == X86::COND_S && Addend == 1)
48409 CC = X86::COND_LE;
48410 else if (CC == X86::COND_NS && Addend == 1)
48411 CC = X86::COND_G;
48412 else if (CC == X86::COND_G && Addend == -1)
48413 CC = X86::COND_GE;
48414 else if (CC == X86::COND_LE && Addend == -1)
48415 CC = X86::COND_L;
48416 else
48417 return SDValue();
48418
48419 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
48420 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48421 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48422 return LockOp;
48423}
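// Illustrative compile-time check (not part of the upstream file) of the
// scalar identity behind the (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
// style rewrites above; the chosen X86 condition codes additionally cover
// the overflow case that a plain C comparison cannot express.
static_assert(((5 < 0) == (5 + 1 <= 0)) && ((-1 < 0) == (-1 + 1 <= 0)) &&
((-7 < 0) == (-7 + 1 <= 0)),
"x < 0 matches x + 1 <= 0 away from overflow");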
48424
48425// Check whether we're just testing the signbit, and whether we can simplify
48426// this by tracking where the signbit came from.
48427 static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
48428 SelectionDAG &DAG) {
48429 if (CC != X86::COND_S && CC != X86::COND_NS)
48430 return SDValue();
48431
48432 if (!Cmp.hasOneUse())
48433 return SDValue();
48434
48435 SDValue Src;
48436 if (Cmp.getOpcode() == X86ISD::CMP) {
48437 // CMP(X,0) -> signbit test
48438 if (!isNullConstant(Cmp.getOperand(1)))
48439 return SDValue();
48440 Src = Cmp.getOperand(0);
48441 // Peek through a SRA node as we just need the signbit.
48442 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
48443 // TODO: Use SimplifyDemandedBits instead of just SRA?
48444 if (Src.getOpcode() != ISD::SRA || !Src.hasOneUse())
48445 return SDValue();
48446 Src = Src.getOperand(0);
48447 } else if (Cmp.getOpcode() == X86ISD::OR) {
48448 // OR(X,Y) -> see if only one operand contributes to the signbit.
48449 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
48450 if (DAG.SignBitIsZero(Cmp.getOperand(0)))
48451 Src = Cmp.getOperand(1);
48452 else if (DAG.SignBitIsZero(Cmp.getOperand(1)))
48453 Src = Cmp.getOperand(0);
48454 else
48455 return SDValue();
48456 } else {
48457 return SDValue();
48458 }
48459
48460 // Replace with a TEST on the MSB.
48461 SDLoc DL(Cmp);
48462 MVT SrcVT = Src.getSimpleValueType();
48463 APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits());
48464
48465 // If Src came from a SIGN_EXTEND_INREG or SHL (probably from an expanded
48466 // SIGN_EXTEND_INREG), then peek through and adjust the TEST bit.
48467 if (Src.getOpcode() == ISD::SHL) {
48468 if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) {
48469 Src = Src.getOperand(0);
48470 BitMask.lshrInPlace(*ShiftAmt);
48471 }
48472 } else if (Src.getOpcode() == ISD::SIGN_EXTEND_INREG) {
48473 EVT ExtVT = cast<VTSDNode>(Src.getOperand(1))->getVT();
48474 Src = Src.getOperand(0);
48475 BitMask.lshrInPlace(BitMask.getBitWidth() - ExtVT.getScalarSizeInBits());
48476 }
48477
48478 SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src,
48479 DAG.getConstant(BitMask, DL, SrcVT));
48480 CC = CC == X86::COND_S ? X86::COND_NE : X86::COND_E;
48481 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Mask,
48482 DAG.getConstant(0, DL, SrcVT));
48483}
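// Illustrative compile-time check (not part of the upstream file): testing
// the sign of a value is the same as masking its top bit, and for a
// left-shifted value the tested bit moves down by the shift amount, which is
// what the BitMask adjustment above implements.
static_assert((0x80000000u & 0x80000000u) != 0u && (5u & 0x80000000u) == 0u,
"sign test is an MSB test");
static_assert((((1u << 28) << 3) & 0x80000000u) != 0u,
"after SHL by 3 the sign bit comes from bit 28 of the source");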
48484
48485// Check whether a boolean test is testing a boolean value generated by
48486// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
48487// code.
48488//
48489// Simplify the following patterns:
48490// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
48491// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
48492// to (Op EFLAGS Cond)
48493//
48494// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
48495// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
48496// to (Op EFLAGS !Cond)
48497//
48498// where Op could be BRCOND or CMOV.
48499//
48500 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
48501 // This combine only operates on CMP-like nodes.
48502 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48503 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48504 return SDValue();
48505
48506 // Quit if not used as a boolean value.
48507 if (CC != X86::COND_E && CC != X86::COND_NE)
48508 return SDValue();
48509
48510 // Check CMP operands. One of them should be 0 or 1 and the other should be
48511 // an SetCC or extended from it.
48512 SDValue Op1 = Cmp.getOperand(0);
48513 SDValue Op2 = Cmp.getOperand(1);
48514
48515 SDValue SetCC;
48516 const ConstantSDNode* C = nullptr;
48517 bool needOppositeCond = (CC == X86::COND_E);
48518 bool checkAgainstTrue = false; // Is it a comparison against 1?
48519
48520 if ((C = dyn_cast<ConstantSDNode>(Op1)))
48521 SetCC = Op2;
48522 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
48523 SetCC = Op1;
48524 else // Quit if neither operand is a constant.
48525 return SDValue();
48526
48527 if (C->getZExtValue() == 1) {
48528 needOppositeCond = !needOppositeCond;
48529 checkAgainstTrue = true;
48530 } else if (C->getZExtValue() != 0)
48531 // Quit if the constant is neither 0 nor 1.
48532 return SDValue();
48533
48534 bool truncatedToBoolWithAnd = false;
48535 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
48536 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
48537 SetCC.getOpcode() == ISD::TRUNCATE ||
48538 SetCC.getOpcode() == ISD::AND) {
48539 if (SetCC.getOpcode() == ISD::AND) {
48540 int OpIdx = -1;
48541 if (isOneConstant(SetCC.getOperand(0)))
48542 OpIdx = 1;
48543 if (isOneConstant(SetCC.getOperand(1)))
48544 OpIdx = 0;
48545 if (OpIdx < 0)
48546 break;
48547 SetCC = SetCC.getOperand(OpIdx);
48548 truncatedToBoolWithAnd = true;
48549 } else
48550 SetCC = SetCC.getOperand(0);
48551 }
48552
48553 switch (SetCC.getOpcode()) {
48554 case X86ISD::SETCC_CARRY:
48555 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
48556 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
48557 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
48558 // truncated to i1 using 'and'.
48559 if (checkAgainstTrue && !truncatedToBoolWithAnd)
48560 break;
48561 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
48562 "Invalid use of SETCC_CARRY!");
48563 [[fallthrough]];
48564 case X86ISD::SETCC:
48565 // Set the condition code or opposite one if necessary.
48566 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
48567 if (needOppositeCond)
48568 CC = X86::GetOppositeBranchCondition(CC);
48569 return SetCC.getOperand(1);
48570 case X86ISD::CMOV: {
48571 // Check whether false/true value has canonical one, i.e. 0 or 1.
48572 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
48573 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
48574 // Quit if true value is not a constant.
48575 if (!TVal)
48576 return SDValue();
48577 // Quit if false value is not a constant.
48578 if (!FVal) {
48579 SDValue Op = SetCC.getOperand(0);
48580 // Skip 'zext' or 'trunc' node.
48581 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
48582 Op.getOpcode() == ISD::TRUNCATE)
48583 Op = Op.getOperand(0);
48584 // A special case for rdrand/rdseed, where 0 is set if false cond is
48585 // found.
48586 if ((Op.getOpcode() != X86ISD::RDRAND &&
48587 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
48588 return SDValue();
48589 }
48590 // Quit if false value is not the constant 0 or 1.
48591 bool FValIsFalse = true;
48592 if (FVal && FVal->getZExtValue() != 0) {
48593 if (FVal->getZExtValue() != 1)
48594 return SDValue();
48595 // If FVal is 1, opposite cond is needed.
48596 needOppositeCond = !needOppositeCond;
48597 FValIsFalse = false;
48598 }
48599 // Quit if TVal is not the constant opposite of FVal.
48600 if (FValIsFalse && TVal->getZExtValue() != 1)
48601 return SDValue();
48602 if (!FValIsFalse && TVal->getZExtValue() != 0)
48603 return SDValue();
48604 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
48605 if (needOppositeCond)
48606 CC = X86::GetOppositeBranchCondition(CC);
48607 return SetCC.getOperand(3);
48608 }
48609 }
48610
48611 return SDValue();
48612}
48613
48614/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
48615/// Match:
48616/// (X86or (X86setcc) (X86setcc))
48617/// (X86cmp (and (X86setcc) (X86setcc)), 0)
48618 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
48619 X86::CondCode &CC1, SDValue &Flags,
48620 bool &isAnd) {
48621 if (Cond->getOpcode() == X86ISD::CMP) {
48622 if (!isNullConstant(Cond->getOperand(1)))
48623 return false;
48624
48625 Cond = Cond->getOperand(0);
48626 }
48627
48628 isAnd = false;
48629
48630 SDValue SetCC0, SetCC1;
48631 switch (Cond->getOpcode()) {
48632 default: return false;
48633 case ISD::AND:
48634 case X86ISD::AND:
48635 isAnd = true;
48636 [[fallthrough]];
48637 case ISD::OR:
48638 case X86ISD::OR:
48639 SetCC0 = Cond->getOperand(0);
48640 SetCC1 = Cond->getOperand(1);
48641 break;
48642 };
48643
48644 // Make sure we have SETCC nodes, using the same flags value.
48645 if (SetCC0.getOpcode() != X86ISD::SETCC ||
48646 SetCC1.getOpcode() != X86ISD::SETCC ||
48647 SetCC0->getOperand(1) != SetCC1->getOperand(1))
48648 return false;
48649
48650 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
48651 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
48652 Flags = SetCC0->getOperand(1);
48653 return true;
48654}
48655
48656// When legalizing carry, we create carries via add X, -1
48657// If that comes from an actual carry, via setcc, we use the
48658// carry directly.
48659 static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
48660 if (EFLAGS.getOpcode() == X86ISD::ADD) {
48661 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
48662 bool FoundAndLSB = false;
48663 SDValue Carry = EFLAGS.getOperand(0);
48664 while (Carry.getOpcode() == ISD::TRUNCATE ||
48665 Carry.getOpcode() == ISD::ZERO_EXTEND ||
48666 (Carry.getOpcode() == ISD::AND &&
48667 isOneConstant(Carry.getOperand(1)))) {
48668 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
48669 Carry = Carry.getOperand(0);
48670 }
48671 if (Carry.getOpcode() == X86ISD::SETCC ||
48672 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
48673 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
48674 uint64_t CarryCC = Carry.getConstantOperandVal(0);
48675 SDValue CarryOp1 = Carry.getOperand(1);
48676 if (CarryCC == X86::COND_B)
48677 return CarryOp1;
48678 if (CarryCC == X86::COND_A) {
48679 // Try to convert COND_A into COND_B in an attempt to facilitate
48680 // materializing "setb reg".
48681 //
48682 // Do not flip "e > c", where "c" is a constant, because Cmp
48683 // instruction cannot take an immediate as its first operand.
48684 //
48685 if (CarryOp1.getOpcode() == X86ISD::SUB &&
48686 CarryOp1.getNode()->hasOneUse() &&
48687 CarryOp1.getValueType().isInteger() &&
48688 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
48689 SDValue SubCommute =
48690 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
48691 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
48692 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
48693 }
48694 }
48695 // If this is a check of the z flag of an add with 1, switch to the
48696 // C flag.
48697 if (CarryCC == X86::COND_E &&
48698 CarryOp1.getOpcode() == X86ISD::ADD &&
48699 isOneConstant(CarryOp1.getOperand(1)))
48700 return CarryOp1;
48701 } else if (FoundAndLSB) {
48702 SDLoc DL(Carry);
48703 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
48704 if (Carry.getOpcode() == ISD::SRL) {
48705 BitNo = Carry.getOperand(1);
48706 Carry = Carry.getOperand(0);
48707 }
48708 return getBT(Carry, BitNo, DL, DAG);
48709 }
48710 }
48711 }
48712
48713 return SDValue();
48714}
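// Illustrative compile-time check (not part of the upstream file): adding an
// all-ones value produces a carry-out exactly when the other addend is
// non-zero, which is why a carry materialized as "add X, -1" can be traced
// back to the original setcc above.
static_assert(3u + ~0u < 3u, "X != 0 sets the carry");
static_assert(!(0u + ~0u < 0u), "X == 0 does not");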
48715
48716/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
48717/// to avoid the inversion.
48718 static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
48719 SelectionDAG &DAG,
48720 const X86Subtarget &Subtarget) {
48721 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
48722 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
48723 EFLAGS.getOpcode() != X86ISD::TESTP)
48724 return SDValue();
48725
48726 // PTEST/TESTP sets EFLAGS as:
48727 // TESTZ: ZF = (Op0 & Op1) == 0
48728 // TESTC: CF = (~Op0 & Op1) == 0
48729 // TESTNZC: ZF == 0 && CF == 0
48730 MVT VT = EFLAGS.getSimpleValueType();
48731 SDValue Op0 = EFLAGS.getOperand(0);
48732 SDValue Op1 = EFLAGS.getOperand(1);
48733 MVT OpVT = Op0.getSimpleValueType();
48734 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48735
48736 // TEST*(~X,Y) == TEST*(X,Y)
48737 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
48738 X86::CondCode InvCC;
48739 switch (CC) {
48740 case X86::COND_B:
48741 // testc -> testz.
48742 InvCC = X86::COND_E;
48743 break;
48744 case X86::COND_AE:
48745 // !testc -> !testz.
48746 InvCC = X86::COND_NE;
48747 break;
48748 case X86::COND_E:
48749 // testz -> testc.
48750 InvCC = X86::COND_B;
48751 break;
48752 case X86::COND_NE:
48753 // !testz -> !testc.
48754 InvCC = X86::COND_AE;
48755 break;
48756 case X86::COND_A:
48757 case X86::COND_BE:
48758 // testnzc -> testnzc (no change).
48759 InvCC = CC;
48760 break;
48761 default:
48762 InvCC = X86::COND_INVALID;
48763 break;
48764 }
48765
48766 if (InvCC != X86::COND_INVALID) {
48767 CC = InvCC;
48768 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48769 DAG.getBitcast(OpVT, NotOp0), Op1);
48770 }
48771 }
48772
48773 if (CC == X86::COND_B || CC == X86::COND_AE) {
48774 // TESTC(X,~X) == TESTC(X,-1)
48775 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48776 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
48777 SDLoc DL(EFLAGS);
48778 return DAG.getNode(
48779 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
48780 DAG.getBitcast(OpVT,
48781 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
48782 }
48783 }
48784 // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X)
48785 if (EFLAGS.getOpcode() == X86ISD::PTEST &&
48786 ISD::isBuildVectorAllOnes(Op1.getNode())) {
48787 SDValue BC0 = peekThroughBitcasts(Op0);
48788 if (BC0.getOpcode() == X86ISD::PCMPEQ &&
48789 ISD::isBuildVectorAllZeros(BC0.getOperand(1).getNode())) {
48790 SDLoc DL(EFLAGS);
48791 CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE);
48792 SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0));
48793 return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X);
48794 }
48795 }
48796 }
48797
48798 if (CC == X86::COND_E || CC == X86::COND_NE) {
48799 // TESTZ(X,~Y) == TESTC(Y,X)
48800 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48801 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48802 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48803 DAG.getBitcast(OpVT, NotOp1), Op0);
48804 }
48805
48806 if (Op0 == Op1) {
48807 SDValue BC = peekThroughBitcasts(Op0);
48808 EVT BCVT = BC.getValueType();
48809
48810 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
48811 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
48812 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48813 DAG.getBitcast(OpVT, BC.getOperand(0)),
48814 DAG.getBitcast(OpVT, BC.getOperand(1)));
48815 }
48816
48817 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
48818 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
48819 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48820 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48821 DAG.getBitcast(OpVT, BC.getOperand(0)),
48822 DAG.getBitcast(OpVT, BC.getOperand(1)));
48823 }
48824
48825 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
48826 // to more efficiently extract the sign bits and compare that.
48827 // TODO: Handle TESTC with comparison inversion.
48828 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
48829 // TESTP/MOVMSK combines to make sure its never worse than PTEST?
48830 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
48831 unsigned EltBits = BCVT.getScalarSizeInBits();
48832 if (DAG.ComputeNumSignBits(BC) == EltBits) {
48833 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
48834 APInt SignMask = APInt::getSignMask(EltBits);
48835 if (SDValue Res =
48836 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
48837 // For vXi16 cases we need to use pmovmksb and extract every other
48838 // sign bit.
48839 SDLoc DL(EFLAGS);
48840 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
48841 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
48842 MVT FloatVT =
48843 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
48844 Res = DAG.getBitcast(FloatVT, Res);
48845 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
48846 } else if (EltBits == 16) {
48847 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
48848 Res = DAG.getBitcast(MovmskVT, Res);
48849 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48850 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
48851 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48852 } else {
48853 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48854 }
48855 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
48856 DAG.getConstant(0, DL, MVT::i32));
48857 }
48858 }
48859 }
48860 }
48861
48862 // TESTZ(-1,X) == TESTZ(X,X)
48863 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
48864 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
48865
48866 // TESTZ(X,-1) == TESTZ(X,X)
48867 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
48868 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
48869
48870 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
48871 // TODO: Add COND_NE handling?
48872 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
48873 SDValue Src0 = peekThroughBitcasts(Op0);
48874 SDValue Src1 = peekThroughBitcasts(Op1);
48875 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
48876 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
48877 peekThroughBitcasts(Src0.getOperand(1)), true);
48878 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
48879 peekThroughBitcasts(Src1.getOperand(1)), true);
48880 if (Src0 && Src1) {
48881 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
48882 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48883 DAG.getBitcast(OpVT2, Src0),
48884 DAG.getBitcast(OpVT2, Src1));
48885 }
48886 }
48887 }
48888 }
48889
48890 return SDValue();
48891}
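namespace {
// Hypothetical 8-bit scalar model (not part of the upstream file) of the
// PTEST flags: ZF is set when (Op0 & Op1) == 0 and CF when (~Op0 & Op1) == 0.
// The model function names are illustrative only.
constexpr bool PTestZFModel(unsigned A, unsigned B) { return (A & B) == 0; }
constexpr bool PTestCFModel(unsigned A, unsigned B) {
return (~A & B & 0xFFu) == 0;
}
// TEST*(~X, Y) exposes the same predicate under the other flag (CF vs ZF),
// which is the CC remapping performed above when IsNOT(Op0) succeeds.
static_assert(PTestCFModel(~0x5Au & 0xFFu, 0x0Fu) == PTestZFModel(0x5Au, 0x0Fu),
"testc on NOT(X) == testz on X");
static_assert(PTestZFModel(~0x5Au & 0xFFu, 0x3Cu) == PTestCFModel(0x5Au, 0x3Cu),
"testz on NOT(X) == testc on X");
} // namespace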
48892
48893// Attempt to simplify the MOVMSK input based on the comparison type.
48894 static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
48895 SelectionDAG &DAG,
48896 const X86Subtarget &Subtarget) {
48897 // Handle eq/ne against zero (any_of).
48898 // Handle eq/ne against -1 (all_of).
48899 if (!(CC == X86::COND_E || CC == X86::COND_NE))
48900 return SDValue();
48901 if (EFLAGS.getValueType() != MVT::i32)
48902 return SDValue();
48903 unsigned CmpOpcode = EFLAGS.getOpcode();
48904 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
48905 return SDValue();
48906 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
48907 if (!CmpConstant)
48908 return SDValue();
48909 const APInt &CmpVal = CmpConstant->getAPIntValue();
48910
48911 SDValue CmpOp = EFLAGS.getOperand(0);
48912 unsigned CmpBits = CmpOp.getValueSizeInBits();
48913 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
48914
48915 // Peek through any truncate.
48916 if (CmpOp.getOpcode() == ISD::TRUNCATE)
48917 CmpOp = CmpOp.getOperand(0);
48918
48919 // Bail if we don't find a MOVMSK.
48920 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
48921 return SDValue();
48922
48923 SDValue Vec = CmpOp.getOperand(0);
48924 MVT VecVT = Vec.getSimpleValueType();
48925 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
48926 "Unexpected MOVMSK operand");
48927 unsigned NumElts = VecVT.getVectorNumElements();
48928 unsigned NumEltBits = VecVT.getScalarSizeInBits();
48929
48930 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
48931 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
48932 NumElts <= CmpBits && CmpVal.isMask(NumElts);
48933 if (!IsAnyOf && !IsAllOf)
48934 return SDValue();
48935
48936 // TODO: Check more combining cases.
48937 // Here we check the number of uses of the cmp to decide whether to combine.
48938 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" folds
48939 // below are known to be profitable under this one-use constraint.
48940 bool IsOneUse = CmpOp.getNode()->hasOneUse();
48941
48942 // See if we can peek through to a vector with a wider element type, if the
48943 // signbits extend down to all the sub-elements as well.
48944 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
48945 // potential SimplifyDemandedBits/Elts cases.
48946 // If we looked through a truncate that discard bits, we can't do this
48947 // transform.
48948 // FIXME: We could do this transform for truncates that discarded bits by
48949 // inserting an AND mask between the new MOVMSK and the CMP.
48950 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
48951 SDValue BC = peekThroughBitcasts(Vec);
48952 MVT BCVT = BC.getSimpleValueType();
48953 unsigned BCNumElts = BCVT.getVectorNumElements();
48954 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
48955 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
48956 BCNumEltBits > NumEltBits &&
48957 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
48958 SDLoc DL(EFLAGS);
48959 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
48960 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48961 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
48962 DAG.getConstant(CmpMask, DL, MVT::i32));
48963 }
48964 }
48965
48966 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
48967 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
48968 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
48969 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
48970 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
48971 SmallVector<SDValue> Ops;
48972 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
48973 Ops.size() == 2) {
48974 SDLoc DL(EFLAGS);
48975 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
48976 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
48977 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
48978 DAG.getBitcast(SubVT, Ops[0]),
48979 DAG.getBitcast(SubVT, Ops[1]));
48980 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
48981 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48982 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
48983 DAG.getConstant(CmpMask, DL, MVT::i32));
48984 }
48985 }
48986
48987 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48988 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48989 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48990 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
48991 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
48992 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
48993 SDValue BC = peekThroughBitcasts(Vec);
48994 // Ensure MOVMSK was testing every signbit of BC.
48995 if (BC.getValueType().getVectorNumElements() <= NumElts) {
48996 if (BC.getOpcode() == X86ISD::PCMPEQ) {
48997 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
48998 BC.getOperand(0), BC.getOperand(1));
48999 V = DAG.getBitcast(TestVT, V);
49000 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49001 }
49002 // Check for 256-bit split vector cases.
49003 if (BC.getOpcode() == ISD::AND &&
49004 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
49005 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
49006 SDValue LHS = BC.getOperand(0);
49007 SDValue RHS = BC.getOperand(1);
49008 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
49009 LHS.getOperand(0), LHS.getOperand(1));
49010 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
49011 RHS.getOperand(0), RHS.getOperand(1));
49012 LHS = DAG.getBitcast(TestVT, LHS);
49013 RHS = DAG.getBitcast(TestVT, RHS);
49014 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
49015 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49016 }
49017 }
49018 }
49019
49020 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
49021 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
49022 // sign bits prior to the comparison with zero unless we know that
49023 // the vXi16 splats the sign bit down to the lower i8 half.
49024 // TODO: Handle all_of patterns.
49025 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
49026 SDValue VecOp0 = Vec.getOperand(0);
49027 SDValue VecOp1 = Vec.getOperand(1);
49028 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
49029 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
49030 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
49031 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
49032 SDLoc DL(EFLAGS);
49033 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
49034 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49035 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
49036 if (!SignExt0) {
49037 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
49038 DAG.getConstant(0xAAAA, DL, MVT::i16));
49039 }
49040 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49041 DAG.getConstant(0, DL, MVT::i16));
49042 }
49043 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
49044 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
49045 if (CmpBits >= 16 && Subtarget.hasInt256() &&
49046 (IsAnyOf || (SignExt0 && SignExt1))) {
49047 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
49048 SDLoc DL(EFLAGS);
49049 SDValue Result = peekThroughBitcasts(Src);
49050 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
49051 Result.getValueType().getVectorNumElements() <= NumElts) {
49052 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
49053 Result.getOperand(0), Result.getOperand(1));
49054 V = DAG.getBitcast(MVT::v4i64, V);
49055 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49056 }
49057 Result = DAG.getBitcast(MVT::v32i8, Result);
49058 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49059 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
49060 if (!SignExt0 || !SignExt1) {
49061 assert(IsAnyOf &&
49062 "Only perform v16i16 signmasks for any_of patterns");
49063 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
49064 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
49065 }
49066 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49067 DAG.getConstant(CmpMask, DL, MVT::i32));
49068 }
49069 }
49070 }
49071
49072 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
49073 // Since we peek through a bitcast, we need to be careful if the base vector
49074 // type has smaller elements than the MOVMSK type. In that case, even if
49075 // all the elements are demanded by the shuffle mask, only the "high"
49076 // elements which have highbits that align with highbits in the MOVMSK vec
49077 // elements are actually demanded. A simplification of spurious operations
49078 // on the "low" elements takes place during other simplifications.
49079 //
49080 // For example:
49081 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))): even though all the elements are
49082 // demanded, the result can change because the i32 halves of each i64 swap.
49083 //
49084 // To address this, we check that we can scale the shuffle mask to MOVMSK
49085 // element width (this will ensure "high" elements match). It's slightly overly
49086 // conservative, but fine for an edge case fold.
49087 SmallVector<int, 32> ShuffleMask;
49088 SmallVector<SDValue, 2> ShuffleInputs;
49089 if (NumElts <= CmpBits &&
49090 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
49091 ShuffleMask, DAG) &&
49092 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
49093 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
49094 canScaleShuffleElements(ShuffleMask, NumElts)) {
49095 SDLoc DL(EFLAGS);
49096 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
49097 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49098 Result =
49099 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
49100 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
49101 }
49102
49103 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
49104 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
49105 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
49106 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
49107 // iff every element is referenced.
49108 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
49109 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
49110 (NumEltBits == 32 || NumEltBits == 64)) {
49111 SDLoc DL(EFLAGS);
49112 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
49113 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
49114 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
49115 SDValue LHS = Vec;
49116 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
49117 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
49118 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
49119 DAG.getBitcast(FloatVT, LHS),
49120 DAG.getBitcast(FloatVT, RHS));
49121 }
49122
49123 return SDValue();
49124}
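namespace {
// Hypothetical four-lane scalar model (not part of the upstream file) of
// MOVMSK: gather each lane's sign bit into the low bits of the result.
constexpr unsigned MovMsk4Model(int A, int B, int C, int D) {
return (A < 0 ? 1u : 0u) | (B < 0 ? 2u : 0u) | (C < 0 ? 4u : 0u) |
(D < 0 ? 8u : 0u);
}
// any_of(signbit) is "MOVMSK != 0" and all_of(signbit) is "MOVMSK == 0b1111",
// which is what the CmpVal.isZero() / CmpVal.isMask(NumElts) classification
// above distinguishes.
static_assert(MovMsk4Model(0, -1, 0, 0) != 0u, "any_of");
static_assert(MovMsk4Model(-1, -2, -3, -4) == 0xFu, "all_of");
} // namespace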
49125
49126/// Optimize an EFLAGS definition used according to the condition code \p CC
49127/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
49128/// uses of chain values.
49129 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
49130 SelectionDAG &DAG,
49131 const X86Subtarget &Subtarget) {
49132 if (CC == X86::COND_B)
49133 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
49134 return Flags;
49135
49136 if (SDValue R = checkSignTestSetCCCombine(EFLAGS, CC, DAG))
49137 return R;
49138
49139 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
49140 return R;
49141
49142 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
49143 return R;
49144
49145 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
49146 return R;
49147
49148 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
49149}
49150
49151/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
49152 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
49153 TargetLowering::DAGCombinerInfo &DCI,
49154 const X86Subtarget &Subtarget) {
49155 SDLoc DL(N);
49156 EVT VT = N->getValueType(0);
49157 SDValue FalseOp = N->getOperand(0);
49158 SDValue TrueOp = N->getOperand(1);
49159 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
49160 SDValue Cond = N->getOperand(3);
49161
49162 // cmov X, X, ?, ? --> X
49163 if (TrueOp == FalseOp)
49164 return TrueOp;
49165
49166 // Try to simplify the EFLAGS and condition code operands.
49167 // We can't always do this as FCMOV only supports a subset of X86 cond.
49168 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
49169 if (!(FalseOp.getValueType() == MVT::f80 ||
49170 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
49171 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
49172 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
49173 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
49174 Flags};
49175 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49176 }
49177 }
49178
49179 // If this is a select between two integer constants, try to do some
49180 // optimizations. Note that the operands are ordered the opposite of SELECT
49181 // operands.
49182 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
49183 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
49184 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
49185 // larger than FalseC (the false value).
49186 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
49187 CC = X86::GetOppositeBranchCondition(CC);
49188 std::swap(TrueC, FalseC);
49189 std::swap(TrueOp, FalseOp);
49190 }
49191
49192 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
49193 // This is efficient for any integer data type (including i8/i16) and
49194 // shift amount.
49195 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
49196 Cond = getSETCC(CC, Cond, DL, DAG);
49197
49198 // Zero extend the condition if needed.
49199 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
49200
49201 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
49202 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
49203 DAG.getConstant(ShAmt, DL, MVT::i8));
49204 return Cond;
49205 }
49206
49207 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
49208 // for any integer data type, including i8/i16.
49209 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
49210 Cond = getSETCC(CC, Cond, DL, DAG);
49211
49212 // Zero extend the condition if needed.
49213 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
49214 FalseC->getValueType(0), Cond);
49215 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49216 SDValue(FalseC, 0));
49217 return Cond;
49218 }
49219
49220 // Optimize cases that will turn into an LEA instruction. This requires
49221 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
49222 if (VT == MVT::i32 || VT == MVT::i64) {
49223 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
49224 assert(Diff.getBitWidth() == VT.getSizeInBits() &&
49225 "Implicit constant truncation");
49226
49227 bool isFastMultiplier = false;
49228 if (Diff.ult(10)) {
49229 switch (Diff.getZExtValue()) {
49230 default: break;
49231 case 1: // result = add base, cond
49232 case 2: // result = lea base( , cond*2)
49233 case 3: // result = lea base(cond, cond*2)
49234 case 4: // result = lea base( , cond*4)
49235 case 5: // result = lea base(cond, cond*4)
49236 case 8: // result = lea base( , cond*8)
49237 case 9: // result = lea base(cond, cond*8)
49238 isFastMultiplier = true;
49239 break;
49240 }
49241 }
49242
49243 if (isFastMultiplier) {
49244 Cond = getSETCC(CC, Cond, DL ,DAG);
49245 // Zero extend the condition if needed.
49246 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
49247 Cond);
49248 // Scale the condition by the difference.
49249 if (Diff != 1)
49250 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
49251 DAG.getConstant(Diff, DL, Cond.getValueType()));
49252
49253 // Add the base if non-zero.
49254 if (FalseC->getAPIntValue() != 0)
49255 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49256 SDValue(FalseC, 0));
49257 return Cond;
49258 }
49259 }
49260 }
49261 }
49262
49263 // Handle these cases:
49264 // (select (x != c), e, c) -> (select (x != c), e, x),
49265 // (select (x == c), c, e) -> (select (x == c), x, e)
49266 // where the c is an integer constant, and the "select" is the combination
49267 // of CMOV and CMP.
49268 //
49269 // The rationale for this change is that the conditional-move from a constant
49270 // needs two instructions, whereas conditional-move from a register needs
49271 // only one instruction.
49272 //
49273 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
49274 // some instruction-combining opportunities. This opt needs to be
49275 // postponed as late as possible.
49276 //
49277 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
49278 // the DCI.xxxx conditions are provided to postpone the optimization as
49279 // late as possible.
49280
49281 ConstantSDNode *CmpAgainst = nullptr;
49282 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
49283 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
49284 !isa<ConstantSDNode>(Cond.getOperand(0))) {
49285
49286 if (CC == X86::COND_NE &&
49287 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
49288 CC = X86::COND_E;
49289 std::swap(TrueOp, FalseOp);
49290 }
49291
49292 if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
49293 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
49294 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
49295 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49296 }
49297 }
49298 }
49299
49300 // Transform:
49301 //
49302 // (cmov 1 T (uge T 2))
49303 //
49304 // to:
49305 //
49306 // (adc T 0 (sub T 1))
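 // The subtract (sub T, 1) sets the carry flag exactly when T == 0, so the
 // ADC computes T + (T == 0 ? 1 : 0): this yields 1 when T is 0 or 1 and T
 // otherwise, which is exactly what the original cmov selects.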
49307 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
49308 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
49309 SDValue Cond0 = Cond.getOperand(0);
49310 if (Cond0.getOpcode() == ISD::TRUNCATE)
49311 Cond0 = Cond0.getOperand(0);
49312 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
49313 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
49314 EVT CondVT = Cond->getValueType(0);
49315 // Subtract 1 and generate a carry.
49316 SDValue NewSub =
49317 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
49318 DAG.getConstant(1, DL, CondVT));
49319 SDValue EFLAGS(NewSub.getNode(), 1);
49320 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), TrueOp,
49321 DAG.getConstant(0, DL, VT), EFLAGS);
49322 }
49323 }
49324
49325 // Fold and/or of setcc's to double CMOV:
49326 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
49327 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
49328 //
49329 // This combine lets us generate:
49330 // cmovcc1 (jcc1 if we don't have CMOV)
49331 // cmovcc2 (same)
49332 // instead of:
49333 // setcc1
49334 // setcc2
49335 // and/or
49336 // cmovne (jne if we don't have CMOV)
49337 // When we can't use the CMOV instruction, it might increase branch
49338 // mispredicts.
49339 // When we can use CMOV, or when there is no mispredict, this improves
49340 // throughput and reduces register pressure.
49341 //
49342 if (CC == X86::COND_NE) {
49343 SDValue Flags;
49344 X86::CondCode CC0, CC1;
49345 bool isAndSetCC;
49346 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
49347 if (isAndSetCC) {
49348 std::swap(FalseOp, TrueOp);
49349 CC0 = X86::GetOppositeBranchCondition(CC0);
49350 CC1 = X86::GetOppositeBranchCondition(CC1);
49351 }
49352
49353 SDValue LOps[] = {FalseOp, TrueOp,
49354 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
49355 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, VT, LOps);
49356 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
49357 Flags};
49358 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49359 return CMOV;
49360 }
49361 }
49362
49363 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
49364 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
49365 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
49366 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
49367 // Or (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X)
49368 // TODO: Or (CMOV (BSF ?, X), Y, (X == 0)) -> (BSF Y, X)
49369 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
49370 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
49371 SDValue Add = TrueOp;
49372 SDValue Const = FalseOp;
49373 // Canonicalize the condition code for easier matching and output.
49374 if (CC == X86::COND_E)
49375 std::swap(Add, Const);
49376
49377 // TODO: ADD BSF support, but requires changes to the "REP BSF" CTTZ hack.
49378 if (Subtarget.hasBitScanPassThrough() && Add.getOpcode() == X86ISD::BSR &&
49379 Add.getResNo() == 0 && Add.hasOneUse() &&
49380 Add.getOperand(1) == Cond.getOperand(0)) {
49381 return DAG.getNode(Add.getOpcode(), DL, Add->getVTList(), Const,
49382 Add.getOperand(1));
49383 }
49384
49385 // We might have replaced the constant in the cmov with the LHS of the
49386 // compare. If so change it to the RHS of the compare.
49387 if (Const == Cond.getOperand(0))
49388 Const = Cond.getOperand(1);
49389
49390 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
49391 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
49392 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
49393 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
49394 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
49395 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
49396 // This should constant fold.
49397 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
49398 SDValue CMov =
49399 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
49400 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
49401 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
49402 }
49403 }
49404
49405 return SDValue();
49406}
49407
49408/// Different mul shrinking modes.
49409 enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
49410
49411 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
49412 EVT VT = N->getOperand(0).getValueType();
49413 if (VT.getScalarSizeInBits() != 32)
49414 return false;
49415
49416 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
49417 unsigned SignBits[2] = {1, 1};
49418 bool IsPositive[2] = {false, false};
49419 for (unsigned i = 0; i < 2; i++) {
49420 SDValue Opd = N->getOperand(i);
49421
49422 SignBits[i] = DAG.ComputeNumSignBits(Opd);
49423 IsPositive[i] = DAG.SignBitIsZero(Opd);
49424 }
49425
49426 bool AllPositive = IsPositive[0] && IsPositive[1];
49427 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
49428 // When ranges are from -128 ~ 127, use MULS8 mode.
49429 if (MinSignBits >= 25)
49430 Mode = ShrinkMode::MULS8;
49431 // When ranges are from 0 ~ 255, use MULU8 mode.
49432 else if (AllPositive && MinSignBits >= 24)
49433 Mode = ShrinkMode::MULU8;
49434 // When ranges are from -32768 ~ 32767, use MULS16 mode.
49435 else if (MinSignBits >= 17)
49436 Mode = ShrinkMode::MULS16;
49437 // When ranges are from 0 ~ 65535, use MULU16 mode.
49438 else if (AllPositive && MinSignBits >= 16)
49439 Mode = ShrinkMode::MULU16;
49440 else
49441 return false;
49442 return true;
49443}
49444
49445/// When the operands of vector mul are extended from smaller size values,
49446 /// like i8 and i16, the type of mul may be shrunk to generate more
49447/// efficient code. Two typical patterns are handled:
49448/// Pattern1:
49449/// %2 = sext/zext <N x i8> %1 to <N x i32>
49450/// %4 = sext/zext <N x i8> %3 to <N x i32>
49451 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49452/// %5 = mul <N x i32> %2, %4
49453///
49454/// Pattern2:
49455/// %2 = zext/sext <N x i16> %1 to <N x i32>
49456/// %4 = zext/sext <N x i16> %3 to <N x i32>
49457/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49458/// %5 = mul <N x i32> %2, %4
49459///
49460/// There are four mul shrinking modes:
49461/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
49462 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
49463/// generate pmullw+sext32 for it (MULS8 mode).
49464/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
49465/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
49466/// generate pmullw+zext32 for it (MULU8 mode).
49467/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
49468/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
49469/// generate pmullw+pmulhw for it (MULS16 mode).
49470/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
49471/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
49472/// generate pmullw+pmulhuw for it (MULU16 mode).
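/// For example, with <8 x i32> operands that are both zext from <8 x i8>
/// (MULU8 mode), the operands are truncated to <8 x i16>, multiplied with
/// pmullw, and the 16-bit products are zero-extended back to <8 x i32>; in
/// MULU16/MULS16 mode pmulhuw/pmulhw additionally supply the high 16 bits,
/// which are interleaved with the low halves below.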
49473 static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49474 const X86Subtarget &Subtarget) {
49475 // Check for legality
49476 // pmullw/pmulhw are not supported by SSE.
49477 if (!Subtarget.hasSSE2())
49478 return SDValue();
49479
49480 // Check for profitability
49481 // pmulld is supported since SSE41. It is better to use pmulld
49482 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
49483 // the expansion.
49484 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
49485 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
49486 return SDValue();
49487
49489 if (!canReduceVMulWidth(N, DAG, Mode))
49490 return SDValue();
49491
49492 SDValue N0 = N->getOperand(0);
49493 SDValue N1 = N->getOperand(1);
49494 EVT VT = N->getOperand(0).getValueType();
49495 unsigned NumElts = VT.getVectorNumElements();
49496 if ((NumElts % 2) != 0)
49497 return SDValue();
49498
49499 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
49500
49501 // Shrink the operands of mul.
49502 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
49503 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
49504
49505 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
49506 // lower part is needed.
49507 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
49508 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
49509 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
49510 : ISD::SIGN_EXTEND,
49511 DL, VT, MulLo);
49512
49513 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
49514 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
49515 // the higher part is also needed.
49516 SDValue MulHi =
49517 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
49518 ReducedVT, NewN0, NewN1);
49519
49520 // Repack the lower part and higher part result of mul into a wider
49521 // result.
49522 // Generate shuffle functioning as punpcklwd.
49523 SmallVector<int, 16> ShuffleMask(NumElts);
49524 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49525 ShuffleMask[2 * i] = i;
49526 ShuffleMask[2 * i + 1] = i + NumElts;
49527 }
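 // E.g. for NumElts == 8 this produces the mask <0,8,1,9,2,10,3,11>, i.e. the
 // low-half interleave performed by punpcklwd.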
49528 SDValue ResLo =
49529 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49530 ResLo = DAG.getBitcast(ResVT, ResLo);
49531 // Generate shuffle functioning as punpckhwd.
49532 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49533 ShuffleMask[2 * i] = i + NumElts / 2;
49534 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
49535 }
49536 SDValue ResHi =
49537 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49538 ResHi = DAG.getBitcast(ResVT, ResHi);
49539 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
49540}
49541
49542 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
49543 EVT VT, const SDLoc &DL) {
49544
49545 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
49546 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49547 DAG.getConstant(Mult, DL, VT));
49548 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
49549 DAG.getConstant(Shift, DL, MVT::i8));
49550 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49551 N->getOperand(0));
49552 return Result;
49553 };
49554
49555 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
49556 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49557 DAG.getConstant(Mul1, DL, VT));
49558 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
49559 DAG.getConstant(Mul2, DL, VT));
49560 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49561 N->getOperand(0));
49562 return Result;
49563 };
49564
49565 switch (MulAmt) {
49566 default:
49567 break;
49568 case 11:
49569 // mul x, 11 => add ((shl (mul x, 5), 1), x)
49570 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
49571 case 21:
49572 // mul x, 21 => add ((shl (mul x, 5), 2), x)
49573 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
49574 case 41:
49575 // mul x, 41 => add ((shl (mul x, 5), 3), x)
49576 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
49577 case 22:
49578 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
49579 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49580 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
49581 case 19:
49582 // mul x, 19 => add ((shl (mul x, 9), 1), x)
49583 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
49584 case 37:
49585 // mul x, 37 => add ((shl (mul x, 9), 2), x)
49586 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
49587 case 73:
49588 // mul x, 73 => add ((shl (mul x, 9), 3), x)
49589 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
49590 case 13:
49591 // mul x, 13 => add ((shl (mul x, 3), 2), x)
49592 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
49593 case 23:
49594 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
49595 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
49596 case 26:
49597 // mul x, 26 => add ((mul (mul x, 5), 5), x)
49598 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
49599 case 28:
49600 // mul x, 28 => add ((mul (mul x, 9), 3), x)
49601 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
49602 case 29:
49603 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
49604 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49605 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
49606 }
49607
49608 // Another trick. If this is a power of 2 plus 2, 4 or 8, we can use a shift
49609 // followed by a single LEA.
49610 // First, check if this is a sum of two powers of 2, because that's easy. Then
49611 // count the trailing zeros below the lowest set bit.
49612 // TODO: We can do this even without LEA at a cost of two shifts and an add.
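 // E.g. MulAmt == 34 (32 + 2): ScaleShift == 1 and ShiftAmt == 5, producing
 // (x << 5) + (x << 1); the second term can fold into the LEA's scale-by-2
 // addressing mode.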
49613 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
49614 unsigned ScaleShift = llvm::countr_zero(MulAmt);
49615 if (ScaleShift >= 1 && ScaleShift < 4) {
49616 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
49617 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49618 DAG.getConstant(ShiftAmt, DL, MVT::i8));
49619 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49620 DAG.getConstant(ScaleShift, DL, MVT::i8));
49621 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
49622 }
49623 }
49624
49625 return SDValue();
49626}
49627
49628 // If the upper 17 bits of either operand are zero and the upper bits of the other
49629 // operand are all zero or sign bits, then we can use PMADDWD, which is always at least as quick as
49630// PMULLD, except on KNL.
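// PMADDWD multiplies the two i16 halves of each i32 lane and adds the adjacent
// products; when one operand has the upper 17 bits of every lane clear, the
// hi*hi term is zero and the low half is non-negative, so the horizontal add
// degenerates into a single signed 16 x 16 -> 32 multiply.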
49631 static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
49632 SelectionDAG &DAG,
49633 const X86Subtarget &Subtarget) {
49634 if (!Subtarget.hasSSE2())
49635 return SDValue();
49636
49637 if (Subtarget.isPMADDWDSlow())
49638 return SDValue();
49639
49640 EVT VT = N->getValueType(0);
49641
49642 // Only support vXi32 vectors.
49643 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
49644 return SDValue();
49645
49646 // Make sure the type is legal or can split/widen to a legal type.
49647 // With AVX512 but without BWI, we would need to split v32i16.
49648 unsigned NumElts = VT.getVectorNumElements();
49649 if (NumElts == 1 || !isPowerOf2_32(NumElts))
49650 return SDValue();
49651
49652 // With AVX512 but without BWI, we would need to split v32i16.
49653 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
49654 return SDValue();
49655
49656 SDValue N0 = N->getOperand(0);
49657 SDValue N1 = N->getOperand(1);
49658
49659 // If we are zero/sign extending two steps without SSE4.1, it's better to
49660 // reduce the vmul width instead.
49661 if (!Subtarget.hasSSE41() &&
49662 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
49663 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49664 (N1.getOpcode() == ISD::ZERO_EXTEND &&
49665 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
49666 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
49667 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49668 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49669 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
49670 return SDValue();
49671
49672 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
49673 // the vmul width instead.
49674 if (!Subtarget.hasSSE41() &&
49675 (N0.getOpcode() == ISD::SIGN_EXTEND &&
49676 N0.getOperand(0).getValueSizeInBits() > 128) &&
49677 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49678 N1.getOperand(0).getValueSizeInBits() > 128))
49679 return SDValue();
49680
49681 // Sign bits must extend down to the lowest i16.
49682 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
49683 DAG.ComputeMaxSignificantBits(N0) > 16)
49684 return SDValue();
49685
49686 // At least one of the elements must be zero in the upper 17 bits, or can be
49687 // safely made zero without altering the final result.
49688 auto GetZeroableOp = [&](SDValue Op) {
49689 APInt Mask17 = APInt::getHighBitsSet(32, 17);
49690 if (DAG.MaskedValueIsZero(Op, Mask17))
49691 return Op;
49692 // Mask off upper 16-bits of sign-extended constants.
49693 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
49694 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
49695 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
49696 SDValue Src = Op.getOperand(0);
49697 // Convert sext(vXi16) to zext(vXi16).
49698 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
49699 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49700 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
49701 // which will expand the extension.
49702 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
49703 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
49704 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
49705 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49706 }
49707 }
49708 // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
49709 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
49710 N->isOnlyUserOf(Op.getNode())) {
49711 SDValue Src = Op.getOperand(0);
49712 if (Src.getScalarValueSizeInBits() == 16)
49713 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
49714 }
49715 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
49716 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
49717 N->isOnlyUserOf(Op.getNode())) {
49718 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
49719 Op.getOperand(1));
49720 }
49721 return SDValue();
49722 };
49723 SDValue ZeroN0 = GetZeroableOp(N0);
49724 SDValue ZeroN1 = GetZeroableOp(N1);
49725 if (!ZeroN0 && !ZeroN1)
49726 return SDValue();
49727 N0 = ZeroN0 ? ZeroN0 : N0;
49728 N1 = ZeroN1 ? ZeroN1 : N1;
49729
49730 // Use SplitOpsAndApply to handle AVX splitting.
49731 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49732 ArrayRef<SDValue> Ops) {
49733 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
49734 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
49735 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
49736 DAG.getBitcast(OpVT, Ops[0]),
49737 DAG.getBitcast(OpVT, Ops[1]));
49738 };
49739 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
49740}
49741
49742 static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49743 const X86Subtarget &Subtarget) {
49744 if (!Subtarget.hasSSE2())
49745 return SDValue();
49746
49747 EVT VT = N->getValueType(0);
49748
49749 // Only support vXi64 vectors.
49750 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
49751 VT.getVectorNumElements() < 2 ||
49752 !isPowerOf2_32(VT.getVectorNumElements()))
49753 return SDValue();
49754
49755 SDValue N0 = N->getOperand(0);
49756 SDValue N1 = N->getOperand(1);
49757
49758 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
49759 // 32-bits. We can lower with this if the sign bits stretch that far.
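 // E.g. if both operands are sign extensions of vXi32 values, every element
 // has at least 33 sign bits, so the signed 32 x 32 -> 64 multiply reproduces
 // the full 64-bit product.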
49760 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
49761 DAG.ComputeNumSignBits(N1) > 32) {
49762 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49763 ArrayRef<SDValue> Ops) {
49764 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
49765 };
49766 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
49767 /*CheckBWI*/ false);
49768 }
49769
49770 // If the upper bits are zero we can use a single pmuludq.
49771 APInt Mask = APInt::getHighBitsSet(64, 32);
49772 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
49773 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49774 ArrayRef<SDValue> Ops) {
49775 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
49776 };
49777 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
49778 /*CheckBWI*/ false);
49779 }
49780
49781 return SDValue();
49782}
49783
49784 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
49785 TargetLowering::DAGCombinerInfo &DCI,
49786 const X86Subtarget &Subtarget) {
49787 EVT VT = N->getValueType(0);
49788 SDLoc DL(N);
49789
49790 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
49791 return V;
49792
49793 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
49794 return V;
49795
49796 if (DCI.isBeforeLegalize() && VT.isVector())
49797 return reduceVMULWidth(N, DL, DAG, Subtarget);
49798
49799 if (VT != MVT::i64 && VT != MVT::i32 &&
49800 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
49801 return SDValue();
49802
49803 KnownBits Known1 = DAG.computeKnownBits(N->getOperand(1));
49804 if (!Known1.isConstant())
49805 return SDValue();
49806
49807 const APInt &C = Known1.getConstant();
49808 if (C.isZero())
49809 return DAG.getConstant(0, DL, VT);
49810
49811 if (C.isAllOnes())
49812 return DAG.getNegative(N->getOperand(0), DL, VT);
49813
49814 if (isPowerOf2_64(C.getZExtValue()))
49815 return SDValue();
49816
49817 // Optimize a single multiply with constant into two operations in order to
49818 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
49819 if (!MulConstantOptimization)
49820 return SDValue();
49821
49822 // An imul is usually smaller than the alternative sequence.
49823 if (DAG.getMachineFunction().getFunction().hasMinSize())
49824 return SDValue();
49825
49826 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
49827 return SDValue();
49828
49829 int64_t SignMulAmt = C.getSExtValue();
49830 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
49831 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
49832
49833 SDValue NewMul = SDValue();
49834 if (VT == MVT::i64 || VT == MVT::i32) {
49835 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
49836 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49837 DAG.getConstant(AbsMulAmt, DL, VT));
49838 if (SignMulAmt < 0)
49839 NewMul = DAG.getNegative(NewMul, DL, VT);
49840
49841 return NewMul;
49842 }
49843
49844 uint64_t MulAmt1 = 0;
49845 uint64_t MulAmt2 = 0;
49846 if ((AbsMulAmt % 9) == 0) {
49847 MulAmt1 = 9;
49848 MulAmt2 = AbsMulAmt / 9;
49849 } else if ((AbsMulAmt % 5) == 0) {
49850 MulAmt1 = 5;
49851 MulAmt2 = AbsMulAmt / 5;
49852 } else if ((AbsMulAmt % 3) == 0) {
49853 MulAmt1 = 3;
49854 MulAmt2 = AbsMulAmt / 3;
49855 }
49856
49857 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
49858 if (MulAmt2 &&
49859 (isPowerOf2_64(MulAmt2) ||
49860 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
49861
49862 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
49863 N->user_begin()->getOpcode() == ISD::ADD))
49864 // If the second multiplier is a power of 2, issue it first. We want the multiply
49865 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
49866 // use is an add. Only do this for positive multiply amounts since the
49867 // negate would prevent it from being used as an address mode anyway.
49868 std::swap(MulAmt1, MulAmt2);
49869
49870 if (isPowerOf2_64(MulAmt1))
49871 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49872 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
49873 else
49874 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49875 DAG.getConstant(MulAmt1, DL, VT));
49876
49877 if (isPowerOf2_64(MulAmt2))
49878 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
49879 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
49880 else
49881 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
49882 DAG.getConstant(MulAmt2, DL, VT));
49883
49884 // Negate the result.
49885 if (SignMulAmt < 0)
49886 NewMul = DAG.getNegative(NewMul, DL, VT);
49887 } else if (!Subtarget.slowLEA())
49888 NewMul = combineMulSpecial(C.getZExtValue(), N, DAG, VT, DL);
49889 }
49890 if (!NewMul) {
49891 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
49892 if (isPowerOf2_64(AbsMulAmt - 1)) {
49893 // (mul x, 2^N + 1) => (add (shl x, N), x)
49894 NewMul = DAG.getNode(
49895 ISD::ADD, DL, VT, N->getOperand(0),
49896 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49897 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
49898 if (SignMulAmt < 0)
49899 NewMul = DAG.getNegative(NewMul, DL, VT);
49900 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
49901 // (mul x, 2^N - 1) => (sub (shl x, N), x)
49902 NewMul =
49903 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49904 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
49905 // To negate, reverse the operands of the subtract.
49906 if (SignMulAmt < 0)
49907 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
49908 else
49909 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
49910 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
49911 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49912 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
49913 NewMul =
49914 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49915 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
49916 NewMul = DAG.getNode(
49917 ISD::ADD, DL, VT, NewMul,
49918 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49919 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
49920 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49921 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
49922 NewMul =
49923 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49924 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
49925 NewMul = DAG.getNode(
49926 ISD::SUB, DL, VT, NewMul,
49927 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49928 } else if (SignMulAmt >= 0 && VT.isVector() &&
49929 Subtarget.fastImmVectorShift()) {
49930 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
49931 uint64_t ShiftAmt1;
49932 std::optional<unsigned> Opc;
49933 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
49934 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
49935 Opc = ISD::ADD;
49936 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
49937 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
49938 Opc = ISD::SUB;
49939 }
49940
49941 if (Opc) {
49942 SDValue Shift1 =
49943 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49944 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
49945 SDValue Shift2 =
49946 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49947 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
49948 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
49949 }
49950 }
49951 }
49952
49953 return NewMul;
49954}
49955
49956// Try to form a MULHU or MULHS node by looking for
49957// (srl (mul ext, ext), 16)
49958// TODO: This is X86 specific because we want to be able to handle wide types
49959// before type legalization. But we can only do it if the vector will be
49960// legalized via widening/splitting. Type legalization can't handle promotion
49961// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49962// combiner.
49963 static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
49964 const SDLoc &DL,
49965 const X86Subtarget &Subtarget) {
49966 using namespace SDPatternMatch;
49967 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
49968 "SRL or SRA node is required here!");
49969
49970 if (!Subtarget.hasSSE2())
49971 return SDValue();
49972
49973 // Input type should be at least vXi32.
49974 EVT VT = N->getValueType(0);
49975 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
49976 return SDValue();
49977
49978 // The operation must be a multiply shifted right by 16.
49979 SDValue LHS, RHS;
49980 if (!sd_match(N->getOperand(1), m_SpecificInt(16)) ||
49981 !sd_match(N->getOperand(0), m_OneUse(m_Mul(m_Value(LHS), m_Value(RHS)))))
49982 return SDValue();
49983
49984 unsigned ExtOpc = LHS.getOpcode();
49985 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
49986 RHS.getOpcode() != ExtOpc)
49987 return SDValue();
49988
49989 // Peek through the extends.
49990 LHS = LHS.getOperand(0);
49991 RHS = RHS.getOperand(0);
49992
49993 // Ensure the input types match.
49994 EVT MulVT = LHS.getValueType();
49995 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
49996 return SDValue();
49997
49998 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
49999 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
50000
50001 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
50002 return DAG.getNode(ExtOpc, DL, VT, Mulh);
50003}
50004
50005 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
50006 const X86Subtarget &Subtarget) {
50007 using namespace llvm::SDPatternMatch;
50008 SDValue N0 = N->getOperand(0);
50009 SDValue N1 = N->getOperand(1);
50010 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
50011 EVT VT = N0.getValueType();
50012 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50013 SDLoc DL(N);
50014
50015 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50016 // with out-of-bounds clamping.
50017 if (N0.getOpcode() == ISD::VSELECT &&
50018 supportedVectorVarShift(VT, Subtarget, ISD::SHL)) {
50019 SDValue Cond = N0.getOperand(0);
50020 SDValue N00 = N0.getOperand(1);
50021 SDValue N01 = N0.getOperand(2);
50022 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
50023 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
50024 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50025 m_SpecificCondCode(ISD::SETULT)))) {
50026 return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
50027 }
50028 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
50029 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
50030 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50031 m_SpecificCondCode(ISD::SETUGE)))) {
50032 return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1);
50033 }
50034 }
50035
50036 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
50037 // since the result of setcc_c is all zero's or all ones.
50038 if (VT.isInteger() && !VT.isVector() &&
50039 N1C && N0.getOpcode() == ISD::AND &&
50040 N0.getOperand(1).getOpcode() == ISD::Constant) {
50041 SDValue N00 = N0.getOperand(0);
50042 APInt Mask = N0.getConstantOperandAPInt(1);
50043 Mask <<= N1C->getAPIntValue();
50044 bool MaskOK = false;
50045 // We can handle cases concerning bit-widening nodes containing setcc_c if
50046 // we carefully interrogate the mask to make sure the transform is
50047 // semantics-preserving.
50048 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
50049 // of the underlying setcc_c operation if the setcc_c was zero extended.
50050 // Consider the following example:
50051 // zext(setcc_c) -> i32 0x0000FFFF
50052 // c1 -> i32 0x0000FFFF
50053 // c2 -> i32 0x00000001
50054 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
50055 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
50056 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
50057 MaskOK = true;
50058 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
50059 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50060 MaskOK = true;
50061 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
50062 N00.getOpcode() == ISD::ANY_EXTEND) &&
50063 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50064 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
50065 }
50066 if (MaskOK && Mask != 0)
50067 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
50068 }
50069
50070 return SDValue();
50071}
50072
50073 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
50074 const X86Subtarget &Subtarget) {
50075 using namespace llvm::SDPatternMatch;
50076 SDValue N0 = N->getOperand(0);
50077 SDValue N1 = N->getOperand(1);
50078 EVT VT = N0.getValueType();
50079 unsigned Size = VT.getSizeInBits();
50080 SDLoc DL(N);
50081
50082 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50083 return V;
50084
50085 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
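 // VPSRAV already clamps: shift amounts >= the element width fill the lane
 // with the sign bit, which is the same result as shifting by bw-1, so the
 // explicit umin is redundant.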
50086 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA)) {
50087 SDValue ShrAmtVal;
50088 if (sd_match(N1, m_UMin(m_Value(ShrAmtVal),
50089 m_SpecificInt(VT.getScalarSizeInBits() - 1))))
50090 return DAG.getNode(X86ISD::VSRAV, DL, VT, N0, ShrAmtVal);
50091 }
50092
50093 // fold (SRA (SHL X, ShlConst), SraConst)
50094 // into (SHL (sext_in_reg X), ShlConst - SraConst)
50095 // or (sext_in_reg X)
50096 // or (SRA (sext_in_reg X), SraConst - ShlConst)
50097 // depending on relation between SraConst and ShlConst.
50098 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
50099 // us to do the sext_in_reg from corresponding bit.
50100
50101 // sexts on X86 are MOVSX instructions. They have the same code size as the
50102 // SHIFTs above (only a shift by 1 has smaller code size).
50103 // However, the MOVs have two advantages over a SHIFT:
50104 // 1. MOVs can write to a register that differs from the source.
50105 // 2. MOVs accept memory operands.
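 // E.g. (i32 (sra (shl X, 24), 26)) becomes (sra (sext_in_reg X, i8), 2):
 // Size - ShlConst == 8, and SraConst > ShlConst leaves a residual right
 // shift of SraConst - ShlConst.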
50106
50107 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
50108 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
50109 N0.getOperand(1).getOpcode() != ISD::Constant)
50110 return SDValue();
50111
50112 SDValue N00 = N0.getOperand(0);
50113 SDValue N01 = N0.getOperand(1);
50114 APInt ShlConst = N01->getAsAPIntVal();
50115 APInt SraConst = N1->getAsAPIntVal();
50116 EVT CVT = N1.getValueType();
50117
50118 if (CVT != N01.getValueType())
50119 return SDValue();
50120 if (SraConst.isNegative())
50121 return SDValue();
50122
50123 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
50124 unsigned ShiftSize = SVT.getSizeInBits();
50125 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
50126 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
50127 continue;
50128 SDValue NN =
50129 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
50130 if (SraConst.eq(ShlConst))
50131 return NN;
50132 if (SraConst.ult(ShlConst))
50133 return DAG.getNode(ISD::SHL, DL, VT, NN,
50134 DAG.getConstant(ShlConst - SraConst, DL, CVT));
50135 return DAG.getNode(ISD::SRA, DL, VT, NN,
50136 DAG.getConstant(SraConst - ShlConst, DL, CVT));
50137 }
50138 return SDValue();
50139}
50140
50141 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
50142 TargetLowering::DAGCombinerInfo &DCI,
50143 const X86Subtarget &Subtarget) {
50144 using namespace llvm::SDPatternMatch;
50145 SDValue N0 = N->getOperand(0);
50146 SDValue N1 = N->getOperand(1);
50147 EVT VT = N0.getValueType();
50148 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50149 SDLoc DL(N);
50150
50151 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50152 return V;
50153
50154 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50155 // with out-of-bounds clamping.
50156 if (N0.getOpcode() == ISD::VSELECT &&
50157 supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
50158 SDValue Cond = N0.getOperand(0);
50159 SDValue N00 = N0.getOperand(1);
50160 SDValue N01 = N0.getOperand(2);
50161 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
50162 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
50163 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50164 m_SpecificCondCode(ISD::SETULT)))) {
50165 return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
50166 }
50167 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
50168 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
50169 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50170 m_SpecificCondCode(ISD::SETUGE)))) {
50171 return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1);
50172 }
50173 }
50174
50175 // Only do this on the last DAG combine as it can interfere with other
50176 // combines.
50177 if (!DCI.isAfterLegalizeDAG())
50178 return SDValue();
50179
50180 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
50181 // TODO: This is a generic DAG combine that became an x86-only combine to
50182 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
50183 // and-not ('andn').
50184 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
50185 return SDValue();
50186
50187 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
50188 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
50189 if (!ShiftC || !AndC)
50190 return SDValue();
50191
50192 // If we can shrink the constant mask below 8-bits or 32-bits, then this
50193 // transform should reduce code size. It may also enable secondary transforms
50194 // from improved known-bits analysis or instruction selection.
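 // E.g. (srl (and X, 0x3FC0), 6) becomes (and (srl X, 6), 0xFF), where the
 // new 8-bit mask can be matched as a simple zero-extend of the low byte.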
50195 APInt MaskVal = AndC->getAPIntValue();
50196
50197 // If this can be matched by a zero extend, don't optimize.
50198 if (MaskVal.isMask()) {
50199 unsigned TO = MaskVal.countr_one();
50200 if (TO >= 8 && isPowerOf2_32(TO))
50201 return SDValue();
50202 }
50203
50204 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
50205 unsigned OldMaskSize = MaskVal.getSignificantBits();
50206 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
50207 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
50208 (OldMaskSize > 32 && NewMaskSize <= 32)) {
50209 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
50210 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
50211 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
50212 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
50213 }
50214 return SDValue();
50215}
50216
50217 static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
50218 const X86Subtarget &Subtarget) {
50219 unsigned Opcode = N->getOpcode();
50220 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
50221
50222 SDLoc DL(N);
50223 EVT VT = N->getValueType(0);
50224 SDValue N0 = N->getOperand(0);
50225 SDValue N1 = N->getOperand(1);
50226 EVT SrcVT = N0.getValueType();
50227
50228 SDValue BC0 =
50229 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
50230 SDValue BC1 =
50231 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
50232
50233 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
50234 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
50235 // truncation trees that help us avoid lane crossing shuffles.
50236 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
50237 // TODO: We don't handle vXf64 shuffles yet.
50238 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50239 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
50240 SmallVector<SDValue> ShuffleOps;
50241 SmallVector<int> ShuffleMask, ScaledMask;
50242 SDValue Vec = peekThroughBitcasts(BCSrc);
50243 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
50245 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
50246 // shuffle to a v4X64 width - we can probably relax this in the future.
50247 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
50248 ShuffleOps[0].getValueType().is256BitVector() &&
50249 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
50250 SDValue Lo, Hi;
50251 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50252 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
50253 Lo = DAG.getBitcast(SrcVT, Lo);
50254 Hi = DAG.getBitcast(SrcVT, Hi);
50255 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
50256 Res = DAG.getBitcast(ShufVT, Res);
50257 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
50258 return DAG.getBitcast(VT, Res);
50259 }
50260 }
50261 }
50262 }
50263
50264 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
50265 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50266 // If either/both ops are a shuffle that can scale to v2x64,
50267 // then see if we can perform this as a v4x32 post shuffle.
50268 SmallVector<SDValue> Ops0, Ops1;
50269 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
50270 bool IsShuf0 =
50271 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50272 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50273 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50274 bool IsShuf1 =
50275 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50276 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
50277 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50278 if (IsShuf0 || IsShuf1) {
50279 if (!IsShuf0) {
50280 Ops0.assign({BC0});
50281 ScaledMask0.assign({0, 1});
50282 }
50283 if (!IsShuf1) {
50284 Ops1.assign({BC1});
50285 ScaledMask1.assign({0, 1});
50286 }
50287
50288 SDValue LHS, RHS;
50289 int PostShuffle[4] = {-1, -1, -1, -1};
50290 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
50291 if (M < 0)
50292 return true;
50293 Idx = M % 2;
50294 SDValue Src = Ops[M / 2];
50295 if (!LHS || LHS == Src) {
50296 LHS = Src;
50297 return true;
50298 }
50299 if (!RHS || RHS == Src) {
50300 Idx += 2;
50301 RHS = Src;
50302 return true;
50303 }
50304 return false;
50305 };
50306 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
50307 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
50308 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
50309 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
50310 LHS = DAG.getBitcast(SrcVT, LHS);
50311 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
50312 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50313 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
50314 Res = DAG.getBitcast(ShufVT, Res);
50315 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
50316 return DAG.getBitcast(VT, Res);
50317 }
50318 }
50319 }
50320
50321 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
50322 if (VT.is256BitVector() && Subtarget.hasInt256()) {
50323 SmallVector<int> Mask0, Mask1;
50324 SmallVector<SDValue> Ops0, Ops1;
50325 SmallVector<int, 2> ScaledMask0, ScaledMask1;
50326 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50327 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50328 !Ops0.empty() && !Ops1.empty() &&
50329 all_of(Ops0,
50330 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50331 all_of(Ops1,
50332 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50333 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50334 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
50335 SDValue Op00 = peekThroughBitcasts(Ops0.front());
50336 SDValue Op10 = peekThroughBitcasts(Ops1.front());
50337 SDValue Op01 = peekThroughBitcasts(Ops0.back());
50338 SDValue Op11 = peekThroughBitcasts(Ops1.back());
50339 if ((Op00 == Op11) && (Op01 == Op10)) {
50340 std::swap(Op10, Op11);
50341 ShuffleVectorSDNode::commuteMask(ScaledMask1);
50342 }
50343 if ((Op00 == Op10) && (Op01 == Op11)) {
50344 const int Map[4] = {0, 2, 1, 3};
50345 SmallVector<int, 4> ShuffleMask(
50346 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
50347 Map[ScaledMask1[1]]});
50348 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
50349 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
50350 DAG.getBitcast(SrcVT, Op01));
50351 Res = DAG.getBitcast(ShufVT, Res);
50352 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
50353 return DAG.getBitcast(VT, Res);
50354 }
50355 }
50356 }
50357
50358 return SDValue();
50359}
50360
50361 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
50362 TargetLowering::DAGCombinerInfo &DCI,
50363 const X86Subtarget &Subtarget) {
50364 unsigned Opcode = N->getOpcode();
50365 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
50366 "Unexpected pack opcode");
50367
50368 EVT VT = N->getValueType(0);
50369 SDValue N0 = N->getOperand(0);
50370 SDValue N1 = N->getOperand(1);
50371 unsigned NumDstElts = VT.getVectorNumElements();
50372 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
50373 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
50374 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
50375 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
50376 "Unexpected PACKSS/PACKUS input type");
50377
50378 bool IsSigned = (X86ISD::PACKSS == Opcode);
50379
50380 // Constant Folding.
50381 APInt UndefElts0, UndefElts1;
50382 SmallVector<APInt, 32> EltBits0, EltBits1;
50383 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
50384 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
50385 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
50386 /*AllowWholeUndefs*/ true,
50387 /*AllowPartialUndefs*/ true) &&
50388 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
50389 /*AllowWholeUndefs*/ true,
50390 /*AllowPartialUndefs*/ true)) {
50391 unsigned NumLanes = VT.getSizeInBits() / 128;
50392 unsigned NumSrcElts = NumDstElts / 2;
50393 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
50394 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
50395
50396 APInt Undefs(NumDstElts, 0);
50397 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
50398 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
50399 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
50400 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
50401 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
50402 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
50403
50404 if (UndefElts[SrcIdx]) {
50405 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
50406 continue;
50407 }
50408
50409 APInt &Val = EltBits[SrcIdx];
50410 if (IsSigned) {
50411 // PACKSS: Truncate signed value with signed saturation.
50412 // Source values less than dst minint are saturated to minint.
50413 // Source values greater than dst maxint are saturated to maxint.
50414 Val = Val.truncSSat(DstBitsPerElt);
50415 } else {
50416 // PACKUS: Truncate signed value with unsigned saturation.
50417 // Source values less than zero are saturated to zero.
50418 // Source values greater than dst maxuint are saturated to maxuint.
50419 // NOTE: This is different from APInt::truncUSat.
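 // (truncUSat would treat a negative value as a huge unsigned value and
 // saturate it to all-ones instead of zero.)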
50420 if (Val.isIntN(DstBitsPerElt))
50421 Val = Val.trunc(DstBitsPerElt);
50422 else if (Val.isNegative())
50423 Val = APInt::getZero(DstBitsPerElt);
50424 else
50425 Val = APInt::getAllOnes(DstBitsPerElt);
50426 }
50427 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
50428 }
50429 }
50430
50431 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
50432 }
50433
50434 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
50435 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50436 return V;
50437
50438 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
50439 // Currently limit this to allsignbits cases only.
50440 if (IsSigned &&
50441 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
50442 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
50443 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
50444 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
50445 if (Not0 && Not1) {
50446 SDLoc DL(N);
50447 MVT SrcVT = N0.getSimpleValueType();
50448 SDValue Pack =
50449 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
50450 DAG.getBitcast(SrcVT, Not1));
50451 return DAG.getNOT(DL, Pack, VT);
50452 }
50453 }
50454
50455 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
50456 // truncate to create a larger truncate.
50457 if (Subtarget.hasAVX512() &&
50458 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
50459 N0.getOperand(0).getValueType() == MVT::v8i32) {
50460 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
50461 (!IsSigned &&
50462 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
50463 if (Subtarget.hasVLX())
50464 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
50465
50466 // Widen input to v16i32 so we can truncate that.
50467 SDLoc dl(N);
50468 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
50469 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
50470 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
50471 }
50472 }
50473
50474 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
50475 if (VT.is128BitVector()) {
50476 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
50477 SDValue Src0, Src1;
50478 if (N0.getOpcode() == ExtOpc &&
50479 N0.getOperand(0).getValueType().is64BitVector() &&
50480 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50481 Src0 = N0.getOperand(0);
50482 }
50483 if (N1.getOpcode() == ExtOpc &&
50484 N1.getOperand(0).getValueType().is64BitVector() &&
50485 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50486 Src1 = N1.getOperand(0);
50487 }
50488 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
50489 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
50490 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
50491 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
50492 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
50493 }
50494
50495 // Try again with pack(*_extend_vector_inreg, undef).
50496 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
50497 : ISD::ZERO_EXTEND_VECTOR_INREG;
50498 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
50499 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
50500 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
50501 DAG);
50502 }
50503
50504 // Attempt to combine as shuffle.
50505 SDValue Op(N, 0);
50506 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50507 return Res;
50508
50509 return SDValue();
50510}
50511
50512 static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
50513 TargetLowering::DAGCombinerInfo &DCI,
50514 const X86Subtarget &Subtarget) {
50515 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
50516 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
50517 "Unexpected horizontal add/sub opcode");
50518
50519 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
50520 MVT VT = N->getSimpleValueType(0);
50521 SDValue LHS = N->getOperand(0);
50522 SDValue RHS = N->getOperand(1);
50523
50524 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
50525 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
50526 LHS.getOpcode() == RHS.getOpcode() &&
50527 LHS.getValueType() == RHS.getValueType() &&
50528 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
50529 SDValue LHS0 = LHS.getOperand(0);
50530 SDValue LHS1 = LHS.getOperand(1);
50531 SDValue RHS0 = RHS.getOperand(0);
50532 SDValue RHS1 = RHS.getOperand(1);
50533 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
50534 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
50535 SDLoc DL(N);
50536 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
50537 LHS0.isUndef() ? LHS1 : LHS0,
50538 RHS0.isUndef() ? RHS1 : RHS0);
50539 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
50540 Res = DAG.getBitcast(ShufVT, Res);
50541 SDValue NewLHS =
50542 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50543 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
50544 SDValue NewRHS =
50545 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50546 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
50547 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
50548 DAG.getBitcast(VT, NewRHS));
50549 }
50550 }
50551 }
50552
50553 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
50554 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50555 return V;
50556
50557 return SDValue();
50558}
50559
50560 static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
50561 TargetLowering::DAGCombinerInfo &DCI,
50562 const X86Subtarget &Subtarget) {
50563 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
50564 X86ISD::VSRL == N->getOpcode()) &&
50565 "Unexpected shift opcode");
50566 EVT VT = N->getValueType(0);
50567 SDValue N0 = N->getOperand(0);
50568 SDValue N1 = N->getOperand(1);
50569
50570 // Shift zero -> zero.
50571 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50572 return DAG.getConstant(0, SDLoc(N), VT);
50573
50574 // Detect constant shift amounts.
50575 APInt UndefElts;
50576 SmallVector<APInt, 32> EltBits;
50577 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
50578 /*AllowWholeUndefs*/ true,
50579 /*AllowPartialUndefs*/ false)) {
50580 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
50581 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
50582 EltBits[0].getZExtValue(), DAG);
50583 }
50584
50585 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50586 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
50587 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
50588 return SDValue(N, 0);
50589
50590 return SDValue();
50591}
50592
50593 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
50594 TargetLowering::DAGCombinerInfo &DCI,
50595 const X86Subtarget &Subtarget) {
50596 unsigned Opcode = N->getOpcode();
50597 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
50598 X86ISD::VSRLI == Opcode) &&
50599 "Unexpected shift opcode");
50600 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
50601 EVT VT = N->getValueType(0);
50602 SDValue N0 = N->getOperand(0);
50603 SDValue N1 = N->getOperand(1);
50604 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50605 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
50606 "Unexpected value type");
50607 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
50608
50609 // (shift undef, X) -> 0
50610 if (N0.isUndef())
50611 return DAG.getConstant(0, SDLoc(N), VT);
50612
50613 // Out of range logical bit shifts are guaranteed to be zero.
50614 // Out of range arithmetic bit shifts splat the sign bit.
50615 unsigned ShiftVal = N->getConstantOperandVal(1);
50616 if (ShiftVal >= NumBitsPerElt) {
50617 if (LogicalShift)
50618 return DAG.getConstant(0, SDLoc(N), VT);
50619 ShiftVal = NumBitsPerElt - 1;
50620 }
50621
50622 // (shift X, 0) -> X
50623 if (!ShiftVal)
50624 return N0;
50625
50626 // (shift 0, C) -> 0
50627 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50628 // N0 is all zeros or undef. We guarantee that the bits shifted into the
50629 // result are all zeros, not undef.
50630 return DAG.getConstant(0, SDLoc(N), VT);
50631
50632 // (VSRAI -1, C) -> -1
50633 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
50634 // N0 is all ones or undef. We guarantee that the bits shifted into the
50635 // result are all ones, not undef.
50636 return DAG.getAllOnesConstant(SDLoc(N), VT);
50637
50638 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
50639 unsigned NewShiftVal = Amt0 + Amt1;
50640 if (NewShiftVal >= NumBitsPerElt) {
50641 // Out of range logical bit shifts are guaranteed to be zero.
50642 // Out of range arithmetic bit shifts splat the sign bit.
50643 if (LogicalShift)
50644 return DAG.getConstant(0, SDLoc(N), VT);
50645 NewShiftVal = NumBitsPerElt - 1;
50646 }
50647 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
50648 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
50649 };
50650
50651 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
50652 if (Opcode == N0.getOpcode())
50653 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
50654
50655 // (shl (add X, X), C) -> (shl X, (C + 1))
50656 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
50657 N0.getOperand(0) == N0.getOperand(1))
50658 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
50659
50660 // We can decode 'whole byte' logical bit shifts as shuffles.
50661 if (LogicalShift && (ShiftVal % 8) == 0) {
50662 SDValue Op(N, 0);
50663 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50664 return Res;
50665 }
50666
50667 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
50668 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
50669 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
50670 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
50671 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
50672 N0.getOpcode() == X86ISD::PSHUFD &&
50673 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
50674 N0->hasOneUse()) {
50675 SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
50676 if (BC.getOpcode() == X86ISD::VSHLI &&
50677 BC.getScalarValueSizeInBits() == 64 &&
50678 BC.getConstantOperandVal(1) == 63) {
50679 SDLoc DL(N);
50680 SDValue Src = BC.getOperand(0);
50681 Src = DAG.getBitcast(VT, Src);
50682 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
50683 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
50684 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
50685 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
50686 return Src;
50687 }
50688 }
50689
50690 auto TryConstantFold = [&](SDValue V) {
50691 APInt UndefElts;
50692 SmallVector<APInt, 32> EltBits;
50693 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
50694 /*AllowWholeUndefs*/ true,
50695 /*AllowPartialUndefs*/ true))
50696 return SDValue();
50697 assert(EltBits.size() == VT.getVectorNumElements() &&
50698 "Unexpected shift value type");
50699 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
50700 // created an undef input due to no input bits being demanded, but user
50701 // still expects 0 in other bits.
50702 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
50703 APInt &Elt = EltBits[i];
50704 if (UndefElts[i])
50705 Elt = 0;
50706 else if (X86ISD::VSHLI == Opcode)
50707 Elt <<= ShiftVal;
50708 else if (X86ISD::VSRAI == Opcode)
50709 Elt.ashrInPlace(ShiftVal);
50710 else
50711 Elt.lshrInPlace(ShiftVal);
50712 }
50713 // Reset undef elements since they were zeroed above.
50714 UndefElts = 0;
50715 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
50716 };
50717
50718 // Constant Folding.
50719 if (N->isOnlyUserOf(N0.getNode())) {
50720 if (SDValue C = TryConstantFold(N0))
50721 return C;
50722
50723 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
50724 // Don't break NOT patterns.
50725 SDValue BC = peekThroughOneUseBitcasts(N0);
50726 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
50727 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
50728 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
50729 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
50730 SDLoc DL(N);
50731 SDValue LHS = DAG.getNode(Opcode, DL, VT,
50732 DAG.getBitcast(VT, BC.getOperand(0)), N1);
50733 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
50734 }
50735 }
50736 }
50737
50738 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50739 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
50740 DCI))
50741 return SDValue(N, 0);
50742
50743 return SDValue();
50744}
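// Illustrative model (hypothetical helper, not part of this file): the scalar
// semantics behind the MergeShifts rule above for a single 8-bit lane. Two
// stacked arithmetic shifts by Amt0 and Amt1 behave like one shift whose
// amount is clamped to NumBitsPerElt - 1, while an out-of-range logical shift
// would simply produce zero.
static int mergedVSRAILaneModel(signed char X, unsigned Amt0, unsigned Amt1) {
  unsigned Merged = Amt0 + Amt1;
  if (Merged >= 8)
    Merged = 7; // out-of-range arithmetic shifts splat the sign bit
  return X >> Merged; // X promotes to int, giving an arithmetic shift
}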
50745
50746 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
50747 TargetLowering::DAGCombinerInfo &DCI,
50748 const X86Subtarget &Subtarget) {
50749 EVT VT = N->getValueType(0);
50750 unsigned Opcode = N->getOpcode();
50751 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
50752 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
50753 Opcode == ISD::INSERT_VECTOR_ELT) &&
50754 "Unexpected vector insertion");
50755
50756 SDValue Vec = N->getOperand(0);
50757 SDValue Scl = N->getOperand(1);
50758 SDValue Idx = N->getOperand(2);
50759
50760 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
50761 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
50762 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
50763
50764 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
50765 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50766 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50767 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
50768 APInt::getAllOnes(NumBitsPerElt), DCI))
50769 return SDValue(N, 0);
50770 }
50771
50772 // Attempt to combine insertion patterns to a shuffle.
50773 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
50774 SDValue Op(N, 0);
50775 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50776 return Res;
50777 }
50778
50779 return SDValue();
50780}
50781
50782/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
50783/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
50784/// OR -> CMPNEQSS.
50785 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
50786 TargetLowering::DAGCombinerInfo &DCI,
50787 const X86Subtarget &Subtarget) {
50788 unsigned opcode;
50789
50790 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
50791 // we're requiring SSE2 for both.
50792 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
50793 SDValue N0 = N->getOperand(0);
50794 SDValue N1 = N->getOperand(1);
50795 SDValue CMP0 = N0.getOperand(1);
50796 SDValue CMP1 = N1.getOperand(1);
50797 SDLoc DL(N);
50798
50799 // The SETCCs should both refer to the same CMP.
50800 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
50801 return SDValue();
50802
50803 SDValue CMP00 = CMP0->getOperand(0);
50804 SDValue CMP01 = CMP0->getOperand(1);
50805 EVT VT = CMP00.getValueType();
50806
50807 if (VT == MVT::f32 || VT == MVT::f64 ||
50808 (VT == MVT::f16 && Subtarget.hasFP16())) {
50809 bool ExpectingFlags = false;
50810 // Check for any users that want flags:
50811 for (const SDNode *U : N->users()) {
50812 if (ExpectingFlags)
50813 break;
50814
50815 switch (U->getOpcode()) {
50816 default:
50817 case ISD::BR_CC:
50818 case ISD::BRCOND:
50819 case ISD::SELECT:
50820 ExpectingFlags = true;
50821 break;
50822 case ISD::CopyToReg:
50823 case ISD::SIGN_EXTEND:
50824 case ISD::ZERO_EXTEND:
50825 case ISD::ANY_EXTEND:
50826 break;
50827 }
50828 }
50829
50830 if (!ExpectingFlags) {
50831 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
50832 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
50833
50834 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
50835 X86::CondCode tmp = cc0;
50836 cc0 = cc1;
50837 cc1 = tmp;
50838 }
50839
50840 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
50841 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
50842 // FIXME: need symbolic constants for these magic numbers.
50843 // See X86ATTInstPrinter.cpp:printSSECC().
50844 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
50845 if (Subtarget.hasAVX512()) {
50846 SDValue FSetCC =
50847 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
50848 DAG.getTargetConstant(x86cc, DL, MVT::i8));
50849 // Need to fill with zeros to ensure the bitcast will produce zeroes
50850 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
50851 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
50852 DAG.getConstant(0, DL, MVT::v16i1),
50853 FSetCC, DAG.getVectorIdxConstant(0, DL));
50854 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
50855 N->getSimpleValueType(0));
50856 }
50857 SDValue OnesOrZeroesF =
50858 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
50859 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
50860
50861 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
50862 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
50863
50864 if (is64BitFP && !Subtarget.is64Bit()) {
50865 // On a 32-bit target, we cannot bitcast the 64-bit float to a
50866 // 64-bit integer, since that's not a legal type. Since
50867 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
50868 // bits, but can do this little dance to extract the lowest 32 bits
50869 // and work with those going forward.
50870 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL,
50871 MVT::v2f64, OnesOrZeroesF);
50872 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
50873 OnesOrZeroesF =
50874 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32,
50875 DAG.getVectorIdxConstant(0, DL));
50876 IntVT = MVT::i32;
50877 }
50878
50879 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
50880 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
50881 DAG.getConstant(1, DL, IntVT));
50882 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
50883 ANDed);
50884 return OneBitOfTruth;
50885 }
50886 }
50887 }
50888 }
50889 return SDValue();
50890}
50891
50892/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
50893 static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL,
50894 SelectionDAG &DAG) {
50895 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50896
50897 MVT VT = N->getSimpleValueType(0);
50898 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
50899 return SDValue();
50900
50901 SDValue X, Y;
50902 SDValue N0 = N->getOperand(0);
50903 SDValue N1 = N->getOperand(1);
50904
50905 if (SDValue Not = IsNOT(N0, DAG)) {
50906 X = Not;
50907 Y = N1;
50908 } else if (SDValue Not = IsNOT(N1, DAG)) {
50909 X = Not;
50910 Y = N0;
50911 } else
50912 return SDValue();
50913
50914 X = DAG.getBitcast(VT, X);
50915 Y = DAG.getBitcast(VT, Y);
50916 return DAG.getNode(X86ISD::ANDNP, DL, VT, X, Y);
50917}
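// Bitwise model of the ANDNP fold above (hypothetical helper, not part of this
// file): (and (xor X, -1), Y) computes ~X & Y, which is exactly what a single
// PANDN/ANDNP instruction produces.
static unsigned andnpModel(unsigned X, unsigned Y) {
  return ~X & Y; // == (X ^ 0xFFFFFFFFu) & Y
}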
50918
50919/// Try to fold:
50920/// and (vector_shuffle<Z,...,Z>
50921/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
50922/// ->
50923/// andnp (vector_shuffle<Z,...,Z>
50924/// (insert_vector_elt undef, X, Z), undef), Y
50925 static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
50926 const X86Subtarget &Subtarget) {
50927 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50928
50929 EVT VT = N->getValueType(0);
50930 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite the
50931 // original value and require extra moves.
50932 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50933 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
50934 return SDValue();
50935
50936 auto GetNot = [&DAG](SDValue V) {
50937 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
50938 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
50939 // end-users are ISD::AND including cases
50940 // (and(extract_vector_element(SVN), Y)).
50941 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
50942 !SVN->getOperand(1).isUndef()) {
50943 return SDValue();
50944 }
50945 SDValue IVEN = SVN->getOperand(0);
50946 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
50947 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
50948 return SDValue();
50949 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
50950 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
50951 return SDValue();
50952 SDValue Src = IVEN.getOperand(1);
50953 if (SDValue Not = IsNOT(Src, DAG)) {
50954 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
50955 SDValue NotIVEN =
50956 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
50957 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
50958 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
50959 SVN->getOperand(1), SVN->getMask());
50960 }
50961 return SDValue();
50962 };
50963
50964 SDValue X, Y;
50965 SDValue N0 = N->getOperand(0);
50966 SDValue N1 = N->getOperand(1);
50967 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50968
50969 if (SDValue Not = GetNot(N0)) {
50970 X = Not;
50971 Y = N1;
50972 } else if (SDValue Not = GetNot(N1)) {
50973 X = Not;
50974 Y = N0;
50975 } else
50976 return SDValue();
50977
50978 X = DAG.getBitcast(VT, X);
50979 Y = DAG.getBitcast(VT, Y);
50980 SDLoc DL(N);
50981
50982 // We do not split for SSE at all, but we need to split vectors for AVX1 and
50983 // AVX2.
50984 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
50985 TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
50986 SDValue LoX, HiX;
50987 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
50988 SDValue LoY, HiY;
50989 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
50990 EVT SplitVT = LoX.getValueType();
50991 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
50992 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
50993 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
50994 }
50995
50996 if (TLI.isTypeLegal(VT))
50997 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
50998
50999 return SDValue();
51000}
51001
51002// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
51003// logical operations, like in the example below.
51004// or (and (truncate x, truncate y)),
51005// (xor (truncate z, build_vector (constants)))
51006// Given a target type \p VT, we generate
51007// or (and x, y), (xor z, zext(build_vector (constants)))
51008 // given x, y and z are of type \p VT. We can do so if each operand is either a
51009 // truncate from a VT type, a vector of constants (for the second operand), a
51010 // value that can be recursively promoted, or an existing extension we can extend further.
51011 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
51012 SelectionDAG &DAG,
51013 const X86Subtarget &Subtarget,
51014 unsigned Depth) {
51015 // Limit recursion to avoid excessive compile times.
51016 if (Depth >= SelectionDAG::MaxRecursionDepth)
51017 return SDValue();
51018
51019 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
51020 return SDValue();
51021
51022 SDValue N0 = N.getOperand(0);
51023 SDValue N1 = N.getOperand(1);
51024
51025 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51026 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
51027 return SDValue();
51028
51029 if (SDValue NN0 =
51030 PromoteMaskArithmetic(N0, DL, VT, DAG, Subtarget, Depth + 1))
51031 N0 = NN0;
51032 else {
51033 // The left side has to be a 'trunc'.
51034 bool LHSTrunc = N0.getOpcode() == ISD::TRUNCATE &&
51035 N0.getOperand(0).getValueType() == VT;
51036 if (LHSTrunc)
51037 N0 = N0.getOperand(0);
51038 else
51039 return SDValue();
51040 }
51041
51042 if (SDValue NN1 =
51043 PromoteMaskArithmetic(N1, DL, VT, DAG, Subtarget, Depth + 1))
51044 N1 = NN1;
51045 else {
51046 // The right side has to be a 'trunc', a (foldable) constant or an
51047 // existing extension we can extend further.
51048 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
51049 N1.getOperand(0).getValueType() == VT;
51050 if (RHSTrunc)
51051 N1 = N1.getOperand(0);
51052 else if (ISD::isExtVecInRegOpcode(N1.getOpcode()) && VT.is256BitVector() &&
51053 Subtarget.hasInt256() && N1.hasOneUse())
51054 N1 = DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0));
51055 else if (SDValue Cst =
51056 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
51057 N1 = Cst;
51058 else
51059 return SDValue();
51060 }
51061
51062 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
51063}
51064
51065// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
51066// register. In most cases we actually compare or select YMM-sized registers
51067// and mixing the two types creates horrible code. This method optimizes
51068// some of the transition sequences.
51069// Even with AVX-512 this is still useful for removing casts around logical
51070// operations on vXi1 mask types.
51071 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
51072 SelectionDAG &DAG,
51073 const X86Subtarget &Subtarget) {
51074 EVT VT = N.getValueType();
51075 assert(VT.isVector() && "Expected vector type");
51076 assert((N.getOpcode() == ISD::ANY_EXTEND ||
51077 N.getOpcode() == ISD::ZERO_EXTEND ||
51078 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
51079
51080 SDValue Narrow = N.getOperand(0);
51081 EVT NarrowVT = Narrow.getValueType();
51082
51083 // Generate the wide operation.
51084 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, 0);
51085 if (!Op)
51086 return SDValue();
51087 switch (N.getOpcode()) {
51088 default: llvm_unreachable("Unexpected opcode");
51089 case ISD::ANY_EXTEND:
51090 return Op;
51091 case ISD::ZERO_EXTEND:
51092 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
51093 case ISD::SIGN_EXTEND:
51094 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
51095 Op, DAG.getValueType(NarrowVT));
51096 }
51097}
51098
51099static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
51100 unsigned FPOpcode;
51101 switch (Opcode) {
51102 // clang-format off
51103 default: llvm_unreachable("Unexpected input node for FP logic conversion");
51104 case ISD::AND: FPOpcode = X86ISD::FAND; break;
51105 case ISD::OR: FPOpcode = X86ISD::FOR; break;
51106 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
51107 // clang-format on
51108 }
51109 return FPOpcode;
51110}
51111
51112/// If both input operands of a logic op are being cast from floating-point
51113/// types or FP compares, try to convert this into a floating-point logic node
51114/// to avoid unnecessary moves from SSE to integer registers.
51115static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT,
51116 SDValue N0, SDValue N1,
51117 SelectionDAG &DAG,
51118 TargetLowering::DAGCombinerInfo &DCI,
51119 const X86Subtarget &Subtarget) {
51120 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51121 "Unexpected bit opcode");
51122
51123 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
51124 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
51125 return SDValue();
51126
51127 SDValue N00 = N0.getOperand(0);
51128 SDValue N10 = N1.getOperand(0);
51129 EVT N00Type = N00.getValueType();
51130 EVT N10Type = N10.getValueType();
51131
51132 // Ensure that both types are the same and are legal scalar fp types.
51133 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
51134 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
51135 (Subtarget.hasFP16() && N00Type == MVT::f16)))
51136 return SDValue();
51137
51138 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
51139 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(Opc);
51140 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
51141 return DAG.getBitcast(VT, FPLogic);
51142 }
51143
51144 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
51145 !N1.hasOneUse())
51146 return SDValue();
51147
51148 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
51149 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
51150
51151 // The vector ISA for FP predicates is incomplete before AVX, so converting
51152 // COMIS* to CMPS* may not be a win before AVX.
51153 if (!Subtarget.hasAVX() &&
51154 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
51155 return SDValue();
51156
51157 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
51158 // and vector logic:
51159 // logic (setcc N00, N01), (setcc N10, N11) -->
51160 // extelt (logic (setcc (s2v N00), (s2v N01)), (setcc (s2v N10), (s2v N11))), 0
51161 unsigned NumElts = 128 / N00Type.getSizeInBits();
51162 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
51163 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
51164 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
51165 SDValue N01 = N0.getOperand(1);
51166 SDValue N11 = N1.getOperand(1);
51167 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
51168 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
51169 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
51170 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
51171 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
51172 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
51173 SDValue Logic = DAG.getNode(Opc, DL, BoolVecVT, Setcc0, Setcc1);
51174 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
51175}
51176
51177// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
51178// to reduce XMM->GPR traffic.
51179static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0,
51180 SDValue N1, SelectionDAG &DAG) {
51181 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51182 "Unexpected bit opcode");
51183
51184 // Both operands must be single use MOVMSK.
51185 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
51186 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
51187 return SDValue();
51188
51189 SDValue Vec0 = N0.getOperand(0);
51190 SDValue Vec1 = N1.getOperand(0);
51191 EVT VecVT0 = Vec0.getValueType();
51192 EVT VecVT1 = Vec1.getValueType();
51193
51194 // Both MOVMSK operands must be from vectors of the same size and same element
51195 // size, but it's OK for an fp/int diff.
51196 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
51197 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
51198 return SDValue();
51199
51200 unsigned VecOpc =
51201 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
51202 SDValue Result =
51203 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
51204 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
51205}
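// Sketch of why the MOVMSK fold above is legal (hypothetical helper, not part
// of this file): MOVMSK just gathers one sign bit per lane, so a bitwise op on
// two gathered masks equals the gathered mask of the lanewise bitwise op, as
// long as both vectors use the same lane width. Shown for 4 x i32 lanes.
static unsigned movmskModel(const int X[4]) {
  unsigned Mask = 0;
  for (int I = 0; I != 4; ++I)
    Mask |= ((unsigned)X[I] >> 31) << I; // sign bit of lane I goes into bit I
  return Mask; // movmskModel(A) & movmskModel(B) == movmskModel of lanewise A & B
}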
51206
51207// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
51208// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
51209// handles in InstCombine.
51210static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT,
51211 SDValue N0, SDValue N1,
51212 SelectionDAG &DAG) {
51213 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51214 "Unexpected bit opcode");
51215
51216 // Both operands must be single use.
51217 if (!N0.hasOneUse() || !N1.hasOneUse())
51218 return SDValue();
51219
51220 // Search for matching shifts.
51221 SDValue BC0 = peekThroughOneUseBitcasts(N0);
51222 SDValue BC1 = peekThroughOneUseBitcasts(N1);
51223
51224 unsigned BCOpc = BC0.getOpcode();
51225 EVT BCVT = BC0.getValueType();
51226 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
51227 return SDValue();
51228
51229 switch (BCOpc) {
51230 case X86ISD::VSHLI:
51231 case X86ISD::VSRLI:
51232 case X86ISD::VSRAI: {
51233 if (BC0.getOperand(1) != BC1.getOperand(1))
51234 return SDValue();
51235 SDValue BitOp =
51236 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
51237 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
51238 return DAG.getBitcast(VT, Shift);
51239 }
51240 }
51241
51242 return SDValue();
51243}
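// Identity behind the shift fold above (hypothetical helper, not part of this
// file): for matching shift kinds and equal shift amounts, a bitwise op
// distributes over the shift, e.g. (X << Z) & (Y << Z) == (X & Y) << Z.
static unsigned shiftDistributeModel(unsigned X, unsigned Y, unsigned Z) {
  // Assumes Z < 32 so the shift is well defined.
  return (X & Y) << Z; // equals (X << Z) & (Y << Z)
}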
51244
51245// Attempt to fold:
51246// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
51247 // TODO: Add PACKUS handling.
51248static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT,
51249 SDValue N0, SDValue N1, SelectionDAG &DAG) {
51250 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51251 "Unexpected bit opcode");
51252
51253 // Both operands must be single use.
51254 if (!N0.hasOneUse() || !N1.hasOneUse())
51255 return SDValue();
51256
51257 // Search for matching packs.
51258 N0 = peekThroughOneUseBitcasts(N0);
51259 N1 = peekThroughOneUseBitcasts(N1);
51260
51261 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
51262 return SDValue();
51263
51264 MVT DstVT = N0.getSimpleValueType();
51265 if (DstVT != N1.getSimpleValueType())
51266 return SDValue();
51267
51268 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
51269 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
51270
51271 // Limit to allsignbits packing.
51272 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
51273 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
51274 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
51275 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
51276 return SDValue();
51277
51278 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
51279 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
51280 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
51281}
51282
51283/// If this is a zero/all-bits result that is bitwise-anded with a low bits
51284/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
51285/// with a shift-right to eliminate loading the vector constant mask value.
51286 static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL,
51287 SelectionDAG &DAG,
51288 const X86Subtarget &Subtarget) {
51289 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
51290 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
51291 EVT VT = Op0.getValueType();
51292 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
51293 return SDValue();
51294
51295 // Try to convert an "is positive" signbit masking operation into arithmetic
51296 // shift and "andn". This saves a materialization of a -1 vector constant.
51297 // The "is negative" variant should be handled more generally because it only
51298 // requires "and" rather than "andn":
51299 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
51300 //
51301 // This is limited to the original type to avoid producing even more bitcasts.
51302 // If the bitcasts can't be eliminated, then it is unlikely that this fold
51303 // will be profitable.
51304 if (N->getValueType(0) == VT &&
51305 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
51306 SDValue X, Y;
51307 if (Op1.getOpcode() == X86ISD::PCMPGT &&
51308 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
51309 X = Op1.getOperand(0);
51310 Y = Op0;
51311 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
51312 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
51313 X = Op0.getOperand(0);
51314 Y = Op1;
51315 }
51316 if (X && Y) {
51317 SDValue Sra =
51318 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
51319 VT.getScalarSizeInBits() - 1, DAG);
51320 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
51321 }
51322 }
51323
51324 APInt SplatVal;
51325 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
51326 return SDValue();
51327
51328 // Don't prevent creation of ANDN.
51329 if (isBitwiseNot(Op0))
51330 return SDValue();
51331
51332 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
51333 return SDValue();
51334
51335 unsigned EltBitWidth = VT.getScalarSizeInBits();
51336 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
51337 return SDValue();
51338
51339 unsigned ShiftVal = SplatVal.countr_one();
51340 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
51341 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
51342 return DAG.getBitcast(N->getValueType(0), Shift);
51343}
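// Scalar model of the mask-to-shift rewrite above (hypothetical helper, not
// part of this file): when a 32-bit lane is known to be all-ones or all-zeros,
// AND with a low-bits mask of N ones equals a logical shift right by 32 - N,
// so no constant mask needs to be materialized.
static unsigned maskToShiftModel(unsigned AllOnesOrZeros, unsigned N) {
  // Assumes 0 < N < 32 and AllOnesOrZeros is 0 or 0xFFFFFFFFu.
  return AllOnesOrZeros >> (32 - N); // == AllOnesOrZeros & ((1u << N) - 1)
}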
51344
51345// Get the index node from the lowered DAG of a GEP IR instruction with one
51346// indexing dimension.
51347 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
51348 if (Ld->isIndexed())
51349 return SDValue();
51350
51351 SDValue Base = Ld->getBasePtr();
51352 if (Base.getOpcode() != ISD::ADD)
51353 return SDValue();
51354
51355 SDValue ShiftedIndex = Base.getOperand(0);
51356 if (ShiftedIndex.getOpcode() != ISD::SHL)
51357 return SDValue();
51358
51359 return ShiftedIndex.getOperand(0);
51360}
51361
51362static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
51363 return Subtarget.hasBMI2() &&
51364 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
51365}
51366
51367/// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
51368/// This undoes the inverse fold performed in InstCombine
51369 static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL,
51370 SelectionDAG &DAG) {
51371 using namespace llvm::SDPatternMatch;
51372 MVT VT = N->getSimpleValueType(0);
51373 if (!DAG.getTargetLoweringInfo().hasAndNot(SDValue(N, 0)))
51374 return SDValue();
51375
51376 SDValue X, Y, Z;
51377 if (sd_match(N, m_And(m_Value(X),
51378 m_OneUse(m_Or(m_Value(Y), m_Not(m_Value(Z))))))) {
51379 // Don't fold if Y or Z are constants to prevent infinite loops.
51380 if (!DAG.isConstantIntBuildVectorOrConstantInt(Y) &&
51381 !DAG.isConstantIntBuildVectorOrConstantInt(Z))
51382 return DAG.getNode(
51383 ISD::AND, DL, VT, X,
51384 DAG.getNOT(
51385 DL, DAG.getNode(ISD::AND, DL, VT, DAG.getNOT(DL, Y, VT), Z), VT));
51386 }
51387
51388 return SDValue();
51389}
51390
51391 // This function recognizes cases where the X86 bzhi instruction can replace an
51392// 'and-load' sequence.
51393 // In the case of loading an integer value from an array of constants defined
51394// as follows:
51395//
51396// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
51397//
51398 // and then applying a bitwise AND between the result and another input,
51399 // it is equivalent to performing bzhi (zero high bits) on the input, with the
51400 // same index as the load.
51401 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
51402 const X86Subtarget &Subtarget) {
51403 MVT VT = Node->getSimpleValueType(0);
51404 SDLoc dl(Node);
51405
51406 // Check if subtarget has BZHI instruction for the node's type
51407 if (!hasBZHI(Subtarget, VT))
51408 return SDValue();
51409
51410 // Try matching the pattern for both operands.
51411 for (unsigned i = 0; i < 2; i++) {
51412 // continue if the operand is not a load instruction
51413 auto *Ld = dyn_cast<LoadSDNode>(Node->getOperand(i));
51414 if (!Ld)
51415 continue;
51416 const Value *MemOp = Ld->getMemOperand()->getValue();
51417 if (!MemOp)
51418 continue;
51419 // Get the Node which indexes into the array.
51420 SDValue Index = getIndexFromUnindexedLoad(Ld);
51421 if (!Index)
51422 continue;
51423
51424 if (auto *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
51425 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
51426 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
51427 Constant *Init = GV->getInitializer();
51428 Type *Ty = Init->getType();
51429 if (!isa<ArrayType>(Ty) ||
51430 !Ty->getArrayElementType()->isIntegerTy() ||
51431 Ty->getArrayElementType()->getScalarSizeInBits() !=
51432 VT.getSizeInBits() ||
51433 Ty->getArrayNumElements() >
51434 Ty->getArrayElementType()->getScalarSizeInBits())
51435 continue;
51436
51437 // Check if the array's constant elements are suitable to our case.
51438 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
51439 bool ConstantsMatch = true;
51440 for (uint64_t j = 0; j < ArrayElementCount; j++) {
51441 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
51442 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
51443 ConstantsMatch = false;
51444 break;
51445 }
51446 }
51447 if (!ConstantsMatch)
51448 continue;
51449
51450 // Do the transformation (For 32-bit type):
51451 // -> (and (load arr[idx]), inp)
51452 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
51453 // that will be replaced with one bzhi instruction.
51454 SDValue Inp = Node->getOperand(i == 0 ? 1 : 0);
51455 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
51456
51457 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
51458 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
51459 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
51460
51461 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
51462 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
51463 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
51464 }
51465 }
51466 }
51467 }
51468 return SDValue();
51469}
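// Scalar model of the BZHI rewrite above for the 32-bit case (hypothetical
// helper, not part of this file): because arr[Idx] == (1u << Idx) - 1, the
// and-load is the same as clearing all bits of Inp at positions >= Idx, which
// is exactly what BZHI computes from Inp and Idx.
static unsigned bzhiModel(unsigned Inp, unsigned Idx) {
  // Assumes 0 < Idx < 32 so the shift below is well defined.
  return Inp & (0xFFFFFFFFu >> (32 - Idx));
}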
51470
51471// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
51472// Where C is a mask containing the same number of bits as the setcc and
51473 // where the setcc will freely zero the upper bits of the k-register. We can replace the
51474// undef in the concat with 0s and remove the AND. This mainly helps with
51475 // v2i1/v4i1 setcc being cast to scalar.
51476 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
51477 const X86Subtarget &Subtarget) {
51478 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
51479
51480 EVT VT = N->getValueType(0);
51481
51482 // Make sure this is an AND with constant. We will check the value of the
51483 // constant later.
51484 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
51485 if (!C1)
51486 return SDValue();
51487
51488 // This is implied by the ConstantSDNode.
51489 assert(!VT.isVector() && "Expected scalar VT!");
51490
51491 SDValue Src = N->getOperand(0);
51492 if (!Src.hasOneUse())
51493 return SDValue();
51494
51495 // (Optionally) peek through any_extend().
51496 if (Src.getOpcode() == ISD::ANY_EXTEND) {
51497 if (!Src.getOperand(0).hasOneUse())
51498 return SDValue();
51499 Src = Src.getOperand(0);
51500 }
51501
51502 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
51503 return SDValue();
51504
51505 Src = Src.getOperand(0);
51506 EVT SrcVT = Src.getValueType();
51507
51508 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51509 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
51510 !TLI.isTypeLegal(SrcVT))
51511 return SDValue();
51512
51513 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
51514 return SDValue();
51515
51516 // We only care about the first subvector of the concat, we expect the
51517 // other subvectors to be ignored due to the AND if we make the change.
51518 SDValue SubVec = Src.getOperand(0);
51519 EVT SubVecVT = SubVec.getValueType();
51520
51521 // The RHS of the AND should be a mask with as many bits as SubVec.
51522 if (!TLI.isTypeLegal(SubVecVT) ||
51523 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
51524 return SDValue();
51525
51526 // First subvector should be a setcc with a legal result type or a
51527 // AND containing at least one setcc with a legal result type.
51528 auto IsLegalSetCC = [&](SDValue V) {
51529 if (V.getOpcode() != ISD::SETCC)
51530 return false;
51531 EVT SetccVT = V.getOperand(0).getValueType();
51532 if (!TLI.isTypeLegal(SetccVT) ||
51533 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
51534 return false;
51535 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
51536 return false;
51537 return true;
51538 };
51539 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
51540 (IsLegalSetCC(SubVec.getOperand(0)) ||
51541 IsLegalSetCC(SubVec.getOperand(1))))))
51542 return SDValue();
51543
51544 // We passed all the checks. Rebuild the concat_vectors with zeroes
51545 // and cast it back to VT.
51546 SDLoc dl(N);
51547 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
51548 DAG.getConstant(0, dl, SubVecVT));
51549 Ops[0] = SubVec;
51550 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
51551 Ops);
51552 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
51553 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
51554}
51555
51556 static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
51557 SDValue OpMustEq, SDValue Op, unsigned Depth) {
51558 // We don't want to go crazy with the recursion here. This isn't a super
51559 // important optimization.
51560 static constexpr unsigned kMaxDepth = 2;
51561
51562 // Only do this re-ordering if op has one use.
51563 if (!Op.hasOneUse())
51564 return SDValue();
51565
51566 SDLoc DL(Op);
51567 // If we hit another associative op, recurse further.
51568 if (Op.getOpcode() == Opc) {
51569 // Done recursing.
51570 if (Depth++ >= kMaxDepth)
51571 return SDValue();
51572
51573 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51574 if (SDValue R =
51575 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
51576 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
51577 Op.getOperand(1 - OpIdx));
51578
51579 } else if (Op.getOpcode() == ISD::SUB) {
51580 if (Opc == ISD::AND) {
51581 // BLSI: (and x, (sub 0, x))
51582 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
51583 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51584 }
51585 // Opc must be ISD::AND or ISD::XOR
51586 // BLSR: (and x, (sub x, 1))
51587 // BLSMSK: (xor x, (sub x, 1))
51588 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51589 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51590
51591 } else if (Op.getOpcode() == ISD::ADD) {
51592 // Opc must be ISD::AND or ISD::XOR
51593 // BLSR: (and x, (add x, -1))
51594 // BLSMSK: (xor x, (add x, -1))
51595 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51596 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51597 }
51598 return SDValue();
51599}
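// Scalar forms of the BMI patterns matched above (hypothetical helpers, not
// part of this file):
//   BLSI(x)   == x & -x        isolate the lowest set bit
//   BLSR(x)   == x & (x - 1)   reset the lowest set bit
//   BLSMSK(x) == x ^ (x - 1)   mask up to and including the lowest set bit
static unsigned blsiModel(unsigned X) { return X & (0u - X); }
static unsigned blsrModel(unsigned X) { return X & (X - 1); }
static unsigned blsmskModel(unsigned X) { return X ^ (X - 1); }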
51600
51601 static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
51602 const X86Subtarget &Subtarget) {
51603 EVT VT = N->getValueType(0);
51604 // Make sure this node is a candidate for BMI instructions.
51605 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
51606 (VT != MVT::i32 && VT != MVT::i64))
51607 return SDValue();
51608
51609 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
51610
51611 // Try and match LHS and RHS.
51612 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51613 if (SDValue OpMatch =
51614 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
51615 N->getOperand(1 - OpIdx), 0))
51616 return OpMatch;
51617 return SDValue();
51618}
51619
51620/// Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
51621 static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL,
51622 SelectionDAG &DAG,
51623 const X86Subtarget &Subtarget) {
51624 using namespace llvm::SDPatternMatch;
51625
51626 EVT VT = And->getValueType(0);
51627 // Make sure this node is a candidate for BMI instructions.
51628 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
51629 return SDValue();
51630
51631 SDValue X;
51632 SDValue Y;
51633 if (!sd_match(And, m_And(m_OneUse(m_Xor(m_Value(X),
51634 m_Neg(m_Deferred(X)))),
51635 m_Value(Y))))
51636 return SDValue();
51637
51638 SDValue BLSMSK =
51639 DAG.getNode(ISD::XOR, DL, VT, X,
51640 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getConstant(1, DL, VT)));
51641 SDValue AndN = DAG.getNode(ISD::AND, DL, VT, Y, DAG.getNOT(DL, BLSMSK, VT));
51642 return AndN;
51643}
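// Check of the identity behind the fold above (hypothetical helper, not part
// of this file): for any X, (X ^ -X) == ~(X ^ (X - 1)), so AND(Y, XOR(X,
// NEG(X))) is the same as ANDN applied to BLSMSK(X) and Y.
static unsigned andXorNegModel(unsigned X, unsigned Y) {
  unsigned Blsmsk = X ^ (X - 1); // BLSMSK(X)
  return Y & ~Blsmsk;            // equals Y & (X ^ (0u - X))
}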
51644
51645 static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag,
51646 SelectionDAG &DAG,
51647 TargetLowering::DAGCombinerInfo &DCI,
51648 const X86Subtarget &ST) {
51649 // cmp(setcc(cc, X), 0)
51650 // brcond ne
51651 // ->
51652 // X
51653 // brcond cc
51654
51655 // sub(setcc(cc, X), 1)
51656 // brcond ne
51657 // ->
51658 // X
51659 // brcond ~cc
51660 //
51661 // if only the flag result has users
51662
51663 SDValue SetCC = N->getOperand(0);
51664
51665 if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse())
51666 return SDValue();
51667
51668 // Check the only user of flag is `brcond ne`.
51669 SDNode *BrCond = *Flag->user_begin();
51670 if (BrCond->getOpcode() != X86ISD::BRCOND)
51671 return SDValue();
51672 unsigned CondNo = 2;
51673 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
51674 X86::COND_NE)
51675 return SDValue();
51676
51677 SDValue X = SetCC.getOperand(1);
51678 // sub has two results while X only has one. DAG combine assumes the value
51679 // type matches.
51680 if (N->getOpcode() == X86ISD::SUB)
51681 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
51682
51683 SDValue CCN = SetCC.getOperand(0);
51684 X86::CondCode CC =
51685 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
51686 X86::CondCode OppositeCC = X86::GetOppositeBranchCondition(CC);
51687 // Update CC for the consumer of the flag.
51688 // The old CC is `ne`. Hence, when comparing the result with 0, we are
51689 // checking if the second condition evaluates to true. When comparing the
51690 // result with 1, we are checking if the second condition evaluates to false.
51691 SmallVector<SDValue> Ops(BrCond->op_values());
51692 if (isNullConstant(N->getOperand(1)))
51693 Ops[CondNo] = CCN;
51694 else if (isOneConstant(N->getOperand(1)))
51695 Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8);
51696 else
51697 llvm_unreachable("expect constant 0 or 1");
51698
51699 SDValue NewBrCond =
51700 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
51701 // Avoid self-assign error b/c CC1 can be `e/ne`.
51702 if (BrCond != NewBrCond.getNode())
51703 DCI.CombineTo(BrCond, NewBrCond);
51704 return X;
51705}
51706
51707 static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
51708 TargetLowering::DAGCombinerInfo &DCI,
51709 const X86Subtarget &ST) {
51710 // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y)))
51711 // ->
51712 // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0))
51713
51714 // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0)))
51715 // ->
51716 // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0))
51717 //
51718 // where cflags is determined by cc1.
51719
51720 if (!ST.hasCCMP())
51721 return SDValue();
51722
51723 SDValue SetCC0 = N->getOperand(0);
51724 SDValue SetCC1 = N->getOperand(1);
51725 if (SetCC0.getOpcode() != X86ISD::SETCC ||
51726 SetCC1.getOpcode() != X86ISD::SETCC)
51727 return SDValue();
51728
51729 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
51730 SDValue Op = V.getOperand(1);
51731 unsigned Opc = Op.getOpcode();
51732 if (Opc == X86ISD::SUB)
51733 return X86ISD::CCMP;
51734 if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1)))
51735 return X86ISD::CTEST;
51736 return 0U;
51737 };
51738
51739 unsigned NewOpc = 0;
51740
51741 // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP
51742 // appear on the right.
51743 if (!(NewOpc = GetCombineToOpc(SetCC1))) {
51744 std::swap(SetCC0, SetCC1);
51745 if (!(NewOpc = GetCombineToOpc(SetCC1)))
51746 return SDValue();
51747 }
51748
51749 X86::CondCode CC0 =
51750 static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
51751 // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP.
51752 if (CC0 == X86::COND_P || CC0 == X86::COND_NP)
51753 return SDValue();
51754
51755 bool IsOR = N->getOpcode() == ISD::OR;
51756
51757 // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC
51758 // evaluates to true. So we need to invert CC0 to form SrcCC when the logic
51759 // operator is OR. Similar for CC1.
51760 SDValue SrcCC =
51761 IsOR ? DAG.getTargetConstant(X86::GetOppositeBranchCondition(CC0),
51762 SDLoc(SetCC0.getOperand(0)), MVT::i8)
51763 : SetCC0.getOperand(0);
51764 SDValue CC1N = SetCC1.getOperand(0);
51765 X86::CondCode CC1 =
51766 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
51767 X86::CondCode OppositeCC1 = X86::GetOppositeBranchCondition(CC1);
51768 X86::CondCode CFlagsCC = IsOR ? CC1 : OppositeCC1;
51769 SDLoc DL(N);
51770 SDValue CFlags = DAG.getTargetConstant(
51771 X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8);
51772 SDValue Sub = SetCC1.getOperand(1);
51773
51774 // Replace any uses of the old flag produced by SUB/CMP with the new one
51775 // produced by CCMP/CTEST.
51776 SDValue CCMP = (NewOpc == X86ISD::CCMP)
51777 ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32,
51778 {Sub.getOperand(0), Sub.getOperand(1),
51779 CFlags, SrcCC, SetCC0.getOperand(1)})
51780 : DAG.getNode(X86ISD::CTEST, DL, MVT::i32,
51781 {Sub.getOperand(0), Sub.getOperand(0),
51782 CFlags, SrcCC, SetCC0.getOperand(1)});
51783
51784 return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP});
51785}
51786
51787 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
51788 TargetLowering::DAGCombinerInfo &DCI,
51789 const X86Subtarget &Subtarget) {
51790 using namespace SDPatternMatch;
51791
51792 SDValue N0 = N->getOperand(0);
51793 SDValue N1 = N->getOperand(1);
51794 EVT VT = N->getValueType(0);
51795 SDLoc dl(N);
51796 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51797
51798 // If this is SSE1 only convert to FAND to avoid scalarization.
51799 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51800 return DAG.getBitcast(MVT::v4i32,
51801 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
51802 DAG.getBitcast(MVT::v4f32, N0),
51803 DAG.getBitcast(MVT::v4f32, N1)));
51804 }
51805
51806 // Use a 32-bit and+zext if upper bits known zero.
51807 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
51808 APInt HiMask = APInt::getHighBitsSet(64, 32);
51809 if (DAG.MaskedValueIsZero(N1, HiMask) ||
51810 DAG.MaskedValueIsZero(N0, HiMask)) {
51811 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
51812 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
51813 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
51814 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
51815 }
51816 }
51817
51818 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
51819 // TODO: Support multiple SrcOps.
51820 if (VT == MVT::i1) {
51821 SmallVector<SDValue, 2> SrcOps;
51822 SmallVector<APInt, 2> SrcPartials;
51823 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
51824 SrcOps.size() == 1) {
51825 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51826 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51827 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51828 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51829 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51830 if (Mask) {
51831 assert(SrcPartials[0].getBitWidth() == NumElts &&
51832 "Unexpected partial reduction mask");
51833 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51834 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51835 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
51836 }
51837 }
51838 }
51839
51840 // InstCombine converts:
51841 // `(-x << C0) & C1`
51842 // to
51843 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
51844 // This saves an IR instruction but on x86 the neg/shift version is preferable
51845 // so undo the transform.
51846
51847 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
51848 // TODO: We don't actually need a splat for this, we just need the checks to
51849 // hold for each element.
51850 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
51851 /*AllowTruncation*/ false);
51852 ConstantSDNode *N01C =
51853 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
51854 /*AllowTruncation*/ false);
51855 if (N1C && N01C) {
51856 const APInt &MulC = N01C->getAPIntValue();
51857 const APInt &AndC = N1C->getAPIntValue();
51858 APInt MulCLowBit = MulC & (-MulC);
51859 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
51860 (MulCLowBit + MulC).isPowerOf2()) {
51861 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
51862 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
51863 assert(MulCLowBitLog != -1 &&
51864 "Isolated lowbit is somehow not a power of 2!");
51865 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
51866 DAG.getConstant(MulCLowBitLog, dl, VT));
51867 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
51868 }
51869 }
51870 }
51871
51872 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
51873 return SetCC;
51874
51875 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
51876 return V;
51877
51878 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51879 return R;
51880
51881 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51882 return R;
51883
51884 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51885 return R;
51886
51887 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51888 DAG, DCI, Subtarget))
51889 return FPLogic;
51890
51891 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
51892 return R;
51893
51894 if (DCI.isBeforeLegalizeOps())
51895 return SDValue();
51896
51897 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51898 return R;
51899
51900 if (SDValue R = combineAndNotIntoANDNP(N, dl, DAG))
51901 return R;
51902
51903 if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget))
51904 return ShiftRight;
51905
51906 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
51907 return R;
51908
51909 if (SDValue R = combineAndNotOrIntoAndNotAnd(N, dl, DAG))
51910 return R;
51911
51912 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
51913 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
51914 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
51915 if (VT.isVector() && getTargetConstantFromNode(N1)) {
51916 unsigned Opc0 = N0.getOpcode();
51917 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
51918 getTargetConstantFromNode(N0.getOperand(1)) &&
51919 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
51920 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
51921 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
51922 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
51923 }
51924 }
51925
51926 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask.
51927 // to make use of predicated selects.
51928 // AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0)
51929 if (DCI.isAfterLegalizeDAG() && VT.isVector()) {
51930 SDValue X, Y;
51931 EVT CondVT = VT.changeVectorElementType(MVT::i1);
51932 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
51933 (VT.is512BitVector() || Subtarget.hasVLX()) &&
51934 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
51935 sd_match(N, m_And(m_Value(X),
51936 m_OneUse(m_SExt(m_AllOf(
51937 m_Value(Y), m_SpecificVT(CondVT),
51938 m_SetCC(m_Value(), m_Value(), m_Value()))))))) {
51939 return DAG.getSelect(dl, VT, Y, X,
51940 getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl));
51941 }
51942 }
51943
51944 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
51945 // avoids slow variable shift (moving shift amount to ECX etc.)
51946 if (isOneConstant(N1) && N0->hasOneUse()) {
51947 SDValue Src = N0;
51948 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
51949 Src.getOpcode() == ISD::TRUNCATE) &&
51950 Src.getOperand(0)->hasOneUse())
51951 Src = Src.getOperand(0);
51952 bool ContainsNOT = false;
51953 X86::CondCode X86CC = X86::COND_B;
51954 // Peek through AND(NOT(SRL(X,Y)),1).
51955 if (isBitwiseNot(Src)) {
51956 Src = Src.getOperand(0);
51957 X86CC = X86::COND_AE;
51958 ContainsNOT = true;
51959 }
51960 if (Src.getOpcode() == ISD::SRL &&
51961 !isa<ConstantSDNode>(Src.getOperand(1))) {
51962 SDValue BitNo = Src.getOperand(1);
51963 Src = Src.getOperand(0);
51964 // Peek through AND(SRL(NOT(X),Y),1).
51965 if (isBitwiseNot(Src)) {
51966 Src = Src.getOperand(0);
51967 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
51968 ContainsNOT = true;
51969 }
51970 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
51971 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
51972 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
51973 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
51974 }
51975 }
51976
51977 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51978 // Attempt to recursively combine a bitmask AND with shuffles.
51979 SDValue Op(N, 0);
51980 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51981 return Res;
51982
51983 // If either operand is a constant mask, then only the elements that aren't
51984 // zero are actually demanded by the other operand.
51985 auto GetDemandedMasks = [&](SDValue Op) {
51986 APInt UndefElts;
51987 SmallVector<APInt> EltBits;
51988 int NumElts = VT.getVectorNumElements();
51989 int EltSizeInBits = VT.getScalarSizeInBits();
51990 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
51991 APInt DemandedElts = APInt::getAllOnes(NumElts);
51992 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
51993 EltBits)) {
51994 DemandedBits.clearAllBits();
51995 DemandedElts.clearAllBits();
51996 for (int I = 0; I != NumElts; ++I) {
51997 if (UndefElts[I]) {
51998 // We can't assume an undef src element gives an undef dst - the
51999 // other src might be zero.
52000 DemandedBits.setAllBits();
52001 DemandedElts.setBit(I);
52002 } else if (!EltBits[I].isZero()) {
52003 DemandedBits |= EltBits[I];
52004 DemandedElts.setBit(I);
52005 }
52006 }
52007 }
52008 return std::make_pair(DemandedBits, DemandedElts);
52009 };
52010 APInt Bits0, Elts0;
52011 APInt Bits1, Elts1;
52012 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
52013 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
52014
52015 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
52016 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
52017 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
52018 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
52019 if (N->getOpcode() != ISD::DELETED_NODE)
52020 DCI.AddToWorklist(N);
52021 return SDValue(N, 0);
52022 }
52023
52024 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
52025 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
52026 if (NewN0 || NewN1)
52027 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
52028 NewN1 ? NewN1 : N1);
52029 }
52030
52031 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
52032 if ((VT.getScalarSizeInBits() % 8) == 0 &&
52033 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
52034 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
52035 SDValue BitMask = N1;
52036 SDValue SrcVec = N0.getOperand(0);
52037 EVT SrcVecVT = SrcVec.getValueType();
52038
52039 // Check that the constant bitmask masks whole bytes.
52040 APInt UndefElts;
52041 SmallVector<APInt, 64> EltBits;
52042 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
52043 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
52044 llvm::all_of(EltBits, [](const APInt &M) {
52045 return M.isZero() || M.isAllOnes();
52046 })) {
52047 unsigned NumElts = SrcVecVT.getVectorNumElements();
52048 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
52049 unsigned Idx = N0.getConstantOperandVal(1);
52050
52051 // Create a root shuffle mask from the byte mask and the extracted index.
52052 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
52053 for (unsigned i = 0; i != Scale; ++i) {
52054 if (UndefElts[i])
52055 continue;
52056 int VecIdx = Scale * Idx + i;
52057 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
52058 }
52059
52060 if (SDValue Shuffle = combineX86ShufflesRecursively(
52061 {SrcVec}, 0, SrcVec.getOpcode(), SrcVec.getSimpleValueType(),
52062 ShuffleMask, {}, /*Depth=*/1, X86::MaxShuffleCombineDepth,
52063 /*AllowVariableCrossLaneMask=*/true,
52064 /*AllowVariablePerLaneMask=*/true,
52065 /*IsMaskedShuffle=*/false, DAG, SDLoc(SrcVec), Subtarget))
52066 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
52067 N0.getOperand(1));
52068 }
52069 }
52070
52071 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
52072 return R;
52073
52074 if (SDValue R = combineAndXorSubWithBMI(N, dl, DAG, Subtarget))
52075 return R;
52076
52077 return SDValue();
52078}
52079
52080// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
52081 static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL,
52082 SelectionDAG &DAG,
52083 const X86Subtarget &Subtarget) {
52084 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52085
52086 MVT VT = N->getSimpleValueType(0);
52087 unsigned EltSizeInBits = VT.getScalarSizeInBits();
52088 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
52089 return SDValue();
52090
52091 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
52092 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
52093 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
52094 return SDValue();
52095
52096 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
52097 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
52098 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
52099 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
52100 return SDValue();
52101
52102 // Attempt to extract constant byte masks.
52103 APInt UndefElts0, UndefElts1;
52104 SmallVector<APInt, 32> EltBits0, EltBits1;
52105 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
52106 /*AllowWholeUndefs*/ false,
52107 /*AllowPartialUndefs*/ false))
52108 return SDValue();
52109 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
52110 /*AllowWholeUndefs*/ false,
52111 /*AllowPartialUndefs*/ false))
52112 return SDValue();
52113
52114 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
52115 // TODO - add UNDEF elts support.
52116 if (UndefElts0[i] || UndefElts1[i])
52117 return SDValue();
52118 if (EltBits0[i] != ~EltBits1[i])
52119 return SDValue();
52120 }
52121
52122 if (useVPTERNLOG(Subtarget, VT)) {
52123 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
52124 // VPTERNLOG is only available as vXi32/64-bit types.
52125 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
52126 MVT OpVT =
52127 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
52128 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
52129 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
52130 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
52131 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
52132 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
52133 DAG, Subtarget);
52134 return DAG.getBitcast(VT, Res);
52135 }
52136
52137 SDValue X = N->getOperand(0);
52138 SDValue Y =
52139 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
52140 DAG.getBitcast(VT, N1.getOperand(0)));
52141 return DAG.getNode(ISD::OR, DL, VT, X, Y);
52142}
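// Bitwise model of the select canonicalized above (hypothetical helper, not
// part of this file): with mask C on one AND leg and ~C on the other,
// OR(AND(X, C), AND(Y, ~C)) picks X's bits where C is set and Y's bits
// elsewhere, which is the truth table the 0xCA VPTERNLOG immediate encodes.
static unsigned bitSelectModel(unsigned C, unsigned X, unsigned Y) {
  return (X & C) | (Y & ~C);
}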
52143
52144// Try to match OR(ANDNP(MASK,X),AND(MASK,Y)) logic pattern.
52145// TODO: Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
52146// Waiting for ANDNP combine allows other combines to happen that prevent
52147// matching.
52148static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
52149 using namespace SDPatternMatch;
52150 return sd_match(N, m_Or(m_BinOp(X86ISD::ANDNP, m_Value(Mask), m_Value(X)),
52151 m_And(m_Deferred(Mask), m_Value(Y))));
52152}
52153
52154// Try to fold:
52155// (or (and (m, y), (pandn m, x)))
52156// into:
52157// (vselect m, x, y)
52158// As a special case, try to fold:
52159// (or (and (m, (sub 0, x)), (pandn m, x)))
52160// into:
52161// (sub (xor X, M), M)
52162 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL,
52163 SelectionDAG &DAG,
52164 const X86Subtarget &Subtarget) {
52165 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52166
52167 EVT VT = N->getValueType(0);
52168 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
52169 (VT.is256BitVector() && Subtarget.hasInt256())))
52170 return SDValue();
52171
52172 SDValue X, Y, Mask;
52173 if (!matchLogicBlend(N, X, Y, Mask))
52174 return SDValue();
52175
52176 // Validate that X, Y, and Mask are bitcasts, and see through them.
52177 Mask = peekThroughBitcasts(Mask);
52178 X = peekThroughBitcasts(X);
52179 Y = peekThroughBitcasts(Y);
52180
52181 EVT MaskVT = Mask.getValueType();
52182 unsigned EltBits = MaskVT.getScalarSizeInBits();
52183
52184 // TODO: Attempt to handle floating point cases as well?
52185 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
52186 return SDValue();
52187
52188 // Attempt to combine to conditional negate: (sub (xor X, M), M)
52189 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
52190 DAG, Subtarget))
52191 return Res;
52192
52193 // PBLENDVB is only available on SSE 4.1.
52194 if (!Subtarget.hasSSE41())
52195 return SDValue();
52196
52197 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
52198 if (Subtarget.hasVLX())
52199 return SDValue();
52200
52201 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
52202
52203 X = DAG.getBitcast(BlendVT, X);
52204 Y = DAG.getBitcast(BlendVT, Y);
52205 Mask = DAG.getBitcast(BlendVT, Mask);
52206 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
52207 return DAG.getBitcast(VT, Mask);
52208}
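// Illustrative sketch, not part of the original file: the conditional-negate
// special case handled above relies on (X ^ M) - M == (M ? -X : X) when every
// lane of M is either all-zeros or all-ones (checked via ComputeNumSignBits).
// Scalar model with a hypothetical helper name:
static inline int conditionalNegateModel(int X, int M) {
  // Assumes M is 0 or -1; for M == -1 this yields (~X) + 1 == -X.
  return (X ^ M) - M;
}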
52209
52210// Helper function for combineOrCmpEqZeroToCtlzSrl
52211// Transforms:
52212// seteq(cmp x, 0)
52213// into:
52214// srl(ctlz x), log2(bitsize(x))
52215// Input pattern is checked by caller.
52216 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
52217 SDValue Cmp = Op.getOperand(1);
52218 EVT VT = Cmp.getOperand(0).getValueType();
52219 unsigned Log2b = Log2_32(VT.getSizeInBits());
52220 SDLoc dl(Op);
52221 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
52222 // The result of the shift is true or false, and on X86, the 32-bit
52223 // encoding of shr and lzcnt is more desirable.
52224 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
52225 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
52226 DAG.getConstant(Log2b, dl, MVT::i8));
52227 return Scc;
52228}
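// Illustrative sketch, not part of the original file: for a 32-bit input,
// (x == 0) is equivalent to (lzcnt(x) >> 5), because lzcnt returns 32 only for
// a zero input. Scalar model using the GCC/Clang builtin (hypothetical helper;
// __builtin_clz is undefined for 0, so the zero case is spelled out):
static inline unsigned isZeroViaLzcntModel(unsigned X) {
  unsigned Lz = X ? (unsigned)__builtin_clz(X) : 32u; // lzcnt semantics
  return Lz >> 5; // 1 iff X == 0, matching seteq(cmp X, 0)
}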
52229
52230// Try to transform:
52231// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
52232// into:
52233 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
52234// Will also attempt to match more generic cases, eg:
52235// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
52236// Only applies if the target supports the FastLZCNT feature.
52237 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
52238 TargetLowering::DAGCombinerInfo &DCI,
52239 const X86Subtarget &Subtarget) {
52240 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
52241 return SDValue();
52242
52243 auto isORCandidate = [](SDValue N) {
52244 return (N->getOpcode() == ISD::OR && N->hasOneUse());
52245 };
52246
52247 // Check that the zero extend is extending to 32 bits or more. The code
52248 // generated by srl(ctlz) for 16-bit or smaller variants of the pattern would
52249 // require extra instructions to clear the upper bits.
52250 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
52251 !isORCandidate(N->getOperand(0)))
52252 return SDValue();
52253
52254 // Check the node matches: setcc(eq, cmp 0)
52255 auto isSetCCCandidate = [](SDValue N) {
52256 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
52257 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
52258 N->getOperand(1).getOpcode() == X86ISD::CMP &&
52259 isNullConstant(N->getOperand(1).getOperand(1)) &&
52260 N->getOperand(1).getValueType().bitsGE(MVT::i32);
52261 };
52262
52263 SDNode *OR = N->getOperand(0).getNode();
52264 SDValue LHS = OR->getOperand(0);
52265 SDValue RHS = OR->getOperand(1);
52266
52267 // Save nodes matching or(or, setcc(eq, cmp 0)).
52268 SmallVector<SDNode *, 2> ORNodes;
52269 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
52270 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
52271 ORNodes.push_back(OR);
52272 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
52273 LHS = OR->getOperand(0);
52274 RHS = OR->getOperand(1);
52275 }
52276
52277 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
52278 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
52279 !isORCandidate(SDValue(OR, 0)))
52280 return SDValue();
52281
52282 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower
52283 // it to
52284 // or(srl(ctlz), srl(ctlz)).
52285 // The dag combiner can then fold it into:
52286 // srl(or(ctlz, ctlz)).
52287 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
52288 SDValue Ret, NewRHS;
52289 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
52290 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
52291
52292 if (!Ret)
52293 return SDValue();
52294
52295 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
52296 while (!ORNodes.empty()) {
52297 OR = ORNodes.pop_back_val();
52298 LHS = OR->getOperand(0);
52299 RHS = OR->getOperand(1);
52300 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
52301 if (RHS->getOpcode() == ISD::OR)
52302 std::swap(LHS, RHS);
52303 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
52304 if (!NewRHS)
52305 return SDValue();
52306 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
52307 }
52308
52309 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
52310}
52311
52312/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52313/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52314/// with CMP+{ADC, SBB}.
52315/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
52316static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
52317 SDValue X, SDValue Y,
52318 SelectionDAG &DAG,
52319 bool ZeroSecondOpOnly = false) {
52320 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
52321 return SDValue();
52322
52323 // Look through a one-use zext.
52324 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
52325 Y = Y.getOperand(0);
52326
52327 X86::CondCode CC;
52328 SDValue EFLAGS;
52329 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
52330 CC = (X86::CondCode)Y.getConstantOperandVal(0);
52331 EFLAGS = Y.getOperand(1);
52332 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
52333 Y.hasOneUse()) {
52334 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
52335 }
52336
52337 if (!EFLAGS)
52338 return SDValue();
52339
52340 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52341 // the general case below.
52342 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
52343 if (ConstantX && !ZeroSecondOpOnly) {
52344 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
52345 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
52346 // This is a complicated way to get -1 or 0 from the carry flag:
52347 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52348 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52349 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52350 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52351 EFLAGS);
52352 }
52353
52354 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
52355 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
52356 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
52357 EFLAGS.getValueType().isInteger() &&
52358 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52359 // Swap the operands of a SUB, and we have the same pattern as above.
52360 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
52361 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
52362 SDValue NewSub = DAG.getNode(
52363 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52364 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52365 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
52366 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52367 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52368 NewEFLAGS);
52369 }
52370 }
52371 }
52372
52373 if (CC == X86::COND_B) {
52374 // X + SETB Z --> adc X, 0
52375 // X - SETB Z --> sbb X, 0
52376 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52377 DAG.getVTList(VT, MVT::i32), X,
52378 DAG.getConstant(0, DL, VT), EFLAGS);
52379 }
52380
52381 if (ZeroSecondOpOnly)
52382 return SDValue();
52383
52384 if (CC == X86::COND_A) {
52385 // Try to convert COND_A into COND_B in an attempt to facilitate
52386 // materializing "setb reg".
52387 //
52388 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
52389 // cannot take an immediate as its first operand.
52390 //
52391 // If EFLAGS is from a CMP that compares the same operands as the earlier
52392 // SUB producing X (i.e. CMP X, Y), we can directly use the carry flag with
52393 // SBB/ADC without creating a flipped SUB.
52394 if (EFLAGS.getOpcode() == X86ISD::CMP &&
52395 EFLAGS.getValueType().isInteger() && X == EFLAGS.getOperand(0)) {
52396 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52397 DAG.getVTList(VT, MVT::i32), X,
52398 DAG.getConstant(0, DL, VT), EFLAGS);
52399 }
52400
52401 if (EFLAGS.getOpcode() == X86ISD::SUB &&
52402 EFLAGS.getValueType().isInteger() &&
52403 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52404 // Only create NewSub if we know one of the folds will succeed to avoid
52405 // introducing a temporary node that may persist and affect one-use checks
52406 // below.
52407 if (EFLAGS.getNode()->hasOneUse()) {
52408 SDValue NewSub = DAG.getNode(
52409 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52410 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52411 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52412 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52413 DAG.getVTList(VT, MVT::i32), X,
52414 DAG.getConstant(0, DL, VT), NewEFLAGS);
52415 }
52416
52417 if (IsSub && X == EFLAGS.getValue(0)) {
52418 SDValue NewSub = DAG.getNode(
52419 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52420 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52421 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52422 return DAG.getNode(X86ISD::SBB, DL, DAG.getVTList(VT, MVT::i32),
52423 EFLAGS.getOperand(0), EFLAGS.getOperand(1),
52424 NewEFLAGS);
52425 }
52426 }
52427 }
52428
52429 if (CC == X86::COND_AE) {
52430 // X + SETAE --> sbb X, -1
52431 // X - SETAE --> adc X, -1
52432 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52433 DAG.getVTList(VT, MVT::i32), X,
52434 DAG.getAllOnesConstant(DL, VT), EFLAGS);
52435 }
52436
52437 if (CC == X86::COND_BE) {
52438 // X + SETBE --> sbb X, -1
52439 // X - SETBE --> adc X, -1
52440 // Try to convert COND_BE into COND_AE in an attempt to facilitate
52441 // materializing "setae reg".
52442 //
52443 // Do not flip "e <= c", where "c" is a constant, because the Cmp instruction
52444 // cannot take an immediate as its first operand.
52445 //
52446 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52447 EFLAGS.getValueType().isInteger() &&
52448 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52449 SDValue NewSub =
52450 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52451 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52452 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52453 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52454 DAG.getVTList(VT, MVT::i32), X,
52455 DAG.getAllOnesConstant(DL, VT), NewEFLAGS);
52456 }
52457 }
52458
52459 if (CC != X86::COND_E && CC != X86::COND_NE)
52460 return SDValue();
52461
52462 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
52463 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
52464 !EFLAGS.getOperand(0).getValueType().isInteger())
52465 return SDValue();
52466
52467 SDValue Z = EFLAGS.getOperand(0);
52468 EVT ZVT = Z.getValueType();
52469
52470 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52471 // the general case below.
52472 if (ConstantX) {
52473 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
52474 // fake operands:
52475 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
52476 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
52477 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
52478 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
52479 SDValue Zero = DAG.getConstant(0, DL, ZVT);
52480 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52481 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
52482 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52483 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52484 SDValue(Neg.getNode(), 1));
52485 }
52486
52487 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
52488 // with fake operands:
52489 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
52490 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
52491 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
52492 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
52493 SDValue One = DAG.getConstant(1, DL, ZVT);
52494 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52495 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52496 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52497 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52498 Cmp1.getValue(1));
52499 }
52500 }
52501
52502 // (cmp Z, 1) sets the carry flag if Z is 0.
52503 SDValue One = DAG.getConstant(1, DL, ZVT);
52504 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52505 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52506
52507 // Add the flags type for ADC/SBB nodes.
52508 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52509
52510 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
52511 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
52512 if (CC == X86::COND_NE)
52513 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
52514 DAG.getAllOnesConstant(DL, VT), Cmp1.getValue(1));
52515
52516 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
52517 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
52518 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
52519 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
52520}
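// Illustrative sketch, not part of the original file: the ADC/SBB forms built
// above implement identities such as "X + (A < B)" via CMP A, B ; ADC X, 0 and
// "X - (A < B)" via CMP A, B ; SBB X, 0, with the comparison result carried
// entirely in the carry flag. Scalar model with a hypothetical helper name:
static inline unsigned long long addCarryBitModel(unsigned long long X,
                                                  unsigned long long A,
                                                  unsigned long long B) {
  return X + (A < B ? 1u : 0u); // what CMP+ADC computes without a SETCC
}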
52521
52522/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52523/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52524/// with CMP+{ADC, SBB}.
52525 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL,
52526 SelectionDAG &DAG) {
52527 bool IsSub = N->getOpcode() == ISD::SUB;
52528 SDValue X = N->getOperand(0);
52529 SDValue Y = N->getOperand(1);
52530 EVT VT = N->getValueType(0);
52531
52532 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
52533 return ADCOrSBB;
52534
52535 // Commute and try again (negate the result for subtracts).
52536 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
52537 if (IsSub)
52538 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
52539 return ADCOrSBB;
52540 }
52541
52542 return SDValue();
52543}
52544
52545static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT,
52546 SDValue N0, SDValue N1,
52547 SelectionDAG &DAG) {
52548 assert((Opc == ISD::XOR || Opc == ISD::OR) && "Unexpected opcode");
52549
52550 // Delegate to combineAddOrSubToADCOrSBB if we have:
52551 //
52552 // (xor/or (zero_extend (setcc)) imm)
52553 //
52554 // where imm is odd if and only if we have xor, in which case the XOR/OR are
52555 // equivalent to a SUB/ADD, respectively.
52556 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
52557 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
52558 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
52559 bool IsSub = Opc == ISD::XOR;
52560 bool N1COdd = N1C->getZExtValue() & 1;
52561 if (IsSub ? N1COdd : !N1COdd)
52562 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
52563 return R;
52564 }
52565 }
52566
52567 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
52568 if (Opc == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
52569 N0.getOperand(0).getOpcode() == ISD::AND &&
52570 ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
52571 ISD::isBuildVectorAllOnes(N1.getNode()) &&
52572 isConstantPowerOf2(N0.getOperand(0).getOperand(1),
52573 VT.getScalarSizeInBits(), /*AllowUndefs=*/true)) {
52574 return DAG.getNode(X86ISD::PCMPEQ, DL, VT, N0.getOperand(0),
52575 N0.getOperand(0).getOperand(1));
52576 }
52577
52578 return SDValue();
52579}
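// Illustrative sketch, not part of the original file: for a boolean B in
// {0, 1}, B ^ OddImm == OddImm - B and B | EvenImm == EvenImm + B, which is
// why the code above can forward these XOR/OR forms to the ADC/SBB combine.
static inline unsigned xorWithOddImmModel(unsigned B /* 0 or 1 */,
                                          unsigned OddImm) {
  return OddImm - B; // same value as (B ^ OddImm) since bit 0 of OddImm is set
}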
52580
52581 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
52582 TargetLowering::DAGCombinerInfo &DCI,
52583 const X86Subtarget &Subtarget) {
52584 SDValue N0 = N->getOperand(0);
52585 SDValue N1 = N->getOperand(1);
52586 EVT VT = N->getValueType(0);
52587 SDLoc dl(N);
52588 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52589
52590 // If this is SSE1 only convert to FOR to avoid scalarization.
52591 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52592 return DAG.getBitcast(MVT::v4i32,
52593 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
52594 DAG.getBitcast(MVT::v4f32, N0),
52595 DAG.getBitcast(MVT::v4f32, N1)));
52596 }
52597
52598 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
52599 // TODO: Support multiple SrcOps.
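 // (Illustrative note, not in the original source: an any-of reduction over a
 // bool vector is lowered by collapsing the lanes into a scalar mask - e.g.
 // via MOVMSK or a vXi1-to-integer bitcast - masking off the lanes of interest,
 // and then emitting a single SETNE against zero.)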
52600 if (VT == MVT::i1) {
52601 SmallVector<SDValue, 2> SrcOps;
52602 SmallVector<APInt, 2> SrcPartials;
52603 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
52604 SrcOps.size() == 1) {
52605 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
52606 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
52607 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
52608 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
52609 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
52610 if (Mask) {
52611 assert(SrcPartials[0].getBitWidth() == NumElts &&
52612 "Unexpected partial reduction mask");
52613 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
52614 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
52615 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
52616 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
52617 }
52618 }
52619 }
52620
52621 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
52622 return SetCC;
52623
52624 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
52625 return R;
52626
52627 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
52628 return R;
52629
52630 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
52631 return R;
52632
52633 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
52634 DAG, DCI, Subtarget))
52635 return FPLogic;
52636
52637 if (DCI.isBeforeLegalizeOps())
52638 return SDValue();
52639
52640 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
52641 return R;
52642
52643 if (SDValue R = canonicalizeBitSelect(N, dl, DAG, Subtarget))
52644 return R;
52645
52646 if (SDValue R = combineLogicBlendIntoPBLENDV(N, dl, DAG, Subtarget))
52647 return R;
52648
52649 // Combine `(x86isd::setcc_carry) | C` and `(0 - SetCC) | C`
52650 // into `(zext (not SetCC)) * (C + 1) - 1` if we can get a LEA out of it.
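 // (Worked example, not in the original source: with C == 3 and SetCC == b in
 // {0, 1}, (0 - b) | 3 is 3 for b == 0 and -1 for b == 1; (zext(!b)) * 4 - 1
 // yields the same two values, and the multiply-by-4 plus decrement can be
 // folded into a single LEA.)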
52651 if ((VT == MVT::i32 || VT == MVT::i64) && N0.hasOneUse()) {
52652 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
52653 uint64_t Val = CN->getZExtValue();
52654 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 ||
52655 Val == 8) {
52656 SDValue NotCond;
52657 if (N0.getOpcode() == X86ISD::SETCC_CARRY &&
52658 N0.getOperand(1).hasOneUse()) {
52659 X86::CondCode CC = (X86::CondCode)N0.getConstantOperandVal(0);
52660 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
52661 NotCond = getSETCC(NewCC, N0.getOperand(1), SDLoc(N0), DAG);
52662 } else if (N0.getOpcode() == ISD::SUB &&
52663 isNullConstant(N0.getOperand(0))) {
52664 SDValue Cond = N0.getOperand(1);
52665 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
52666 Cond = Cond.getOperand(0);
52667 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
52668 X86::CondCode OldCC = (X86::CondCode)Cond.getConstantOperandVal(0);
52669 X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
52670 NotCond = getSETCC(NewCC, Cond.getOperand(1), SDLoc(Cond), DAG);
52671 }
52672 }
52673
52674 if (NotCond) {
52675 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
52676 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
52677 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
52678 return R;
52679 }
52680 }
52681 }
52682 }
52683
52684 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
52685 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
52686 // iff the upper elements of the non-shifted arg are zero.
52687 // KUNPCK requires 16+ bool vector elements.
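 // (Illustrative note, not in the original source: for v32i1, OR(X,
 // KSHIFTL(Y, 16)) with the upper 16 lanes of X known zero produces the same
 // mask as CONCAT_VECTORS(lo16(X), lo16(Y)), i.e. a KUNPCKWD of the two
 // lower halves.)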
52688 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
52689 unsigned NumElts = VT.getVectorNumElements();
52690 unsigned HalfElts = NumElts / 2;
52691 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
52692 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
52693 N1.getConstantOperandAPInt(1) == HalfElts &&
52694 DAG.MaskedVectorIsZero(N0, UpperElts)) {
52695 return DAG.getNode(
52696 ISD::CONCAT_VECTORS, dl, VT,
52697 extractSubVector(N0, 0, DAG, dl, HalfElts),
52698 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
52699 }
52700 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
52701 N0.getConstantOperandAPInt(1) == HalfElts &&
52702 DAG.MaskedVectorIsZero(N1, UpperElts)) {
52703 return DAG.getNode(
52704 ISD::CONCAT_VECTORS, dl, VT,
52705 extractSubVector(N1, 0, DAG, dl, HalfElts),
52706 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
52707 }
52708 }
52709
52710 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52711 // Attempt to recursively combine an OR of shuffles.
52712 SDValue Op(N, 0);
52713 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52714 return Res;
52715
52716 // If either operand is a constant mask, then only the elements that aren't
52717 // allones are actually demanded by the other operand.
52718 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
52719 APInt UndefElts;
52720 SmallVector<APInt> EltBits;
52721 int NumElts = VT.getVectorNumElements();
52722 int EltSizeInBits = VT.getScalarSizeInBits();
52723 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
52724 return false;
52725
52726 APInt DemandedElts = APInt::getZero(NumElts);
52727 for (int I = 0; I != NumElts; ++I)
52728 if (!EltBits[I].isAllOnes())
52729 DemandedElts.setBit(I);
52730
52731 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
52732 };
52733 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
52734 if (N->getOpcode() != ISD::DELETED_NODE)
52735 DCI.AddToWorklist(N);
52736 return SDValue(N, 0);
52737 }
52738 }
52739
52740 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
52741 return R;
52742
52743 return SDValue();
52744}
52745
52746/// Try to turn tests against the signbit in the form of:
52747/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
52748/// into:
52749/// SETGT(X, -1)
52750 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL,
52751 SelectionDAG &DAG) {
52752 // This is only worth doing if the output type is i8 or i1.
52753 EVT ResultType = N->getValueType(0);
52754 if (ResultType != MVT::i8 && ResultType != MVT::i1)
52755 return SDValue();
52756
52757 SDValue N0 = N->getOperand(0);
52758 SDValue N1 = N->getOperand(1);
52759
52760 // We should be performing an xor against a truncated shift.
52761 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
52762 return SDValue();
52763
52764 // Make sure we are performing an xor against one.
52765 if (!isOneConstant(N1))
52766 return SDValue();
52767
52768 // SetCC on x86 zero extends so only act on this if it's a logical shift.
52769 SDValue Shift = N0.getOperand(0);
52770 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
52771 return SDValue();
52772
52773 // Make sure we are truncating from one of i16, i32 or i64.
52774 EVT ShiftTy = Shift.getValueType();
52775 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
52776 return SDValue();
52777
52778 // Make sure the shift amount extracts the sign bit.
52779 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
52780 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
52781 return SDValue();
52782
52783 // Create a greater-than comparison against -1.
52784 // N.B. Using SETGE against 0 works but we want a canonical-looking
52785 // comparison; using SETGT matches up with what TranslateX86CC does.
52786 SDValue ShiftOp = Shift.getOperand(0);
52787 EVT ShiftOpTy = ShiftOp.getValueType();
52788 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52789 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
52790 *DAG.getContext(), ResultType);
52791 SDValue Cond =
52792 DAG.getSetCC(DL, SetCCResultType, ShiftOp,
52793 DAG.getAllOnesConstant(DL, ShiftOpTy), ISD::SETGT);
52794 if (SetCCResultType != ResultType)
52795 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
52796 return Cond;
52797}
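// Illustrative sketch, not part of the original file: for a signed 32-bit X,
// ((unsigned)X >> 31) ^ 1 is 1 exactly when the sign bit is clear, i.e. when
// X > -1, which is the canonical SETGT form produced above.
static inline int signBitTestModel(int X) {
  return X > -1 ? 1 : 0; // equals (((unsigned)X >> 31) ^ 1u)
}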
52798
52799/// Turn vector tests of the signbit in the form of:
52800/// xor (sra X, elt_size(X)-1), -1
52801/// into:
52802/// pcmpgt X, -1
52803///
52804/// This should be called before type legalization because the pattern may not
52805/// persist after that.
52806 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
52807 const X86Subtarget &Subtarget) {
52808 EVT VT = N->getValueType(0);
52809 if (!VT.isSimple())
52810 return SDValue();
52811
52812 switch (VT.getSimpleVT().SimpleTy) {
52813 // clang-format off
52814 default: return SDValue();
52815 case MVT::v16i8:
52816 case MVT::v8i16:
52817 case MVT::v4i32:
52818 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
52819 case MVT::v32i8:
52820 case MVT::v16i16:
52821 case MVT::v8i32:
52822 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
52823 // clang-format on
52824 }
52825
52826 // There must be a shift right algebraic before the xor, and the xor must be a
52827 // 'not' operation.
52828 SDValue Shift = N->getOperand(0);
52829 SDValue Ones = N->getOperand(1);
52830 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
52831 !ISD::isBuildVectorAllOnes(Ones.getNode()))
52832 return SDValue();
52833
52834 // The shift should be smearing the sign bit across each vector element.
52835 auto *ShiftAmt =
52836 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
52837 if (!ShiftAmt ||
52838 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
52839 return SDValue();
52840
52841 // Create a greater-than comparison against -1. We don't use the more obvious
52842 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
52843 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
52844}
52845
52846/// Detect patterns of truncation with unsigned saturation:
52847///
52848/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
52849/// Return the source value x to be truncated or SDValue() if the pattern was
52850/// not matched.
52851///
52852/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
52853/// where C1 >= 0 and C2 is unsigned max of destination type.
52854///
52855/// (truncate (smax (smin (x, C2), C1)) to dest_type)
52856/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
52857///
52858/// These two patterns are equivalent to:
52859/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
52860/// So return the smax(x, C1) value to be truncated or SDValue() if the
52861/// pattern was not matched.
52862 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
52863 const SDLoc &DL) {
52864 using namespace llvm::SDPatternMatch;
52865 EVT InVT = In.getValueType();
52866
52867 // Saturation with truncation. We truncate from InVT to VT.
52868 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
52869 "Unexpected types for truncate operation");
52870
52871 APInt C1, C2;
52872 SDValue UMin, SMin, SMax;
52873
52874 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
52875 // the element size of the destination type.
52876 if (sd_match(In, m_UMin(m_Value(UMin), m_ConstInt(C2))) &&
52877 C2.isMask(VT.getScalarSizeInBits()))
52878 return UMin;
52879
52880 if (sd_match(In, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52881 sd_match(SMin, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52882 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
52883 return SMin;
52884
52885 if (sd_match(In, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52886 sd_match(SMax, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52887 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && C2.uge(C1))
52888 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
52889
52890 return SDValue();
52891}
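// Illustrative sketch, not part of the original file: the unsigned-saturation
// pattern matched above, written as scalar code for a 32-bit to 8-bit truncate
// (hypothetical helper name):
static inline unsigned char usatTruncModel(unsigned X) {
  return (unsigned char)(X < 255u ? X : 255u); // umin(x, 0xFF), then truncate
}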
52892
52893/// Detect patterns of truncation with signed saturation:
52894/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
52895/// signed_max_of_dest_type)) to dest_type)
52896/// or:
52897/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
52898/// signed_min_of_dest_type)) to dest_type).
52899/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
52900/// Return the source value to be truncated or SDValue() if the pattern was not
52901/// matched.
52902static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
52903 using namespace llvm::SDPatternMatch;
52904 unsigned NumDstBits = VT.getScalarSizeInBits();
52905 unsigned NumSrcBits = In.getScalarValueSizeInBits();
52906 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
52907
52908 APInt SignedMax, SignedMin;
52909 if (MatchPackUS) {
52910 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
52911 SignedMin = APInt::getZero(NumSrcBits);
52912 } else {
52913 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
52914 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
52915 }
52916
52917 SDValue SMin, SMax;
52918 if (sd_match(In, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))) &&
52919 sd_match(SMin, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))))
52920 return SMax;
52921
52922 if (sd_match(In, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))) &&
52923 sd_match(SMax, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))))
52924 return SMin;
52925
52926 return SDValue();
52927}
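// Illustrative sketch, not part of the original file: the signed-saturation
// clamp matched above, written as scalar code for a 32-bit to 8-bit truncate
// (hypothetical helper name):
static inline signed char ssatTruncModel(int X) {
  int C = X < 127 ? X : 127;    // smin(x, INT8_MAX)
  C = C > -128 ? C : -128;      // smax(.., INT8_MIN)
  return (signed char)C;
}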
52928
52929 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
52930 SelectionDAG &DAG,
52931 const X86Subtarget &Subtarget) {
52932 if (!Subtarget.hasSSE2() || !VT.isVector())
52933 return SDValue();
52934
52935 EVT SVT = VT.getVectorElementType();
52936 EVT InVT = In.getValueType();
52937 EVT InSVT = InVT.getVectorElementType();
52938
52939 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector
52940 // is split across two registers, we can use a packusdw+perm to clamp to
52941 // 0-65535 and concatenate at the same time. Then we can use a final
52942 // vpmovuswb to clip to 0-255.
52943 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
52944 InVT == MVT::v16i32 && VT == MVT::v16i8) {
52945 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52946 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
52947 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
52948 DL, DAG, Subtarget);
52949 assert(Mid && "Failed to pack!");
52950 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
52951 }
52952 }
52953
52954 // vXi32 truncate instructions are available with AVX512F.
52955 // vXi16 truncate instructions are only available with AVX512BW.
52956 // For 256-bit or smaller vectors, we require VLX.
52957 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
52958 // If the result type is 256 bits or larger and we have disabled 512-bit
52959 // registers, we should go ahead and use the pack instructions if possible.
52960 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
52961 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
52962 (InVT.getSizeInBits() > 128) &&
52963 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
52964 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
52965
52966 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
52967 isPowerOf2_32(VT.getVectorNumElements()) &&
52968 (SVT == MVT::i8 || SVT == MVT::i16) &&
52969 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
52970 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52971 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
52972 if (SVT == MVT::i8 && InSVT == MVT::i32) {
52973 EVT MidVT = VT.changeVectorElementType(MVT::i16);
52974 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
52975 DAG, Subtarget);
52976 assert(Mid && "Failed to pack!");
52977 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
52978 Subtarget);
52979 assert(V && "Failed to pack!");
52980 return V;
52981 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
52982 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
52983 Subtarget);
52984 }
52985 if (SDValue SSatVal = detectSSatPattern(In, VT))
52986 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
52987 Subtarget);
52988 }
52989
52990 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52991 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
52992 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
52993 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
52994 unsigned TruncOpc = 0;
52995 SDValue SatVal;
52996 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
52997 SatVal = SSatVal;
52998 TruncOpc = X86ISD::VTRUNCS;
52999 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
53000 SatVal = USatVal;
53001 TruncOpc = X86ISD::VTRUNCUS;
53002 }
53003 if (SatVal) {
53004 unsigned ResElts = VT.getVectorNumElements();
53005 // If the input type is less than 512 bits and we don't have VLX, we need
53006 // to widen to 512 bits.
53007 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
53008 unsigned NumConcats = 512 / InVT.getSizeInBits();
53009 ResElts *= NumConcats;
53010 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
53011 ConcatOps[0] = SatVal;
53012 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
53013 NumConcats * InVT.getVectorNumElements());
53014 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
53015 }
53016 // Widen the result if it's narrower than 128 bits.
53017 if (ResElts * SVT.getSizeInBits() < 128)
53018 ResElts = 128 / SVT.getSizeInBits();
53019 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
53020 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
53021 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
53022 DAG.getVectorIdxConstant(0, DL));
53023 }
53024 }
53025
53026 return SDValue();
53027}
53028
53029 static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
53030 SelectionDAG &DAG,
53031 TargetLowering::DAGCombinerInfo &DCI,
53032 const X86Subtarget &Subtarget) {
53033 auto *Ld = cast<LoadSDNode>(N);
53034 EVT RegVT = Ld->getValueType(0);
53035 SDValue Ptr = Ld->getBasePtr();
53036 SDValue Chain = Ld->getChain();
53037 ISD::LoadExtType Ext = Ld->getExtensionType();
53038
53039 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
53040 return SDValue();
53041
53042 if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
53043 return SDValue();
53044
53045 const Constant *LdC = getTargetConstantFromNode(Ld);
53046 if (!LdC)
53047 return SDValue();
53048
53049 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
53050 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
53051 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
53052 if (Undefs[I])
53053 continue;
53054 if (UserUndefs[I] || Bits[I] != UserBits[I])
53055 return false;
53056 }
53057 return true;
53058 };
53059
53060 // Look through all other loads/broadcasts in the chain for another constant
53061 // pool entry.
53062 for (SDNode *User : Chain->users()) {
53063 auto *UserLd = dyn_cast<MemSDNode>(User);
53064 if (User != N && UserLd &&
53065 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
53066 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
53067 ISD::isNormalLoad(User)) &&
53068 UserLd->getChain() == Chain && User->hasAnyUseOfValue(0) &&
53069 User->getValueSizeInBits(0).getFixedValue() >
53070 RegVT.getFixedSizeInBits()) {
53071 EVT UserVT = User->getValueType(0);
53072 SDValue UserPtr = UserLd->getBasePtr();
53073 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
53074
53075 // See if we are loading a constant that matches in the lower
53076 // bits of a longer constant (but from a different constant pool ptr).
53077 if (UserC && UserPtr != Ptr) {
53078 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
53079 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
53080 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
53081 APInt Undefs, UserUndefs;
53082 SmallVector<APInt> Bits, UserBits;
53083 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
53084 UserVT.getScalarSizeInBits());
53085 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
53086 Bits) &&
53087 getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
53088 UserUndefs, UserBits)) {
53089 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
53091 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53092 RegVT.getSizeInBits());
53093 Extract = DAG.getBitcast(RegVT, Extract);
53094 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53095 }
53096 }
53097 }
53098 }
53099 }
53100 }
53101
53102 return SDValue();
53103}
53104
53105 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
53106 TargetLowering::DAGCombinerInfo &DCI,
53107 const X86Subtarget &Subtarget) {
53108 auto *Ld = cast<LoadSDNode>(N);
53109 EVT RegVT = Ld->getValueType(0);
53110 EVT MemVT = Ld->getMemoryVT();
53111 SDLoc dl(Ld);
53112 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53113
53114 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
53115 // into two 16-byte operations. Also split non-temporal aligned loads on
53116 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
53117 ISD::LoadExtType Ext = Ld->getExtensionType();
53118 unsigned Fast;
53119 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
53120 Ext == ISD::NON_EXTLOAD &&
53121 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
53122 Ld->getAlign() >= Align(16)) ||
53123 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
53124 *Ld->getMemOperand(), &Fast) &&
53125 !Fast))) {
53126 unsigned NumElems = RegVT.getVectorNumElements();
53127 if (NumElems < 2)
53128 return SDValue();
53129
53130 unsigned HalfOffset = 16;
53131 SDValue Ptr1 = Ld->getBasePtr();
53132 SDValue Ptr2 =
53133 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
53134 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
53135 NumElems / 2);
53136 SDValue Load1 =
53137 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
53138 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53139 SDValue Load2 =
53140 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
53141 Ld->getPointerInfo().getWithOffset(HalfOffset),
53142 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53143 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
53144 Load1.getValue(1), Load2.getValue(1));
53145
53146 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
53147 return DCI.CombineTo(N, NewVec, TF, true);
53148 }
53149
53150 // Bool vector load - attempt to cast to an integer, as we have good
53151 // (vXiY *ext(vXi1 bitcast(iX))) handling.
53152 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
53153 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
53154 unsigned NumElts = RegVT.getVectorNumElements();
53155 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
53156 if (TLI.isTypeLegal(IntVT)) {
53157 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
53158 Ld->getPointerInfo(), Ld->getBaseAlign(),
53159 Ld->getMemOperand()->getFlags());
53160 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
53161 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
53162 }
53163 }
53164
53165 // If we also broadcast this vector to a wider type, then just extract the
53166 // lowest subvector.
53167 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
53168 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
53169 SDValue Ptr = Ld->getBasePtr();
53170 SDValue Chain = Ld->getChain();
53171 for (SDNode *User : Chain->users()) {
53172 auto *UserLd = dyn_cast<MemSDNode>(User);
53173 if (User != N && UserLd &&
53174 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
53175 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
53176 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
53177 User->hasAnyUseOfValue(0) &&
53178 User->getValueSizeInBits(0).getFixedValue() >
53179 RegVT.getFixedSizeInBits()) {
53181 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53182 RegVT.getSizeInBits());
53183 Extract = DAG.getBitcast(RegVT, Extract);
53184 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53185 }
53186 }
53187 }
53188
53189 if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
53190 return V;
53191
53192 // Cast ptr32 and ptr64 pointers to the default address space before a load.
53193 unsigned AddrSpace = Ld->getAddressSpace();
53194 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53195 AddrSpace == X86AS::PTR32_UPTR) {
53196 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53197 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
53198 SDValue Cast =
53199 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
53200 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
53201 Ld->getPointerInfo(), MemVT, Ld->getBaseAlign(),
53202 Ld->getMemOperand()->getFlags());
53203 }
53204 }
53205
53206 return SDValue();
53207}
53208
53209/// If V is a build vector of boolean constants and exactly one of those
53210/// constants is true, return the operand index of that true element.
53211/// Otherwise, return -1.
53212static int getOneTrueElt(SDValue V) {
53213 // This needs to be a build vector of booleans.
53214 // TODO: Checking for the i1 type matches the IR definition for the mask,
53215 // but the mask check could be loosened to i8 or other types. That might
53216 // also require checking more than 'allOnesValue'; eg, the x86 HW
53217 // instructions only require that the MSB is set for each mask element.
53218 // The ISD::MSTORE comments/definition do not specify how the mask operand
53219 // is formatted.
53220 auto *BV = dyn_cast<BuildVectorSDNode>(V);
53221 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
53222 return -1;
53223
53224 int TrueIndex = -1;
53225 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
53226 for (unsigned i = 0; i < NumElts; ++i) {
53227 const SDValue &Op = BV->getOperand(i);
53228 if (Op.isUndef())
53229 continue;
53230 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
53231 if (!ConstNode)
53232 return -1;
53233 if (ConstNode->getAPIntValue().countr_one() >= 1) {
53234 // If we already found a one, this is too many.
53235 if (TrueIndex >= 0)
53236 return -1;
53237 TrueIndex = i;
53238 }
53239 }
53240 return TrueIndex;
53241}
53242
53243/// Given a masked memory load/store operation, return true if it has one mask
53244/// bit set. If it has one mask bit set, then also return the memory address of
53245/// the scalar element to load/store, the vector index to insert/extract that
53246/// scalar element, and the alignment for the scalar memory access.
53247 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
53248 SelectionDAG &DAG, SDValue &Addr,
53249 SDValue &Index, Align &Alignment,
53250 unsigned &Offset) {
53251 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
53252 if (TrueMaskElt < 0)
53253 return false;
53254
53255 // Get the address of the one scalar element that is specified by the mask
53256 // using the appropriate offset from the base pointer.
53257 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
53258 Offset = 0;
53259 Addr = MaskedOp->getBasePtr();
53260 if (TrueMaskElt != 0) {
53261 Offset = TrueMaskElt * EltVT.getStoreSize();
53262 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
53263 SDLoc(MaskedOp));
53264 }
53265
53266 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
53267 Alignment = commonAlignment(MaskedOp->getBaseAlign(), EltVT.getStoreSize());
53268 return true;
53269}
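// Illustrative sketch, not part of the original file: with a one-hot mask the
// scalar element's address is just the base pointer plus the true element's
// index times the element store size, which is what the helper above computes.
static inline const char *oneTrueEltAddrModel(const char *Base,
                                              unsigned TrueIndex,
                                              unsigned EltStoreSize) {
  return Base + (unsigned long long)TrueIndex * EltStoreSize;
}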
53270
53271/// If exactly one element of the mask is set for a non-extending masked load,
53272/// it is a scalar load and vector insert.
53273/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53274/// mask have already been optimized in IR, so we don't bother with those here.
53275 static SDValue
53276 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53277 TargetLowering::DAGCombinerInfo &DCI,
53278 const X86Subtarget &Subtarget) {
53279 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53280 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53281 // However, some target hooks may need to be added to know when the transform
53282 // is profitable. Endianness would also have to be considered.
53283
53284 SDValue Addr, VecIndex;
53285 Align Alignment;
53286 unsigned Offset;
53287 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
53288 return SDValue();
53289
53290 // Load the one scalar element that is specified by the mask using the
53291 // appropriate offset from the base pointer.
53292 SDLoc DL(ML);
53293 EVT VT = ML->getValueType(0);
53294 EVT EltVT = VT.getVectorElementType();
53295
53296 EVT CastVT = VT;
53297 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53298 EltVT = MVT::f64;
53299 CastVT = VT.changeVectorElementType(EltVT);
53300 }
53301
53302 SDValue Load =
53303 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
53304 ML->getPointerInfo().getWithOffset(Offset),
53305 Alignment, ML->getMemOperand()->getFlags());
53306
53307 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
53308
53309 // Insert the loaded element into the appropriate place in the vector.
53310 SDValue Insert =
53311 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
53312 Insert = DAG.getBitcast(VT, Insert);
53313 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
53314}
53315
53316 static SDValue
53317 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53318 TargetLowering::DAGCombinerInfo &DCI) {
53319 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53320 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
53321 return SDValue();
53322
53323 SDLoc DL(ML);
53324 EVT VT = ML->getValueType(0);
53325
53326 // If we are loading the first and last elements of a vector, it is safe and
53327 // always faster to load the whole vector. Replace the masked load with a
53328 // vector load and select.
53329 unsigned NumElts = VT.getVectorNumElements();
53330 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
53331 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
53332 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
53333 if (LoadFirstElt && LoadLastElt) {
53334 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
53335 ML->getMemOperand());
53336 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
53337 ML->getPassThru());
53338 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
53339 }
53340
53341 // Convert a masked load with a constant mask into a masked load and a select.
53342 // This allows the select operation to use a faster kind of select instruction
53343 // (for example, vblendvps -> vblendps).
53344
53345 // Don't try this if the pass-through operand is already undefined. That would
53346 // cause an infinite loop because that's what we're about to create.
53347 if (ML->getPassThru().isUndef())
53348 return SDValue();
53349
53350 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
53351 return SDValue();
53352
53353 // The new masked load has an undef pass-through operand. The select uses the
53354 // original pass-through operand.
53355 SDValue NewML = DAG.getMaskedLoad(
53356 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
53357 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
53358 ML->getAddressingMode(), ML->getExtensionType());
53359 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
53360 ML->getPassThru());
53361
53362 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
53363}
53364
53365 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
53366 TargetLowering::DAGCombinerInfo &DCI,
53367 const X86Subtarget &Subtarget) {
53368 auto *Mld = cast<MaskedLoadSDNode>(N);
53369
53370 // TODO: Expanding load with constant mask may be optimized as well.
53371 if (Mld->isExpandingLoad())
53372 return SDValue();
53373
53374 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
53375 if (SDValue ScalarLoad =
53376 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
53377 return ScalarLoad;
53378
53379 // TODO: Do some AVX512 subsets benefit from this transform?
53380 if (!Subtarget.hasAVX512())
53381 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
53382 return Blend;
53383 }
53384
53385 // If the mask value has been legalized to a non-boolean vector, try to
53386 // simplify ops leading up to it. We only demand the MSB of each lane.
53387 SDValue Mask = Mld->getMask();
53388 if (Mask.getScalarValueSizeInBits() != 1) {
53389 EVT VT = Mld->getValueType(0);
53390 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53391 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53392 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53393 if (N->getOpcode() != ISD::DELETED_NODE)
53394 DCI.AddToWorklist(N);
53395 return SDValue(N, 0);
53396 }
53397 if (SDValue NewMask =
53398 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53399 return DAG.getMaskedLoad(
53400 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
53401 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
53402 Mld->getAddressingMode(), Mld->getExtensionType());
53403 }
53404
53405 return SDValue();
53406}
53407
53408/// If exactly one element of the mask is set for a non-truncating masked store,
53409/// it is a vector extract and scalar store.
53410/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53411/// mask have already been optimized in IR, so we don't bother with those here.
53412 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
53413 SelectionDAG &DAG,
53414 const X86Subtarget &Subtarget) {
53415 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53416 // However, some target hooks may need to be added to know when the transform
53417 // is profitable. Endianness would also have to be considered.
53418
53419 SDValue Addr, VecIndex;
53420 Align Alignment;
53421 unsigned Offset;
53422 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
53423 return SDValue();
53424
53425 // Extract the one scalar element that is actually being stored.
53426 SDLoc DL(MS);
53427 SDValue Value = MS->getValue();
53428 EVT VT = Value.getValueType();
53429 EVT EltVT = VT.getVectorElementType();
53430 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53431 EltVT = MVT::f64;
53432 EVT CastVT = VT.changeVectorElementType(EltVT);
53433 Value = DAG.getBitcast(CastVT, Value);
53434 }
53435 SDValue Extract =
53436 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
53437
53438 // Store that element at the appropriate offset from the base pointer.
53439 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
53440 MS->getPointerInfo().getWithOffset(Offset),
53441 Alignment, MS->getMemOperand()->getFlags());
53442}
53443
53444 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
53445 TargetLowering::DAGCombinerInfo &DCI,
53446 const X86Subtarget &Subtarget) {
53447 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
53448 if (Mst->isCompressingStore())
53449 return SDValue();
53450
53451 EVT VT = Mst->getValue().getValueType();
53452 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53453
53454 if (Mst->isTruncatingStore())
53455 return SDValue();
53456
53457 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
53458 return ScalarStore;
53459
53460 // If the mask value has been legalized to a non-boolean vector, try to
53461 // simplify ops leading up to it. We only demand the MSB of each lane.
53462 SDValue Mask = Mst->getMask();
53463 if (Mask.getScalarValueSizeInBits() != 1) {
53464 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53465 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53466 if (N->getOpcode() != ISD::DELETED_NODE)
53467 DCI.AddToWorklist(N);
53468 return SDValue(N, 0);
53469 }
53470 if (SDValue NewMask =
53471 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53472 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
53473 Mst->getBasePtr(), Mst->getOffset(), NewMask,
53474 Mst->getMemoryVT(), Mst->getMemOperand(),
53475 Mst->getAddressingMode());
53476 }
53477
53478 SDValue Value = Mst->getValue();
53479 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
53480 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
53481 Mst->getMemoryVT())) {
53482 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
53483 Mst->getBasePtr(), Mst->getOffset(), Mask,
53484 Mst->getMemoryVT(), Mst->getMemOperand(),
53485 Mst->getAddressingMode(), true);
53486 }
53487
53488 return SDValue();
53489}
53490
53491 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
53492 TargetLowering::DAGCombinerInfo &DCI,
53493 const X86Subtarget &Subtarget) {
53494 StoreSDNode *St = cast<StoreSDNode>(N);
53495 EVT StVT = St->getMemoryVT();
53496 SDLoc dl(St);
53497 SDValue StoredVal = St->getValue();
53498 EVT VT = StoredVal.getValueType();
53499 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53500
53501 // Convert a store of vXi1 into a store of iX and a bitcast.
53502 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
53503 VT.getVectorElementType() == MVT::i1) {
53504
53505 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
53506 StoredVal = DAG.getBitcast(NewVT, StoredVal);
53507
53508 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53509 St->getPointerInfo(), St->getBaseAlign(),
53510 St->getMemOperand()->getFlags());
53511 }
53512
53513 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
53514 // This will avoid a copy to k-register.
53515 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
53516 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
53517 StoredVal.getOperand(0).getValueType() == MVT::i8) {
53518 SDValue Val = StoredVal.getOperand(0);
53519 // We must store zeros to the unused bits.
53520 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
53521 return DAG.getStore(St->getChain(), dl, Val, St->getBasePtr(),
53522 St->getPointerInfo(), St->getBaseAlign(),
53523 St->getMemOperand()->getFlags());
53524 }
53525
53526 // Widen v2i1/v4i1 stores to v8i1.
53527 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
53528 Subtarget.hasAVX512()) {
53529 unsigned NumConcats = 8 / VT.getVectorNumElements();
53530 // We must store zeros to the unused bits.
53531 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
53532 Ops[0] = StoredVal;
53533 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
53534 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53535 St->getPointerInfo(), St->getBaseAlign(),
53536 St->getMemOperand()->getFlags());
53537 }
53538
53539 // Turn vXi1 stores of constants into a scalar store.
53540 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
53541 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
53542 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
53543 // If it's a v64i1 store without 64-bit support, we need two stores.
53544 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
53545 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
53546 StoredVal->ops().slice(0, 32));
53547 Lo = combinevXi1ConstantToInteger(Lo, DAG);
53548 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
53549 StoredVal->ops().slice(32, 32));
53550 Hi = combinevXi1ConstantToInteger(Hi, DAG);
53551
53552 SDValue Ptr0 = St->getBasePtr();
53553 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
53554
53555 SDValue Ch0 =
53556 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
53557 St->getBaseAlign(), St->getMemOperand()->getFlags());
53558 SDValue Ch1 = DAG.getStore(
53559 St->getChain(), dl, Hi, Ptr1, St->getPointerInfo().getWithOffset(4),
53560 St->getBaseAlign(), St->getMemOperand()->getFlags());
53561 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
53562 }
53563
53564 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
53565 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53566 St->getPointerInfo(), St->getBaseAlign(),
53567 St->getMemOperand()->getFlags());
53568 }
53569
53570 // Convert scalar fabs/fneg load-store to integer equivalents.
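 // (Illustrative note, not in the original source: for f32 the sign mask is
 // 0x80000000, so FNEG of the loaded value becomes XOR with 0x80000000 and
 // FABS becomes AND with 0x7FFFFFFF on the integer-typed load before storing.)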
53571 if ((VT == MVT::f16 || VT == MVT::bf16 || VT == MVT::f32 || VT == MVT::f64) &&
53572 (StoredVal.getOpcode() == ISD::FABS ||
53573 StoredVal.getOpcode() == ISD::FNEG) &&
53574 ISD::isNormalLoad(StoredVal.getOperand(0).getNode()) &&
53575 StoredVal.hasOneUse() && StoredVal.getOperand(0).hasOneUse()) {
53576 MVT IntVT = VT.getSimpleVT().changeTypeToInteger();
53577 if (TLI.isTypeLegal(IntVT)) {
53578 APInt SignMask = APInt::getSignMask(VT.getScalarSizeInBits());
53579 unsigned SignOp = ISD::XOR;
53580 if (StoredVal.getOpcode() == ISD::FABS) {
53581 SignMask = ~SignMask;
53582 SignOp = ISD::AND;
53583 }
53584 SDValue LogicOp = DAG.getNode(
53585 SignOp, dl, IntVT, DAG.getBitcast(IntVT, StoredVal.getOperand(0)),
53586 DAG.getConstant(SignMask, dl, IntVT));
53587 return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
53588 St->getPointerInfo(), St->getBaseAlign(),
53589 St->getMemOperand()->getFlags());
53590 }
53591 }
53592
53593 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
53594 // Sandy Bridge, perform two 16-byte stores.
53595 unsigned Fast;
53596 if (VT.is256BitVector() && StVT == VT &&
53597 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
53598 *St->getMemOperand(), &Fast) &&
53599 !Fast) {
53600 unsigned NumElems = VT.getVectorNumElements();
53601 if (NumElems < 2)
53602 return SDValue();
53603
53604 return splitVectorStore(St, DAG);
53605 }
53606
53607 // Split under-aligned vector non-temporal stores.
53608 if (St->isNonTemporal() && StVT == VT &&
53609 St->getAlign().value() < VT.getStoreSize()) {
53610 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
53611 // vectors or the legalizer can scalarize it to use MOVNTI.
53612 if (VT.is256BitVector() || VT.is512BitVector()) {
53613 unsigned NumElems = VT.getVectorNumElements();
53614 if (NumElems < 2)
53615 return SDValue();
53616 return splitVectorStore(St, DAG);
53617 }
53618
53619 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
53620 // to use MOVNTI.
53621 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
53622 MVT NTVT = Subtarget.hasSSE4A()
53623 ? MVT::v2f64
53624 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
53625 return scalarizeVectorStore(St, NTVT, DAG);
53626 }
53627 }
53628
53629 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
53630 // supported but AVX512F is, by extending to v16i32 and truncating.
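// Illustrative DAG: store (v16i8 (trunc v16i16 X)) becomes
// truncstore<v16i8> (v16i32 (any_extend X)), which AVX512F can emit as a
// single VPMOVDB to memory.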
53631 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
53632 St->getValue().getOpcode() == ISD::TRUNCATE &&
53633 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
53634 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
53635 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
53636 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
53637 St->getValue().getOperand(0));
53638 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
53639 MVT::v16i8, St->getMemOperand());
53640 }
53641
53642 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
53643 if (!St->isTruncatingStore() &&
53644 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
53645 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
53646 StoredVal.hasOneUse() &&
53647 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
53648 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
53649 return EmitTruncSStore(IsSigned, St->getChain(),
53650 dl, StoredVal.getOperand(0), St->getBasePtr(),
53651 VT, St->getMemOperand(), DAG);
53652 }
53653
53654 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
53655 if (!St->isTruncatingStore()) {
53656 auto IsExtractedElement = [](SDValue V) {
53657 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
53658 V = V.getOperand(0);
53659 unsigned Opc = V.getOpcode();
53660 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
53661 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
53662 V.getOperand(0).hasOneUse())
53663 return V.getOperand(0);
53664 return SDValue();
53665 };
53666 if (SDValue Extract = IsExtractedElement(StoredVal)) {
53667 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
53668 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
53669 SDValue Src = Trunc.getOperand(0);
53670 MVT DstVT = Trunc.getSimpleValueType();
53671 MVT SrcVT = Src.getSimpleValueType();
53672 unsigned NumSrcElts = SrcVT.getVectorNumElements();
53673 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
53674 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
53675 if (NumTruncBits == VT.getSizeInBits() &&
53676 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
53677 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
53678 TruncVT, St->getMemOperand());
53679 }
53680 }
53681 }
53682 }
53683
53684 // Optimize trunc store (of multiple scalars) to shuffle and store.
53685 // First, pack all of the elements in one place. Next, store to memory
53686 // in fewer chunks.
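// For example, a truncating v8i32->v8i16 store whose value is already clamped
// to [-32768, 32767] matches detectSSatPattern below and can be emitted as a
// single saturating truncate-store (e.g. VPMOVSDW on AVX512VL).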
53687 if (St->isTruncatingStore() && VT.isVector()) {
53688 if (TLI.isTruncStoreLegal(VT, StVT)) {
53689 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
53690 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
53691 dl, Val, St->getBasePtr(),
53692 St->getMemoryVT(), St->getMemOperand(), DAG);
53693 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
53694 DAG, dl))
53695 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
53696 dl, Val, St->getBasePtr(),
53697 St->getMemoryVT(), St->getMemOperand(), DAG);
53698 }
53699
53700 return SDValue();
53701 }
53702
53703 // Cast ptr32 and ptr64 pointers to the default address space before a store.
53704 unsigned AddrSpace = St->getAddressSpace();
53705 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53706 AddrSpace == X86AS::PTR32_UPTR) {
53707 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53708 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
53709 SDValue Cast =
53710 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
53711 return DAG.getTruncStore(
53712 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
53713 St->getBaseAlign(), St->getMemOperand()->getFlags(), St->getAAInfo());
53714 }
53715 }
53716
53717 // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
53718 // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
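// With the APX CF (conditionally faulting CFCMOV) feature, the
// load/cmov/store round trip collapses into a single predicated store: the
// value is written only when the condition holds, otherwise memory is left
// untouched.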
53719 if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
53720 Subtarget.hasCF() && St->isSimple()) {
53721 SDValue Cmov;
53722 if (StoredVal.getOpcode() == X86ISD::CMOV)
53723 Cmov = StoredVal;
53724 else if (StoredVal.getOpcode() == ISD::TRUNCATE &&
53725 StoredVal.getOperand(0).getOpcode() == X86ISD::CMOV)
53726 Cmov = StoredVal.getOperand(0);
53727 else
53728 return SDValue();
53729
53730 auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
53731 if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
53732 return SDValue();
53733
53734 bool InvertCC = false;
53735 SDValue V = SDValue(Ld, 0);
53736 if (V == Cmov.getOperand(1))
53737 InvertCC = true;
53738 else if (V != Cmov.getOperand(0))
53739 return SDValue();
53740
53741 SDVTList Tys = DAG.getVTList(MVT::Other);
53742 SDValue CC = Cmov.getOperand(2);
53743 SDValue Src = DAG.getAnyExtOrTrunc(Cmov.getOperand(!InvertCC), dl, VT);
53744 if (InvertCC)
53745 CC = DAG.getTargetConstant(
53746 X86::GetOppositeBranchCondition(
53747 (X86::CondCode)Cmov.getConstantOperandVal(2)),
53748 dl, MVT::i8);
53749 SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
53750 Cmov.getOperand(3)};
53751 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, dl, Tys, Ops, VT,
53752 St->getMemOperand());
53753 }
53754
53755 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
53756 // the FP state in cases where an emms may be missing.
53757 // A preferable solution to the general problem is to figure out the right
53758 // places to insert EMMS. This qualifies as a quick hack.
53759
53760 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
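// Roughly: an i64 copy through memory on a 32-bit target would otherwise need
// two 32-bit loads and two 32-bit stores; with SSE2 it can be a single MOVQ
// load plus MOVQ store through an f64 value.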
53761 if (VT.getSizeInBits() != 64)
53762 return SDValue();
53763
53764 const Function &F = DAG.getMachineFunction().getFunction();
53765 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
53766 bool F64IsLegal =
53767 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
53768
53769 if (!F64IsLegal || Subtarget.is64Bit())
53770 return SDValue();
53771
53772 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
53773 cast<LoadSDNode>(St->getValue())->isSimple() &&
53774 St->getChain().hasOneUse() && St->isSimple()) {
53775 auto *Ld = cast<LoadSDNode>(St->getValue());
53776
53777 if (!ISD::isNormalLoad(Ld))
53778 return SDValue();
53779
53780 // Avoid the transformation if there are multiple uses of the loaded value.
53781 if (!Ld->hasNUsesOfValue(1, 0))
53782 return SDValue();
53783
53784 SDLoc LdDL(Ld);
53785 SDLoc StDL(N);
53786
53787 // Remove any range metadata as we're converting to f64 load/store.
53788 Ld->getMemOperand()->clearRanges();
53789
53790 // Lower to a single movq load/store pair.
53791 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
53792 Ld->getBasePtr(), Ld->getMemOperand());
53793
53794 // Make sure new load is placed in same chain order.
53795 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
53796 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
53797 St->getMemOperand());
53798 }
53799
53800 // This is similar to the above case, but here we handle a scalar 64-bit
53801 // integer store that is extracted from a vector on a 32-bit target.
53802 // If we have SSE2, then we can treat it like a floating-point double
53803 // to get past legalization. The execution dependencies fixup pass will
53804 // choose the optimal machine instruction for the store if this really is
53805 // an integer or v2f32 rather than an f64.
53806 if (VT == MVT::i64 &&
53807 St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
53808 SDValue OldExtract = St->getOperand(1);
53809 SDValue ExtOp0 = OldExtract.getOperand(0);
53810 unsigned VecSize = ExtOp0.getValueSizeInBits();
53811 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
53812 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
53813 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
53814 BitCast, OldExtract.getOperand(1));
53815 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
53816 St->getPointerInfo(), St->getBaseAlign(),
53817 St->getMemOperand()->getFlags());
53818 }
53819
53820 return SDValue();
53821}
53822
53823 static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
53824 TargetLowering::DAGCombinerInfo &DCI,
53825 const X86Subtarget &Subtarget) {
53826 auto *St = cast<MemIntrinsicSDNode>(N);
53827
53828 SDValue StoredVal = N->getOperand(1);
53829 MVT VT = StoredVal.getSimpleValueType();
53830 EVT MemVT = St->getMemoryVT();
53831
53832 // Figure out which elements we demand.
53833 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
53834 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
53835
53836 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53837 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
53838 if (N->getOpcode() != ISD::DELETED_NODE)
53839 DCI.AddToWorklist(N);
53840 return SDValue(N, 0);
53841 }
53842
53843 return SDValue();
53844}
53845
53846/// Return 'true' if this vector operation is "horizontal"
53847/// and return the operands for the horizontal operation in LHS and RHS. A
53848/// horizontal operation performs the binary operation on successive elements
53849/// of its first operand, then on successive elements of its second operand,
53850/// returning the resulting values in a vector. For example, if
53851/// A = < float a0, float a1, float a2, float a3 >
53852/// and
53853/// B = < float b0, float b1, float b2, float b3 >
53854/// then the result of doing a horizontal operation on A and B is
53855/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
53856/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
53857/// A horizontal-op B, for some already available A and B, and if so then LHS is
53858/// set to A, RHS to B, and the routine returns 'true'.
53859static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
53860 SelectionDAG &DAG, const X86Subtarget &Subtarget,
53861 bool IsCommutative,
53862 SmallVectorImpl<int> &PostShuffleMask,
53863 bool ForceHorizOp) {
53864 // If either operand is undef, bail out. The binop should be simplified.
53865 if (LHS.isUndef() || RHS.isUndef())
53866 return false;
53867
53868 // Look for the following pattern:
53869 // A = < float a0, float a1, float a2, float a3 >
53870 // B = < float b0, float b1, float b2, float b3 >
53871 // and
53872 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
53873 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
53874 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
53875 // which is A horizontal-op B.
53876
53877 MVT VT = LHS.getSimpleValueType();
53878 assert((VT.is128BitVector() || VT.is256BitVector()) &&
53879 "Unsupported vector type for horizontal add/sub");
53880 unsigned NumElts = VT.getVectorNumElements();
53881
53882 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
53883 SmallVectorImpl<int> &ShuffleMask) {
53884 bool UseSubVector = false;
53885 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53886 Op.getOperand(0).getValueType().is256BitVector() &&
53887 llvm::isNullConstant(Op.getOperand(1))) {
53888 Op = Op.getOperand(0);
53889 UseSubVector = true;
53890 }
53891 SmallVector<SDValue, 2> SrcOps;
53892 SmallVector<int, 16> SrcMask, ScaledMask;
53893 SDValue BC = peekThroughBitcasts(Op);
53894 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
53895 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
53896 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
53897 })) {
53898 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
53899 if (!UseSubVector && SrcOps.size() <= 2 &&
53900 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
53901 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
53902 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
53903 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
53904 }
53905 if (UseSubVector && SrcOps.size() == 1 &&
53906 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
53907 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
53908 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
53909 ShuffleMask.assign(Mask.begin(), Mask.end());
53910 }
53911 }
53912 };
53913
53914 // View LHS in the form
53915 // LHS = VECTOR_SHUFFLE A, B, LMask
53916 // If LHS is not a shuffle, then pretend it is the identity shuffle:
53917 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
53918 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
53919 SDValue A, B;
53920 SmallVector<int, 16> LMask;
53921 GetShuffle(LHS, A, B, LMask);
53922
53923 // Likewise, view RHS in the form
53924 // RHS = VECTOR_SHUFFLE C, D, RMask
53925 SDValue C, D;
53926 SmallVector<int, 16> RMask;
53927 GetShuffle(RHS, C, D, RMask);
53928
53929 // At least one of the operands should be a vector shuffle.
53930 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
53931 if (NumShuffles == 0)
53932 return false;
53933
53934 if (LMask.empty()) {
53935 A = LHS;
53936 for (unsigned i = 0; i != NumElts; ++i)
53937 LMask.push_back(i);
53938 }
53939
53940 if (RMask.empty()) {
53941 C = RHS;
53942 for (unsigned i = 0; i != NumElts; ++i)
53943 RMask.push_back(i);
53944 }
53945
53946 // If we have a unary mask, ensure the other op is set to null.
53947 if (isUndefOrInRange(LMask, 0, NumElts))
53948 B = SDValue();
53949 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
53950 A = SDValue();
53951
53952 if (isUndefOrInRange(RMask, 0, NumElts))
53953 D = SDValue();
53954 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
53955 C = SDValue();
53956
53957 // If A and B occur in reverse order in RHS, then canonicalize by commuting
53958 // RHS operands and shuffle mask.
53959 if (A != C) {
53960 std::swap(C, D);
53961 ShuffleVectorSDNode::commuteShuffleMask(RMask, NumElts);
53962 }
53963 // Check that the shuffles are both shuffling the same vectors.
53964 if (!(A == C && B == D))
53965 return false;
53966
53967 PostShuffleMask.clear();
53968 PostShuffleMask.append(NumElts, SM_SentinelUndef);
53969
53970 // LHS and RHS are now:
53971 // LHS = shuffle A, B, LMask
53972 // RHS = shuffle A, B, RMask
53973 // Check that the masks correspond to performing a horizontal operation.
53974 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
53975 // so we just repeat the inner loop if this is a 256-bit op.
53976 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
53977 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
53978 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
53979 assert((NumEltsPer128BitChunk % 2 == 0) &&
53980 "Vector type should have an even number of elements in each lane");
53981 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
53982 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
53983 // Ignore undefined components.
53984 int LIdx = LMask[i + j], RIdx = RMask[i + j];
53985 if (LIdx < 0 || RIdx < 0 ||
53986 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
53987 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
53988 continue;
53989
53990 // Check that successive odd/even elements are being operated on. If not,
53991 // this is not a horizontal operation.
53992 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
53993 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
53994 return false;
53995
53996 // Compute the post-shuffle mask index based on where the element
53997 // is stored in the HOP result, and where it needs to be moved to.
53998 int Base = LIdx & ~1u;
53999 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
54000 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
54001
54002 // The low half of the 128-bit result must choose from A.
54003 // The high half of the 128-bit result must choose from B,
54004 // unless B is undef. In that case, we are always choosing from A.
54005 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
54006 Index += NumEltsPer64BitChunk;
54007 PostShuffleMask[i + j] = Index;
54008 }
54009 }
54010
54011 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
54012 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
54013
54014 bool IsIdentityPostShuffle =
54015 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
54016 if (IsIdentityPostShuffle)
54017 PostShuffleMask.clear();
54018
54019 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
54020 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
54021 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
54022 return false;
54023
54024 // If the source nodes are already used in HorizOps then always accept this.
54025 // Shuffle folding should merge these back together.
54026 auto FoundHorizUser = [&](SDNode *User) {
54027 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
54028 };
54029 ForceHorizOp =
54030 ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) &&
54031 llvm::any_of(NewRHS->users(), FoundHorizUser));
54032
54033 // Assume a SingleSource HOP if we only shuffle one input and don't need to
54034 // shuffle the result.
54035 if (!ForceHorizOp &&
54036 !shouldUseHorizontalOp(NewLHS == NewRHS &&
54037 (NumShuffles < 2 || !IsIdentityPostShuffle),
54038 DAG, Subtarget))
54039 return false;
54040
54041 LHS = DAG.getBitcast(VT, NewLHS);
54042 RHS = DAG.getBitcast(VT, NewRHS);
54043 return true;
54044}
54045
54046// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
54047 static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
54048 const X86Subtarget &Subtarget) {
54049 EVT VT = N->getValueType(0);
54050 unsigned Opcode = N->getOpcode();
54051 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
54052 SmallVector<int, 8> PostShuffleMask;
54053
54054 auto MergableHorizOp = [N](unsigned HorizOpcode) {
54055 return N->hasOneUse() &&
54056 N->user_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
54057 (N->user_begin()->getOperand(0).getOpcode() == HorizOpcode ||
54058 N->user_begin()->getOperand(1).getOpcode() == HorizOpcode);
54059 };
54060
54061 switch (Opcode) {
54062 case ISD::FADD:
54063 case ISD::FSUB:
54064 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
54065 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
54066 SDValue LHS = N->getOperand(0);
54067 SDValue RHS = N->getOperand(1);
54068 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
54069 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54070 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54071 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
54072 if (!PostShuffleMask.empty())
54073 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54074 DAG.getUNDEF(VT), PostShuffleMask);
54075 return HorizBinOp;
54076 }
54077 }
54078 break;
54079 case ISD::ADD:
54080 case ISD::SUB:
54081 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
54082 VT == MVT::v16i16 || VT == MVT::v8i32)) {
54083 SDValue LHS = N->getOperand(0);
54084 SDValue RHS = N->getOperand(1);
54085 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
54086 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54087 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54088 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
54089 ArrayRef<SDValue> Ops) {
54090 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
54091 };
54092 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
54093 {LHS, RHS}, HOpBuilder);
54094 if (!PostShuffleMask.empty())
54095 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54096 DAG.getUNDEF(VT), PostShuffleMask);
54097 return HorizBinOp;
54098 }
54099 }
54100 break;
54101 }
54102
54103 return SDValue();
54104}
54105
54106// Try to combine the following nodes
54107// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
54108// <i32 -2147483648[float -0.000000e+00]> 0
54109// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
54110// <(load 4 from constant-pool)> t0, t29
54111// [t30: v16i32 = bitcast t27]
54112// t6: v16i32 = xor t7, t27[t30]
54113// t11: v16f32 = bitcast t6
54114// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
54115// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
54116// t22: v16f32 = bitcast t7
54117// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
54118// t24: v32f16 = bitcast t23
54119 static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
54120 const X86Subtarget &Subtarget) {
54121 EVT VT = N->getValueType(0);
54122 SDValue LHS = N->getOperand(0);
54123 SDValue RHS = N->getOperand(1);
54124 int CombineOpcode =
54125 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
54126 auto combineConjugation = [&](SDValue &r) {
54127 if (LHS->getOpcode() == ISD::BITCAST) {
54128 SDValue XOR = LHS.getOperand(0);
54129 if (XOR->getOpcode() == ISD::XOR) {
54130 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
54131 if (XORRHS.isConstant()) {
54132 APInt ConjugationInt32 = APInt(32, 0x80000000);
54133 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL);
54134 if ((XORRHS.getBitWidth() == 32 &&
54135 XORRHS.getConstant() == ConjugationInt32) ||
54136 (XORRHS.getBitWidth() == 64 &&
54137 XORRHS.getConstant() == ConjugationInt64)) {
54138 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
54139 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
54140 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
54141 r = DAG.getBitcast(VT, FCMulC);
54142 return true;
54143 }
54144 }
54145 }
54146 }
54147 return false;
54148 };
54149 SDValue Res;
54150 if (combineConjugation(Res))
54151 return Res;
54152 std::swap(LHS, RHS);
54153 if (combineConjugation(Res))
54154 return Res;
54155 return Res;
54156}
54157
54158// Try to combine the following nodes:
54159// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
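// Note: the VF(C)MULC/VF(C)MADDC nodes below operate on packed complex FP16
// values, with each f32 lane holding a (real, imag) pair, which is why the
// FADD operand is bitcast to a vNf32 type before being used as the addend.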
54160 static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
54161 const X86Subtarget &Subtarget) {
54162 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
54163 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
54164 Flags.hasAllowContract();
54165 };
54166
54167 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
54168 return DAG.getTarget().Options.NoSignedZerosFPMath ||
54169 Flags.hasNoSignedZeros();
54170 };
54171 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
54172 APInt AI = APInt(32, 0x80008000);
54173 KnownBits Bits = DAG.computeKnownBits(Op);
54174 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
54175 Bits.getConstant() == AI;
54176 };
54177
54178 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
54179 !AllowContract(N->getFlags()))
54180 return SDValue();
54181
54182 EVT VT = N->getValueType(0);
54183 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
54184 return SDValue();
54185
54186 SDValue LHS = N->getOperand(0);
54187 SDValue RHS = N->getOperand(1);
54188 bool IsConj;
54189 SDValue FAddOp1, MulOp0, MulOp1;
54190 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
54191 &IsVectorAllNegativeZero,
54192 &HasNoSignedZero](SDValue N) -> bool {
54193 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
54194 return false;
54195 SDValue Op0 = N.getOperand(0);
54196 unsigned Opcode = Op0.getOpcode();
54197 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
54198 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
54199 MulOp0 = Op0.getOperand(0);
54200 MulOp1 = Op0.getOperand(1);
54201 IsConj = Opcode == X86ISD::VFCMULC;
54202 return true;
54203 }
54204 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
54205 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
54206 HasNoSignedZero(Op0->getFlags())) ||
54207 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
54208 MulOp0 = Op0.getOperand(0);
54209 MulOp1 = Op0.getOperand(1);
54210 IsConj = Opcode == X86ISD::VFCMADDC;
54211 return true;
54212 }
54213 }
54214 return false;
54215 };
54216
54217 if (GetCFmulFrom(LHS))
54218 FAddOp1 = RHS;
54219 else if (GetCFmulFrom(RHS))
54220 FAddOp1 = LHS;
54221 else
54222 return SDValue();
54223
54224 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
54225 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
54226 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
54227 // FIXME: How do we handle when fast math flags of FADD are different from
54228 // CFMUL's?
54229 SDValue CFmul =
54230 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
54231 return DAG.getBitcast(VT, CFmul);
54232}
54233
54234/// Do target-specific dag combines on floating-point adds/subs.
54235 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
54236 const X86Subtarget &Subtarget) {
54237 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
54238 return HOp;
54239
54240 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
54241 return COp;
54242
54243 return SDValue();
54244}
54245
54246 static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
54247 const X86Subtarget &Subtarget) {
54248 EVT VT = N->getValueType(0);
54249 SDValue Src = N->getOperand(0);
54250 EVT SrcVT = Src.getValueType();
54251 SDLoc DL(N);
54252
54253 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54254
54255 // Let legalize expand this if it isn't a legal type yet.
54256 if (!TLI.isTypeLegal(VT))
54257 return SDValue();
54258
54259 if ((SrcVT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) ||
54260 (SrcVT.getScalarType() == MVT::f32 && !Subtarget.hasDQI()))
54261 return SDValue();
54262
54263 if (SrcVT == MVT::v2f16) {
54264 SrcVT = MVT::v4f16;
54265 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54266 DAG.getUNDEF(MVT::v2f16));
54267 }
54268
54269 if (SrcVT == MVT::v4f16) {
54270 SrcVT = MVT::v8f16;
54271 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54272 DAG.getUNDEF(MVT::v4f16));
54273 } else if (SrcVT == MVT::v2f32) {
54274 SrcVT = MVT::v4f32;
54275 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54276 DAG.getUNDEF(MVT::v2f32));
54277 } else {
54278 return SDValue();
54279 }
54280
54281 return DAG.getNode(X86ISD::CVTP2SI, DL, VT, Src);
54282}
54283
54284// Attempt to fold some (truncate (srl (add/or/xor X, C1), C2)) patterns to
54285// (add/or/xor (truncate (srl X, C2)), C1'). C1' will be smaller than C1 so we
54286// are able to avoid generating code with MOVABS and large constants in certain
54287// cases.
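// E.g. (i32 (trunc (srl (or X, 0xABCD000000000000), 48))) becomes
// (or (trunc (srl X, 48)), 0xABCD), so the large 64-bit immediate never needs
// a MOVABS (constants here are illustrative).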
54288 static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
54289 const SDLoc &DL) {
54290 assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode");
54291 std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N);
54292 if (!ValidSrlConst)
54293 return SDValue();
54294 unsigned SrlConstVal = *ValidSrlConst;
54295
54296 SDValue Op = N.getOperand(0);
54297 unsigned Opcode = Op.getOpcode();
54298 assert(VT == MVT::i32 && Op.getValueType() == MVT::i64 &&
54299 "Illegal truncation types");
54300
54301 if ((Opcode != ISD::ADD && Opcode != ISD::OR && Opcode != ISD::XOR) ||
54302 !isa<ConstantSDNode>(Op.getOperand(1)))
54303 return SDValue();
54304 const APInt &OpConst = Op.getConstantOperandAPInt(1);
54305
54306 if (SrlConstVal <= 32 ||
54307 (Opcode == ISD::ADD && OpConst.countr_zero() < SrlConstVal))
54308 return SDValue();
54309
54310 SDValue OpLhsSrl =
54311 DAG.getNode(ISD::SRL, DL, MVT::i64, Op.getOperand(0), N.getOperand(1));
54312 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, OpLhsSrl);
54313
54314 APInt NewOpConstVal = OpConst.lshr(SrlConstVal).trunc(VT.getSizeInBits());
54315 SDValue NewOpConst = DAG.getConstant(NewOpConstVal, DL, VT);
54316 SDValue NewOpNode = DAG.getNode(Opcode, DL, VT, Trunc, NewOpConst);
54317
54318 if (Opcode == ISD::ADD) {
54319 EVT CleanUpVT = EVT::getIntegerVT(*DAG.getContext(), 64 - SrlConstVal);
54320 return DAG.getZeroExtendInReg(NewOpNode, DL, CleanUpVT);
54321 }
54322 return NewOpNode;
54323}
54324
54325/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
54326/// the codegen.
54327/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
54328/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
54329/// anything that is guaranteed to be transformed by DAGCombiner.
54330 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
54331 const X86Subtarget &Subtarget,
54332 const SDLoc &DL) {
54333 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
54334 SDValue Src = N->getOperand(0);
54335 unsigned SrcOpcode = Src.getOpcode();
54336 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54337
54338 EVT VT = N->getValueType(0);
54339 EVT SrcVT = Src.getValueType();
54340
54341 auto IsFreeTruncation = [VT](SDValue Op) {
54342 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
54343
54344 // See if this has been extended from a smaller/equal size to
54345 // the truncation size, allowing a truncation to combine with the extend.
54346 unsigned Opcode = Op.getOpcode();
54347 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
54348 Opcode == ISD::ZERO_EXTEND) &&
54349 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
54350 return true;
54351
54352 // See if this is a single use constant which can be constant folded.
54353 // NOTE: We don't peek through bitcasts here because there is currently
54354 // no support for constant folding truncate+bitcast+vector_of_constants. So
54355 // we'll just end up with a truncate on both operands which will
54356 // get turned back into (truncate (binop)) causing an infinite loop.
54357 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54358 };
54359
54360 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
54361 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
54362 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
54363 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
54364 };
54365
54366 // Don't combine if the operation has other uses.
54367 if (!Src.hasOneUse())
54368 return SDValue();
54369
54370 if (VT == MVT::i32 && SrcVT == MVT::i64 && SrcOpcode == ISD::SRL)
54371 return combinei64TruncSrlConstant(Src, VT, DAG, DL);
54372
54373 if (!VT.isVector())
54374 return SDValue();
54375
54376 // In most cases it's only worth pre-truncating if we're only facing the cost
54377 // of one truncation.
54378 // i.e. if one of the inputs will constant fold or the input is repeated.
54379 switch (SrcOpcode) {
54380 case ISD::MUL:
54381 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
54382 // better to truncate if we have the chance.
54383 if (SrcVT.getScalarType() == MVT::i64 &&
54384 TLI.isOperationLegal(SrcOpcode, VT) &&
54385 !TLI.isOperationLegal(SrcOpcode, SrcVT))
54386 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
54387 [[fallthrough]];
54388 case ISD::AND:
54389 case ISD::XOR:
54390 case ISD::OR:
54391 case ISD::ADD:
54392 case ISD::SUB: {
54393 SDValue Op0 = Src.getOperand(0);
54394 SDValue Op1 = Src.getOperand(1);
54395 if (TLI.isOperationLegal(SrcOpcode, VT) &&
54396 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
54397 return TruncateArithmetic(Op0, Op1);
54398 break;
54399 }
54400 }
54401
54402 return SDValue();
54403}
54404
54405// Try to form a MULHU or MULHS node by looking for
54406// (trunc (srl (mul ext, ext), >= 16))
54407// TODO: This is X86 specific because we want to be able to handle wide types
54408// before type legalization. But we can only do it if the vector will be
54409// legalized via widening/splitting. Type legalization can't handle promotion
54410// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
54411// combiner.
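// Rough shape of the match: (trunc (srl (mul (ext a), (ext b)), 16)) with
// vXi16 inputs becomes (mulhs a, b) or (mulhu a, b), i.e. PMULHW/PMULHUW,
// with any shift amount beyond 16 applied afterwards as a vXi16 SRL.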
54412static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
54413 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
54414 using namespace llvm::SDPatternMatch;
54415
54416 if (!Subtarget.hasSSE2())
54417 return SDValue();
54418
54419 // Only handle vXi16 types that are at least 128-bits unless they will be
54420 // widened.
54421 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
54422 return SDValue();
54423
54424 // Input type should be at least vXi32.
54425 EVT InVT = Src.getValueType();
54426 if (InVT.getVectorElementType().getSizeInBits() < 32)
54427 return SDValue();
54428
54429 // First instruction should be a right shift by 16 of a multiply.
54430 SDValue LHS, RHS;
54431 APInt ShiftAmt;
54432 if (!sd_match(Src,
54433 m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_ConstInt(ShiftAmt))))
54434 return SDValue();
54435
54436 if (ShiftAmt.ult(16) || ShiftAmt.uge(InVT.getScalarSizeInBits()))
54437 return SDValue();
54438
54439 uint64_t AdditionalShift = ShiftAmt.getZExtValue() - 16;
54440
54441 // Count leading sign/zero bits on both inputs - if there are enough then
54442 // truncation back to vXi16 will be cheap - either as a pack/shuffle
54443 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
54444 // truncations may actually be free by peeking through to the ext source.
54445 auto IsSext = [&DAG](SDValue V) {
54446 return DAG.ComputeMaxSignificantBits(V) <= 16;
54447 };
54448 auto IsZext = [&DAG](SDValue V) {
54449 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
54450 };
54451
54452 bool IsSigned = IsSext(LHS) && IsSext(RHS);
54453 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
54454 if (!IsSigned && !IsUnsigned)
54455 return SDValue();
54456
54457 // Check if both inputs are extensions, which will be removed by truncation.
54458 auto isOpTruncateFree = [](SDValue Op) {
54459 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
54460 Op.getOpcode() == ISD::ZERO_EXTEND)
54461 return Op.getOperand(0).getScalarValueSizeInBits() <= 16;
54462 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54463 };
54464 bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS);
54465
54466 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
54467 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
54468 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
54469 // will have to split anyway.
54470 unsigned InSizeInBits = InVT.getSizeInBits();
54471 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
54472 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
54473 (InSizeInBits % 16) == 0) {
54474 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54475 InVT.getSizeInBits() / 16);
54476 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
54477 DAG.getBitcast(BCVT, RHS));
54478 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
54479 return DAG.getNode(ISD::SRL, DL, VT, Res,
54480 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54481 }
54482
54483 // Truncate back to source type.
54484 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
54485 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
54486
54487 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
54488 SDValue Res = DAG.getNode(Opc, DL, VT, LHS, RHS);
54489 return DAG.getNode(ISD::SRL, DL, VT, Res,
54490 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54491}
54492
54493// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
54494// from one vector with signed bytes from another vector, adds together
54495// adjacent pairs of 16-bit products, and saturates the result before
54496// truncating to 16-bits.
54497//
54498// Which looks something like this:
54499// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
54500// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
54501 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
54502 const X86Subtarget &Subtarget,
54503 const SDLoc &DL) {
54504 if (!VT.isVector() || !Subtarget.hasSSSE3())
54505 return SDValue();
54506
54507 unsigned NumElems = VT.getVectorNumElements();
54508 EVT ScalarVT = VT.getVectorElementType();
54509 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
54510 return SDValue();
54511
54512 SDValue SSatVal = detectSSatPattern(In, VT);
54513 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
54514 return SDValue();
54515
54516 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
54517 // of multiplies from even/odd elements.
54518 SDValue N0 = SSatVal.getOperand(0);
54519 SDValue N1 = SSatVal.getOperand(1);
54520
54521 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54522 return SDValue();
54523
54524 SDValue N00 = N0.getOperand(0);
54525 SDValue N01 = N0.getOperand(1);
54526 SDValue N10 = N1.getOperand(0);
54527 SDValue N11 = N1.getOperand(1);
54528
54529 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
54530 // Canonicalize zero_extend to LHS.
54531 if (N01.getOpcode() == ISD::ZERO_EXTEND)
54532 std::swap(N00, N01);
54533 if (N11.getOpcode() == ISD::ZERO_EXTEND)
54534 std::swap(N10, N11);
54535
54536 // Ensure we have a zero_extend and a sign_extend.
54537 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
54538 N01.getOpcode() != ISD::SIGN_EXTEND ||
54539 N10.getOpcode() != ISD::ZERO_EXTEND ||
54540 N11.getOpcode() != ISD::SIGN_EXTEND)
54541 return SDValue();
54542
54543 // Peek through the extends.
54544 N00 = N00.getOperand(0);
54545 N01 = N01.getOperand(0);
54546 N10 = N10.getOperand(0);
54547 N11 = N11.getOperand(0);
54548
54549 // Ensure the extend is from vXi8.
54550 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
54551 N01.getValueType().getVectorElementType() != MVT::i8 ||
54552 N10.getValueType().getVectorElementType() != MVT::i8 ||
54553 N11.getValueType().getVectorElementType() != MVT::i8)
54554 return SDValue();
54555
54556 // All inputs should be build_vectors.
54557 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54558 N01.getOpcode() != ISD::BUILD_VECTOR ||
54559 N10.getOpcode() != ISD::BUILD_VECTOR ||
54560 N11.getOpcode() != ISD::BUILD_VECTOR)
54561 return SDValue();
54562
54563 // N00/N10 are zero extended. N01/N11 are sign extended.
54564
54565 // For each element, we need to ensure we have an odd element from one vector
54566 // multiplied by the odd element of another vector and the even element from
54567 // one of the same vectors being multiplied by the even element from the
54568 // other vector. So we need to make sure for each element i, this operator
54569 // is being performed:
54570 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54571 SDValue ZExtIn, SExtIn;
54572 for (unsigned i = 0; i != NumElems; ++i) {
54573 SDValue N00Elt = N00.getOperand(i);
54574 SDValue N01Elt = N01.getOperand(i);
54575 SDValue N10Elt = N10.getOperand(i);
54576 SDValue N11Elt = N11.getOperand(i);
54577 // TODO: Be more tolerant to undefs.
54578 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54579 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54580 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54581 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54582 return SDValue();
54583 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
54584 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
54585 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
54586 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
54587 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
54588 return SDValue();
54589 unsigned IdxN00 = ConstN00Elt->getZExtValue();
54590 unsigned IdxN01 = ConstN01Elt->getZExtValue();
54591 unsigned IdxN10 = ConstN10Elt->getZExtValue();
54592 unsigned IdxN11 = ConstN11Elt->getZExtValue();
54593 // Add is commutative so indices can be reordered.
54594 if (IdxN00 > IdxN10) {
54595 std::swap(IdxN00, IdxN10);
54596 std::swap(IdxN01, IdxN11);
54597 }
54598 // N0 indices must be the even element. N1 indices must be the next odd element.
54599 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
54600 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
54601 return SDValue();
54602 SDValue N00In = N00Elt.getOperand(0);
54603 SDValue N01In = N01Elt.getOperand(0);
54604 SDValue N10In = N10Elt.getOperand(0);
54605 SDValue N11In = N11Elt.getOperand(0);
54606 // The first time we find an input, capture it.
54607 if (!ZExtIn) {
54608 ZExtIn = N00In;
54609 SExtIn = N01In;
54610 }
54611 if (ZExtIn != N00In || SExtIn != N01In ||
54612 ZExtIn != N10In || SExtIn != N11In)
54613 return SDValue();
54614 }
54615
54616 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
54617 EVT ExtVT = Ext.getValueType();
54618 if (ExtVT.getVectorNumElements() != NumElems * 2) {
54619 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
54620 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
54621 DAG.getVectorIdxConstant(0, DL));
54622 }
54623 };
54624 ExtractVec(ZExtIn);
54625 ExtractVec(SExtIn);
54626
54627 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54628 ArrayRef<SDValue> Ops) {
54629 // Shrink by adding truncate nodes and let DAGCombine fold with the
54630 // sources.
54631 EVT InVT = Ops[0].getValueType();
54632 assert(InVT.getScalarType() == MVT::i8 &&
54633 "Unexpected scalar element type");
54634 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54635 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54636 InVT.getVectorNumElements() / 2);
54637 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
54638 };
54639 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
54640 PMADDBuilder);
54641}
54642
54643 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
54644 const X86Subtarget &Subtarget) {
54645 EVT VT = N->getValueType(0);
54646 SDValue Src = N->getOperand(0);
54647 SDLoc DL(N);
54648
54649 // Attempt to pre-truncate inputs to arithmetic ops instead.
54650 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
54651 return V;
54652
54653 // Try to detect PMADD
54654 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
54655 return PMAdd;
54656
54657 // Try to combine truncation with signed/unsigned saturation.
54658 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
54659 return Val;
54660
54661 // Try to combine PMULHUW/PMULHW for vXi16.
54662 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
54663 return V;
54664
54665 // The bitcast source is a direct mmx result.
54666 // Detect truncation to i32 of a bitcast from x86mmx and use MMX_MOVD2W.
54667 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
54668 SDValue BCSrc = Src.getOperand(0);
54669 if (BCSrc.getValueType() == MVT::x86mmx)
54670 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
54671 }
54672
54673 return SDValue();
54674}
54675
54676 static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
54677 TargetLowering::DAGCombinerInfo &DCI) {
54678 EVT VT = N->getValueType(0);
54679 SDValue In = N->getOperand(0);
54680 SDLoc DL(N);
54681
54682 if (SDValue SSatVal = detectSSatPattern(In, VT))
54683 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
54684 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
54685 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
54686
54687 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54688 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
54689 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54690 return SDValue(N, 0);
54691
54692 return SDValue();
54693}
54694
54695/// Returns the negated value if the node \p N flips sign of FP value.
54696///
54697/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
54698/// or FSUB(0, x)
54699/// AVX512F does not have FXOR, so FNEG is lowered as
54700/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
54701 /// In this case we go through all bitcasts.
54702/// This also recognizes splat of a negated value and returns the splat of that
54703/// value.
54704static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
54705 if (N->getOpcode() == ISD::FNEG)
54706 return N->getOperand(0);
54707
54708 // Don't recurse exponentially.
54709 if (Depth > SelectionDAG::MaxRecursionDepth)
54710 return SDValue();
54711
54712 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
54713
54714 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
54715 EVT VT = Op->getValueType(0);
54716
54717 // Make sure the element size doesn't change.
54718 if (VT.getScalarSizeInBits() != ScalarSize)
54719 return SDValue();
54720
54721 unsigned Opc = Op.getOpcode();
54722 switch (Opc) {
54723 case ISD::VECTOR_SHUFFLE: {
54724 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
54725 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
54726 if (!Op.getOperand(1).isUndef())
54727 return SDValue();
54728 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
54729 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
54730 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
54731 cast<ShuffleVectorSDNode>(Op)->getMask());
54732 break;
54733 }
54734 case ISD::INSERT_VECTOR_ELT: {
54735 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
54736 // -V, INDEX).
54737 SDValue InsVector = Op.getOperand(0);
54738 SDValue InsVal = Op.getOperand(1);
54739 if (!InsVector.isUndef())
54740 return SDValue();
54741 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
54742 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
54743 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
54744 NegInsVal, Op.getOperand(2));
54745 break;
54746 }
54747 case ISD::FSUB:
54748 case ISD::XOR:
54749 case X86ISD::FXOR: {
54750 SDValue Op1 = Op.getOperand(1);
54751 SDValue Op0 = Op.getOperand(0);
54752
54753 // For XOR and FXOR, we want to check if constant
54754 // bits of Op1 are sign bit masks. For FSUB, we
54755 // have to check if constant bits of Op0 are sign
54756 // bit masks and hence we swap the operands.
54757 if (Opc == ISD::FSUB)
54758 std::swap(Op0, Op1);
54759
54760 APInt UndefElts;
54761 SmallVector<APInt, 16> EltBits;
54762 // Extract constant bits and see if they are all
54763 // sign bit masks. Ignore the undef elements.
54764 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
54765 /* AllowWholeUndefs */ true,
54766 /* AllowPartialUndefs */ false)) {
54767 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
54768 if (!UndefElts[I] && !EltBits[I].isSignMask())
54769 return SDValue();
54770
54771 // Only allow bitcast from correctly-sized constant.
54772 Op0 = peekThroughBitcasts(Op0);
54773 if (Op0.getScalarValueSizeInBits() == ScalarSize)
54774 return Op0;
54775 }
54776 break;
54777 } // case
54778 } // switch
54779
54780 return SDValue();
54781}
54782
54783static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
54784 bool NegRes) {
54785 if (NegMul) {
54786 switch (Opcode) {
54787 // clang-format off
54788 default: llvm_unreachable("Unexpected opcode");
54789 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
54790 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
54791 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
54792 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
54793 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
54794 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
54795 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
54796 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
54797 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
54798 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
54799 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
54800 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
54801 // clang-format on
54802 }
54803 }
54804
54805 if (NegAcc) {
54806 switch (Opcode) {
54807 // clang-format off
54808 default: llvm_unreachable("Unexpected opcode");
54809 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
54810 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
54811 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54812 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
54813 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
54814 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54815 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
54816 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
54817 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54818 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
54819 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
54820 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54821 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
54822 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
54823 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
54824 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
54825 // clang-format on
54826 }
54827 }
54828
54829 if (NegRes) {
54830 switch (Opcode) {
54831 // For accuracy reasons, we never combine fneg and fma under strict FP.
54832 // clang-format off
54833 default: llvm_unreachable("Unexpected opcode");
54834 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
54835 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54836 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
54837 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54838 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
54839 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54840 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
54841 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54842 // clang-format on
54843 }
54844 }
54845
54846 return Opcode;
54847}
54848
54849/// Do target-specific dag combines on floating point negations.
54850 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
54851 TargetLowering::DAGCombinerInfo &DCI,
54852 const X86Subtarget &Subtarget) {
54853 EVT OrigVT = N->getValueType(0);
54854 SDValue Arg = isFNEG(DAG, N);
54855 if (!Arg)
54856 return SDValue();
54857
54858 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54859 EVT VT = Arg.getValueType();
54860 EVT SVT = VT.getScalarType();
54861 SDLoc DL(N);
54862
54863 // Let legalize expand this if it isn't a legal type yet.
54864 if (!TLI.isTypeLegal(VT))
54865 return SDValue();
54866
54867 // If we're negating a FMUL node on a target with FMA, then we can avoid the
54868 // use of a constant by performing (-0 - A*B) instead.
54869 // FIXME: Check rounding control flags as well once it becomes available.
54870 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
54871 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
54872 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
54873 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
54874 Arg.getOperand(1), Zero);
54875 return DAG.getBitcast(OrigVT, NewNode);
54876 }
54877
54878 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54879 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54880 if (SDValue NegArg =
54881 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
54882 return DAG.getBitcast(OrigVT, NegArg);
54883
54884 return SDValue();
54885}
54886
54887 SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
54888 bool LegalOperations,
54889 bool ForCodeSize,
54890 NegatibleCost &Cost,
54891 unsigned Depth) const {
54892 // fneg patterns are removable even if they have multiple uses.
54893 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
54894 Cost = NegatibleCost::Cheaper;
54895 return DAG.getBitcast(Op.getValueType(), Arg);
54896 }
54897
54898 EVT VT = Op.getValueType();
54899 EVT SVT = VT.getScalarType();
54900 unsigned Opc = Op.getOpcode();
54901 SDNodeFlags Flags = Op.getNode()->getFlags();
54902 switch (Opc) {
54903 case ISD::FMA:
54904 case X86ISD::FMSUB:
54905 case X86ISD::FNMADD:
54906 case X86ISD::FNMSUB:
54907 case X86ISD::FMADD_RND:
54908 case X86ISD::FMSUB_RND:
54909 case X86ISD::FNMADD_RND:
54910 case X86ISD::FNMSUB_RND: {
54911 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
54912 !(SVT == MVT::f32 || SVT == MVT::f64) ||
54913 !isOperationLegal(ISD::FMA, VT))
54914 break;
54915
54916 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
54917 // if it may have signed zeros.
54918 if (!Flags.hasNoSignedZeros())
54919 break;
54920
54921 // Because getCheaperNegatedExpression can delete nodes we need a handle to
54922 // keep temporary nodes alive.
54923 std::list<HandleSDNode> Handles;
54924
54925 // This is always negatible for free but we might be able to remove some
54926 // extra operand negations as well.
54927 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
54928 for (int i = 0; i != 3; ++i) {
54929 NewOps[i] = getCheaperNegatedExpression(
54930 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
54931 if (!!NewOps[i])
54932 Handles.emplace_back(NewOps[i]);
54933 }
54934
54935 bool NegA = !!NewOps[0];
54936 bool NegB = !!NewOps[1];
54937 bool NegC = !!NewOps[2];
54938 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
54939
54940 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
54941 : NegatibleCost::Neutral;
54942
54943 // Fill in the non-negated ops with the original values.
54944 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
54945 if (!NewOps[i])
54946 NewOps[i] = Op.getOperand(i);
54947 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
54948 }
54949 case X86ISD::FRCP:
54950 if (SDValue NegOp0 =
54951 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
54952 ForCodeSize, Cost, Depth + 1))
54953 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
54954 break;
54955 }
54956
54957 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
54958 ForCodeSize, Cost, Depth);
54959}
54960
54961 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
54962 const X86Subtarget &Subtarget) {
54963 MVT VT = N->getSimpleValueType(0);
54964 // If we have integer vector types available, use the integer opcodes.
54965 if (!VT.isVector() || !Subtarget.hasSSE2())
54966 return SDValue();
54967
54968 SDLoc dl(N);
54969 MVT IntVT = VT.changeVectorElementTypeToInteger();
54970 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
54971 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
54972 unsigned IntOpcode;
54973 switch (N->getOpcode()) {
54974 // clang-format off
54975 default: llvm_unreachable("Unexpected FP logic op");
54976 case X86ISD::FOR: IntOpcode = ISD::OR; break;
54977 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
54978 case X86ISD::FAND: IntOpcode = ISD::AND; break;
54979 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
54980 // clang-format on
54981 }
54982 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
54983 return DAG.getBitcast(VT, IntOp);
54984}
54985
54986/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
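// E.g. (xor (X86ISD::SETCC COND_E, EFLAGS), 1) becomes
// (X86ISD::SETCC COND_NE, EFLAGS); the xor-with-1 of the i8 setcc result is
// absorbed by inverting the condition code.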
54987 static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) {
54988 if (N->getOpcode() != ISD::XOR)
54989 return SDValue();
54990
54991 SDValue LHS = N->getOperand(0);
54992 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
54993 return SDValue();
54994
54995 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
54996 X86::CondCode(LHS->getConstantOperandVal(0)));
54997 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
54998}
54999
55000 static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
55001 const X86Subtarget &Subtarget) {
55002 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
55003 "Invalid opcode for combing with CTLZ");
55004 if (Subtarget.hasFastLZCNT())
55005 return SDValue();
55006
55007 EVT VT = N->getValueType(0);
55008 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
55009 (VT != MVT::i64 || !Subtarget.is64Bit()))
55010 return SDValue();
55011
55012 SDValue N0 = N->getOperand(0);
55013 SDValue N1 = N->getOperand(1);
55014
55015 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
55016 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
55017 return SDValue();
55018
55019 SDValue OpCTLZ;
55020 SDValue OpSizeTM1;
55021
55022 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
55023 OpCTLZ = N1;
55024 OpSizeTM1 = N0;
55025 } else if (N->getOpcode() == ISD::SUB) {
55026 return SDValue();
55027 } else {
55028 OpCTLZ = N0;
55029 OpSizeTM1 = N1;
55030 }
55031
55032 if (!OpCTLZ.hasOneUse())
55033 return SDValue();
55034 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
55035 if (!C)
55036 return SDValue();
55037
55038 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
55039 return SDValue();
55040 EVT OpVT = VT;
55041 SDValue Op = OpCTLZ.getOperand(0);
55042 if (VT == MVT::i8) {
55043 // Zero extend to i32 since there is not an i8 bsr.
55044 OpVT = MVT::i32;
55045 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
55046 }
55047
55048 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
55049 Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
55050 if (VT == MVT::i8)
55051 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
55052
55053 return Op;
55054}
55055
55056 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
55057 TargetLowering::DAGCombinerInfo &DCI,
55058 const X86Subtarget &Subtarget) {
55059 SDValue N0 = N->getOperand(0);
55060 SDValue N1 = N->getOperand(1);
55061 EVT VT = N->getValueType(0);
55062 SDLoc DL(N);
55063
55064 // If this is SSE1 only convert to FXOR to avoid scalarization.
55065 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
55066 return DAG.getBitcast(MVT::v4i32,
55067 DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
55068 DAG.getBitcast(MVT::v4f32, N0),
55069 DAG.getBitcast(MVT::v4f32, N1)));
55070 }
55071
55072 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
55073 return Cmp;
55074
55075 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), DL, N0, N1, DAG))
55076 return R;
55077
55078 if (SDValue R = combineBitOpWithShift(N->getOpcode(), DL, VT, N0, N1, DAG))
55079 return R;
55080
55081 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), DL, VT, N0, N1, DAG))
55082 return R;
55083
55084 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), DL, VT, N0, N1,
55085 DAG, DCI, Subtarget))
55086 return FPLogic;
55087
55088 if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
55089 return R;
55090
55091 if (DCI.isBeforeLegalizeOps())
55092 return SDValue();
55093
55094 if (SDValue SetCC = foldXor1SetCC(N, DL, DAG))
55095 return SetCC;
55096
55097 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), DL, VT, N0, N1, DAG))
55098 return R;
55099
55100 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DL, DAG))
55101 return RV;
55102
55103 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
55104 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55105 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
55106 N0.getOperand(0).getValueType().isVector() &&
55107 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55108 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
55109 return DAG.getBitcast(
55110 VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
55111 }
55112
55113 // Handle AVX512 mask widening.
55114 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
55115 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
55116 VT.getVectorElementType() == MVT::i1 &&
55117 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
55118 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
55119 return DAG.getNode(
55120 ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
55121 DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
55122 N0.getOperand(2));
55123 }
55124
55125 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
55126 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
55127 // TODO: Under what circumstances could this be performed in DAGCombine?
55128 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
55129 N0.getOperand(0).getOpcode() == N->getOpcode()) {
55130 SDValue TruncExtSrc = N0.getOperand(0);
55131 auto *N1C = dyn_cast<ConstantSDNode>(N1);
55132 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
55133 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
55134 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
55135 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
55136 return DAG.getNode(ISD::XOR, DL, VT, LHS,
55137 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
55138 }
55139 }
55140
55141 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
55142 return R;
55143
55144 return combineFneg(N, DAG, DCI, Subtarget);
55145}
55146
55147 static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
55148 TargetLowering::DAGCombinerInfo &DCI,
55149 const X86Subtarget &Subtarget) {
55150 SDValue N0 = N->getOperand(0);
55151 EVT VT = N->getValueType(0);
55152
55153 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
55154 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
55155 SDValue Src = N0.getOperand(0);
55156 EVT SrcVT = Src.getValueType();
55157 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
55158 (DCI.isBeforeLegalize() ||
55159 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
55160 Subtarget.hasSSSE3()) {
55161 unsigned NumElts = SrcVT.getVectorNumElements();
55162 SmallVector<int, 32> ReverseMask(NumElts);
55163 for (unsigned I = 0; I != NumElts; ++I)
55164 ReverseMask[I] = (NumElts - 1) - I;
55165 SDValue Rev =
55166 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
55167 return DAG.getBitcast(VT, Rev);
55168 }
55169 }
55170
55171 return SDValue();
55172}
55173
55174// Various combines to try to convert to avgceilu.
55175 static SDValue combineAVG(SDNode *N, SelectionDAG &DAG,
55176 TargetLowering::DAGCombinerInfo &DCI,
55177 const X86Subtarget &Subtarget) {
55178 unsigned Opcode = N->getOpcode();
55179 SDValue N0 = N->getOperand(0);
55180 SDValue N1 = N->getOperand(1);
55181 EVT VT = N->getValueType(0);
55182 EVT SVT = VT.getScalarType();
55183 SDLoc DL(N);
55184
55185 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
55186 // Only useful on vXi8 which doesn't have good SRA handling.
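// A worked example of the identity (arbitrary i8 values): avgceils(-1, 1) = 0;
// flipping the sign bits gives 0x7F and 0x81, avgceilu(0x7F, 0x81) = 0x80, and
// 0x80 ^ 0x80 = 0. Adding the 0x80 bias maps signed order onto unsigned order,
// which is why the unsigned average can stand in for the signed one.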
55187 if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
55188 APInt SignBit = APInt::getSignMask(VT.getScalarSizeInBits());
55189 SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
55190 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
55191 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
55192 return DAG.getNode(ISD::XOR, DL, VT,
55193 DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
55194 }
55195
55196 return SDValue();
55197}
55198
55199 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
55200 TargetLowering::DAGCombinerInfo &DCI,
55201 const X86Subtarget &Subtarget) {
55202 EVT VT = N->getValueType(0);
55203 unsigned NumBits = VT.getSizeInBits();
55204
55205 // TODO - Constant Folding.
55206
55207 // Simplify the inputs.
55208 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55209 APInt DemandedMask(APInt::getAllOnes(NumBits));
55210 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55211 return SDValue(N, 0);
55212
55213 return SDValue();
55214}
55215
55216 static bool isNullFPScalarOrVectorConst(SDValue V) {
55217 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
55218}
55219
55220/// If a value is a scalar FP zero or a vector FP zero (potentially including
55221/// undefined elements), return a zero constant that may be used to fold away
55222/// that value. In the case of a vector, the returned constant will not contain
55223/// undefined elements even if the input parameter does. This makes it suitable
55224/// to be used as a replacement operand with operations (eg, bitwise-and) where
55225/// an undef should not propagate.
55226 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
55227 const X86Subtarget &Subtarget) {
55228 if (!isNullFPScalarOrVectorConst(V))
55229 return SDValue();
55230
55231 if (V.getValueType().isVector())
55232 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
55233
55234 return V;
55235}
55236
55237 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
55238 const X86Subtarget &Subtarget) {
55239 SDValue N0 = N->getOperand(0);
55240 SDValue N1 = N->getOperand(1);
55241 EVT VT = N->getValueType(0);
55242 SDLoc DL(N);
55243
55244 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
55245 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
55246 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
55247 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
55248 return SDValue();
55249
55250 auto isAllOnesConstantFP = [](SDValue V) {
55251 if (V.getSimpleValueType().isVector())
55252 return ISD::isBuildVectorAllOnes(V.getNode());
55253 auto *C = dyn_cast<ConstantFPSDNode>(V);
55254 return C && C->getConstantFPValue()->isAllOnesValue();
55255 };
55256
55257 // fand (fxor X, -1), Y --> fandn X, Y
55258 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
55259 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
55260
55261 // fand X, (fxor Y, -1) --> fandn Y, X
55262 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
55263 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
55264
55265 return SDValue();
55266}
55267
55268/// Do target-specific dag combines on X86ISD::FAND nodes.
55269 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
55270 const X86Subtarget &Subtarget) {
55271 // FAND(0.0, x) -> 0.0
55272 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
55273 return V;
55274
55275 // FAND(x, 0.0) -> 0.0
55276 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55277 return V;
55278
55279 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
55280 return V;
55281
55282 return lowerX86FPLogicOp(N, DAG, Subtarget);
55283}
55284
55285/// Do target-specific dag combines on X86ISD::FANDN nodes.
55286 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
55287 const X86Subtarget &Subtarget) {
55288 // FANDN(0.0, x) -> x
55289 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55290 return N->getOperand(1);
55291
55292 // FANDN(x, 0.0) -> 0.0
55293 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55294 return V;
55295
55296 return lowerX86FPLogicOp(N, DAG, Subtarget);
55297}
55298
55299/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
55300 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
55301 TargetLowering::DAGCombinerInfo &DCI,
55302 const X86Subtarget &Subtarget) {
55303 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
55304
55305 // F[X]OR(0.0, x) -> x
55306 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55307 return N->getOperand(1);
55308
55309 // F[X]OR(x, 0.0) -> x
55310 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
55311 return N->getOperand(0);
55312
55313 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
55314 return NewVal;
55315
55316 return lowerX86FPLogicOp(N, DAG, Subtarget);
55317}
55318
55319/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
55320 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
55321 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
55322
55323 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
55324 if (!DAG.getTarget().Options.NoNaNsFPMath ||
55325 !DAG.getTarget().Options.NoSignedZerosFPMath)
55326 return SDValue();
55327
55328 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
55329 // into FMINC and FMAXC, which are commutative operations.
55330 unsigned NewOp = 0;
55331 switch (N->getOpcode()) {
55332 default: llvm_unreachable("unknown opcode");
55333 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
55334 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
55335 }
55336
55337 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
55338 N->getOperand(0), N->getOperand(1));
55339}
55340
55341 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
55342 const X86Subtarget &Subtarget) {
55343 EVT VT = N->getValueType(0);
55344 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
55345 return SDValue();
55346
55347 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55348
55349 auto IsMinMaxLegal = [&](EVT VT) {
55350 if (!TLI.isTypeLegal(VT))
55351 return false;
55352 return VT.getScalarType() != MVT::f16 ||
55353 (Subtarget.hasFP16() && (VT == MVT::v32f16 || Subtarget.hasVLX()));
55354 };
55355
55356 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
55357 (Subtarget.hasSSE2() && VT == MVT::f64) ||
55358 (Subtarget.hasFP16() && VT == MVT::f16) ||
55359 (VT.isVector() && IsMinMaxLegal(VT))))
55360 return SDValue();
55361
55362 SDValue Op0 = N->getOperand(0);
55363 SDValue Op1 = N->getOperand(1);
55364 SDLoc DL(N);
55365 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
55366
55367 // If we don't have to respect NaN inputs, this is a direct translation to x86
55368 // min/max instructions.
55369 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
55370 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55371
55372 // If one of the operands is known non-NaN use the native min/max instructions
55373 // with the non-NaN input as second operand.
55374 if (DAG.isKnownNeverNaN(Op1))
55375 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55376 if (DAG.isKnownNeverNaN(Op0))
55377 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
55378
55379 // If we have to respect NaN inputs, this takes at least 3 instructions.
55380 // Favor a library call when operating on a scalar and minimizing code size.
55381 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
55382 return SDValue();
55383
55384 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
55385 VT);
55386
55387 // There are 4 possibilities involving NaN inputs, and these are the required
55388 // outputs:
55389 // Op1
55390 // Num NaN
55391 // ----------------
55392 // Num | Max | Op0 |
55393 // Op0 ----------------
55394 // NaN | Op1 | NaN |
55395 // ----------------
55396 //
55397 // The SSE FP max/min instructions were not designed for this case, but rather
55398 // to implement:
55399 // Min = Op1 < Op0 ? Op1 : Op0
55400 // Max = Op1 > Op0 ? Op1 : Op0
55401 //
55402 // So they always return Op0 if either input is a NaN. However, we can still
55403 // use those instructions for fmaxnum by selecting away a NaN input.
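// A worked example of the selection below (arbitrary values): for
// fmaxnum(Op0, Op1) with Op0 = NaN and Op1 = 7.0, MAX(Op1, Op0) passes Op0
// (the NaN) through, the SETUO test on Op0 is true, and the select yields
// Op1 = 7.0. With Op0 = 7.0 and Op1 = NaN, MAX(Op1, Op0) returns 7.0 directly.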
55404
55405 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
55406 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
55407 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
55408
55409 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
55410 // are NaN, the NaN value of Op1 is the result.
55411 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
55412}
55413
55414 static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
55415 TargetLowering::DAGCombinerInfo &DCI) {
55416 EVT VT = N->getValueType(0);
55417 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55418
55419 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
55420 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
55421 return SDValue(N, 0);
55422
55423 // Convert a full vector load into vzload when not all bits are needed.
55424 SDValue In = N->getOperand(0);
55425 MVT InVT = In.getSimpleValueType();
55426 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55427 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55428 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55429 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
55430 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55431 MVT MemVT = MVT::getIntegerVT(NumBits);
55432 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55433 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55434 SDLoc dl(N);
55435 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
55436 DAG.getBitcast(InVT, VZLoad));
55437 DCI.CombineTo(N, Convert);
55438 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55439 DCI.recursivelyDeleteUnusedNodes(LN);
55440 return SDValue(N, 0);
55441 }
55442 }
55443
55444 return SDValue();
55445}
55446
55450 bool IsStrict = TSI.isTargetStrictFPOpcode(N->getOpcode());
55451 EVT VT = N->getValueType(0);
55452
55453 // Convert a full vector load into vzload when not all bits are needed.
55454 SDValue In = N->getOperand(IsStrict ? 1 : 0);
55455 MVT InVT = In.getSimpleValueType();
55456 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55457 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55458 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55459 LoadSDNode *LN = cast<LoadSDNode>(In);
55460 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55461 MVT MemVT = MVT::getFloatingPointVT(NumBits);
55462 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55463 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55464 SDLoc dl(N);
55465 if (IsStrict) {
55466 SDValue Convert =
55467 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
55468 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
55469 DCI.CombineTo(N, Convert, Convert.getValue(1));
55470 } else {
55471 SDValue Convert =
55472 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
55473 DCI.CombineTo(N, Convert);
55474 }
55475 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55476 DCI.recursivelyDeleteUnusedNodes(LN);
55477 return SDValue(N, 0);
55478 }
55479 }
55480
55481 return SDValue();
55482}
55483
55484/// Do target-specific dag combines on X86ISD::ANDNP nodes.
55485 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
55486 TargetLowering::DAGCombinerInfo &DCI,
55487 const X86Subtarget &Subtarget) {
55488 SDValue N0 = N->getOperand(0);
55489 SDValue N1 = N->getOperand(1);
55490 MVT VT = N->getSimpleValueType(0);
55491 int NumElts = VT.getVectorNumElements();
55492 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55493 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55494 SDLoc DL(N);
55495
55496 // ANDNP(undef, x) -> 0
55497 // ANDNP(x, undef) -> 0
55498 if (N0.isUndef() || N1.isUndef())
55499 return DAG.getConstant(0, DL, VT);
55500
55501 // ANDNP(0, x) -> x
55502 if (ISD::isBuildVectorAllZeros(N0.getNode()))
55503 return N1;
55504
55505 // ANDNP(x, 0) -> 0
55506 if (ISD::isBuildVectorAllZeros(N1.getNode()))
55507 return DAG.getConstant(0, DL, VT);
55508
55509 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
55510 if (ISD::isBuildVectorAllOnes(N1.getNode()))
55511 return DAG.getNOT(DL, N0, VT);
55512
55513 // Turn ANDNP back to AND if input is inverted.
55514 if (SDValue Not = IsNOT(N0, DAG))
55515 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
55516
55517 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask.
55518 // to make use of predicated selects.
55519 // ANDN(SEXT(SETCC()),X) -> SELECT(NOT(SETCC()),X,0)
55520 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::SIGN_EXTEND) {
55521 SDValue Src = N0.getOperand(0);
55522 EVT SrcVT = Src.getValueType();
55523 if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
55524 (VT.is512BitVector() || Subtarget.hasVLX()) &&
55525 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
55526 TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
55527 return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
55528 getZeroVector(VT, Subtarget, DAG, DL));
55529 }
55530
55531 // Constant Folding
55532 APInt Undefs0, Undefs1;
55533 SmallVector<APInt> EltBits0, EltBits1;
55534 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
55535 /*AllowWholeUndefs*/ true,
55536 /*AllowPartialUndefs*/ true)) {
55537 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
55538 /*AllowWholeUndefs*/ true,
55539 /*AllowPartialUndefs*/ true)) {
55540 SmallVector<APInt> ResultBits;
55541 for (int I = 0; I != NumElts; ++I)
55542 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
55543 return getConstVector(ResultBits, VT, DAG, DL);
55544 }
55545
55546 // Constant fold NOT(N0) to allow us to use AND.
55547 // Ensure this is only performed if we can confirm that the bitcasted source
55548 // has one use to prevent an infinite loop with canonicalizeBitSelect.
55549 if (N0->hasOneUse()) {
55550 SDValue BC0 = peekThroughOneUseBitcasts(N0);
55551 if (BC0.getOpcode() != ISD::BITCAST) {
55552 for (APInt &Elt : EltBits0)
55553 Elt = ~Elt;
55554 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
55555 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
55556 }
55557 }
55558 }
55559
55560 // Attempt to recursively combine a bitmask ANDNP with shuffles.
55561 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
55562 SDValue Op(N, 0);
55563 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
55564 return Res;
55565
55566 // If either operand is a constant mask, then only the elements that aren't
55567 // zero are actually demanded by the other operand.
55568 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
55569 APInt UndefElts;
55570 SmallVector<APInt> EltBits;
55571 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
55572 APInt DemandedElts = APInt::getAllOnes(NumElts);
55573 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
55574 EltBits)) {
55575 DemandedBits.clearAllBits();
55576 DemandedElts.clearAllBits();
55577 for (int I = 0; I != NumElts; ++I) {
55578 if (UndefElts[I]) {
55579 // We can't assume an undef src element gives an undef dst - the
55580 // other src might be zero.
55581 DemandedBits.setAllBits();
55582 DemandedElts.setBit(I);
55583 } else if ((Invert && !EltBits[I].isAllOnes()) ||
55584 (!Invert && !EltBits[I].isZero())) {
55585 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
55586 DemandedElts.setBit(I);
55587 }
55588 }
55589 }
55590 return std::make_pair(DemandedBits, DemandedElts);
55591 };
55592 APInt Bits0, Elts0;
55593 APInt Bits1, Elts1;
55594 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
55595 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
55596
55597 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
55598 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
55599 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
55600 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
55601 if (N->getOpcode() != ISD::DELETED_NODE)
55602 DCI.AddToWorklist(N);
55603 return SDValue(N, 0);
55604 }
55605 }
55606
55607 // Folds for better commutativity:
55608 if (N1->hasOneUse()) {
55609 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
55610 if (SDValue Not = IsNOT(N1, DAG))
55611 return DAG.getNOT(
55612 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
55613
55614 // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
55615 // Zero out elements by setting the PSHUFB mask value to 0xFF.
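// Reasoning sketch: the ComputeNumSignBits guard below means every byte of N0
// is 0x00 or 0xFF, so OR-ing N0 into the shuffle mask leaves kept lanes alone
// and drives cleared lanes to 0xFF; PSHUFB zeroes lanes whose mask byte has
// the top bit set, matching what ANDNP(N0, shuffle) would produce.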
55616 if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
55617 SDValue BC1 = peekThroughOneUseBitcasts(N1);
55618 if (BC1.getOpcode() == X86ISD::PSHUFB) {
55619 EVT ShufVT = BC1.getValueType();
55620 SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
55621 DAG.getBitcast(ShufVT, N0));
55622 SDValue NewShuf =
55623 DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
55624 return DAG.getBitcast(VT, NewShuf);
55625 }
55626 }
55627 }
55628
55629 return SDValue();
55630}
55631
55632 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
55633 TargetLowering::DAGCombinerInfo &DCI) {
55634 SDValue N1 = N->getOperand(1);
55635
55636 // BT ignores high bits in the bit index operand.
55637 unsigned BitWidth = N1.getValueSizeInBits();
55638 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
55639 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
55640 if (N->getOpcode() != ISD::DELETED_NODE)
55641 DCI.AddToWorklist(N);
55642 return SDValue(N, 0);
55643 }
55644
55645 return SDValue();
55646}
55647
55648 static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
55649 TargetLowering::DAGCombinerInfo &DCI) {
55650 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
55651 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
55652
55653 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
55654 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55655 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
55656 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
55657 if (N->getOpcode() != ISD::DELETED_NODE)
55658 DCI.AddToWorklist(N);
55659 return SDValue(N, 0);
55660 }
55661
55662 // Convert a full vector load into vzload when not all bits are needed.
55663 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
55664 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
55665 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
55666 SDLoc dl(N);
55667 if (IsStrict) {
55668 SDValue Convert = DAG.getNode(
55669 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
55670 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
55671 DCI.CombineTo(N, Convert, Convert.getValue(1));
55672 } else {
55673 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
55674 DAG.getBitcast(MVT::v8i16, VZLoad));
55675 DCI.CombineTo(N, Convert);
55676 }
55677
55678 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55679 DCI.recursivelyDeleteUnusedNodes(LN);
55680 return SDValue(N, 0);
55681 }
55682 }
55683 }
55684
55685 return SDValue();
55686}
55687
55688// Try to combine sext_in_reg of a cmov of constants by extending the constants.
55689 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
55690 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55691
55692 EVT DstVT = N->getValueType(0);
55693
55694 SDValue N0 = N->getOperand(0);
55695 SDValue N1 = N->getOperand(1);
55696 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55697
55698 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
55699 return SDValue();
55700
55701 // Look through single use any_extends / truncs.
55702 SDValue IntermediateBitwidthOp;
55703 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
55704 N0.hasOneUse()) {
55705 IntermediateBitwidthOp = N0;
55706 N0 = N0.getOperand(0);
55707 }
55708
55709 // See if we have a single use cmov.
55710 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
55711 return SDValue();
55712
55713 SDValue CMovOp0 = N0.getOperand(0);
55714 SDValue CMovOp1 = N0.getOperand(1);
55715
55716 // Make sure both operands are constants.
55717 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55718 !isa<ConstantSDNode>(CMovOp1.getNode()))
55719 return SDValue();
55720
55721 SDLoc DL(N);
55722
55723 // If we looked through an any_extend/trunc above, add one to the constants.
55724 if (IntermediateBitwidthOp) {
55725 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
55726 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
55727 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
55728 }
55729
55730 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
55731 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
55732
55733 EVT CMovVT = DstVT;
55734 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
55735 if (DstVT == MVT::i16) {
55736 CMovVT = MVT::i32;
55737 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
55738 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
55739 }
55740
55741 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
55742 N0.getOperand(2), N0.getOperand(3));
55743
55744 if (CMovVT != DstVT)
55745 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
55746
55747 return CMov;
55748}
55749
55750 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
55751 const X86Subtarget &Subtarget) {
55752 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55753
55754 if (SDValue V = combineSextInRegCmov(N, DAG))
55755 return V;
55756
55757 EVT VT = N->getValueType(0);
55758 SDValue N0 = N->getOperand(0);
55759 SDValue N1 = N->getOperand(1);
55760 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55761 SDLoc dl(N);
55762
55763 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
55764 // both SSE and AVX2 since there is no sign-extended shift right
55765 // operation on a vector with 64-bit elements.
55766 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
55767 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
55768 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
55769 N0.getOpcode() == ISD::SIGN_EXTEND)) {
55770 SDValue N00 = N0.getOperand(0);
55771
55772 // EXTLOAD has a better solution on AVX2,
55773 // it may be replaced with X86ISD::VSEXT node.
55774 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
55775 if (!ISD::isNormalLoad(N00.getNode()))
55776 return SDValue();
55777
55778 // Attempt to promote any comparison mask ops before moving the
55779 // SIGN_EXTEND_INREG in the way.
55780 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
55781 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
55782
55783 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
55784 SDValue Tmp =
55785 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
55786 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
55787 }
55788 }
55789 return SDValue();
55790}
55791
55792/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
55793/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
55794/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
55795/// opportunities to combine math ops, use an LEA, or use a complex addressing
55796/// mode. This can eliminate extend, add, and shift instructions.
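/// A hypothetical example: sext i32 (add nsw x, 40) to i64 becomes
/// add nsw (sext x to i64), 40, so a later 64-bit add or shift user can fold
/// the whole expression into an LEA or a complex addressing mode.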
55797 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
55798 const X86Subtarget &Subtarget) {
55799 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
55800 Ext->getOpcode() != ISD::ZERO_EXTEND)
55801 return SDValue();
55802
55803 // TODO: This should be valid for other integer types.
55804 EVT VT = Ext->getValueType(0);
55805 if (VT != MVT::i64)
55806 return SDValue();
55807
55808 SDValue Add = Ext->getOperand(0);
55809 if (Add.getOpcode() != ISD::ADD)
55810 return SDValue();
55811
55812 SDValue AddOp0 = Add.getOperand(0);
55813 SDValue AddOp1 = Add.getOperand(1);
55814 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
55815 bool NSW = Add->getFlags().hasNoSignedWrap();
55816 bool NUW = Add->getFlags().hasNoUnsignedWrap();
55817 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
55818 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
55819
55820 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
55821 // into the 'zext'
55822 if ((Sext && !NSW) || (!Sext && !NUW))
55823 return SDValue();
55824
55825 // Having a constant operand to the 'add' ensures that we are not increasing
55826 // the instruction count because the constant is extended for free below.
55827 // A constant operand can also become the displacement field of an LEA.
55828 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
55829 if (!AddOp1C)
55830 return SDValue();
55831
55832 // Don't make the 'add' bigger if there's no hope of combining it with some
55833 // other 'add' or 'shl' instruction.
55834 // TODO: It may be profitable to generate simpler LEA instructions in place
55835 // of single 'add' instructions, but the cost model for selecting an LEA
55836 // currently has a high threshold.
55837 bool HasLEAPotential = false;
55838 for (auto *User : Ext->users()) {
55839 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
55840 HasLEAPotential = true;
55841 break;
55842 }
55843 }
55844 if (!HasLEAPotential)
55845 return SDValue();
55846
55847 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
55848 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
55849 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
55850 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
55851
55852 // The wider add is guaranteed to not wrap because both operands are
55853 // sign-extended.
55854 SDNodeFlags Flags;
55855 Flags.setNoSignedWrap(NSW);
55856 Flags.setNoUnsignedWrap(NUW);
55857 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
55858}
55859
55860// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
55861// operands and the result of CMOV is not used anywhere else - promote CMOV
55862// itself instead of promoting its result. This could be beneficial, because:
55863// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
55864// (or more) pseudo-CMOVs only when they go one-after-another and
55865// getting rid of result extension code after CMOV will help that.
55866// 2) Promotion of constant CMOV arguments is free, hence the
55867// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
55868 // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
55869 // promotion is also good in terms of code-size.
55870 // (64-bit CMOV is 4 bytes, that's why we don't do 32-bit => 64-bit
55871// promotion).
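// For illustration: (i32 zext (i16 cmov C1, C2, cc, eflags)) becomes
// (i32 cmov C1', C2', cc, eflags) with the constants widened for free, so the
// separate extension of the cmov result disappears.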
55872 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
55873 SDValue CMovN = Extend->getOperand(0);
55874 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
55875 return SDValue();
55876
55877 EVT TargetVT = Extend->getValueType(0);
55878 unsigned ExtendOpcode = Extend->getOpcode();
55879 SDLoc DL(Extend);
55880
55881 EVT VT = CMovN.getValueType();
55882 SDValue CMovOp0 = CMovN.getOperand(0);
55883 SDValue CMovOp1 = CMovN.getOperand(1);
55884
55885 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55886 !isa<ConstantSDNode>(CMovOp1.getNode()))
55887 return SDValue();
55888
55889 // Only extend to i32 or i64.
55890 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
55891 return SDValue();
55892
55893 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
55894 // are free.
55895 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
55896 return SDValue();
55897
55898 // If this is a zero extend to i64, we should only extend to i32 and use a free
55899 // zero extend to finish.
55900 EVT ExtendVT = TargetVT;
55901 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
55902 ExtendVT = MVT::i32;
55903
55904 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
55905 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
55906
55907 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
55908 CMovN.getOperand(2), CMovN.getOperand(3));
55909
55910 // Finish extending if needed.
55911 if (ExtendVT != TargetVT)
55912 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
55913
55914 return Res;
55915}
55916
55917// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
55918// result type.
55919 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
55920 const X86Subtarget &Subtarget) {
55921 SDValue N0 = N->getOperand(0);
55922 EVT VT = N->getValueType(0);
55923 SDLoc dl(N);
55924
55925 // Only do this combine with AVX512 for vector extends.
55926 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
55927 return SDValue();
55928
55929 // Only combine legal element types.
55930 EVT SVT = VT.getVectorElementType();
55931 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
55932 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
55933 return SDValue();
55934
55935 // We don't have a CMPP instruction for vXf16.
55936 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
55937 return SDValue();
55938 // We can only do this if the vector size is 256 bits or less.
55939 unsigned Size = VT.getSizeInBits();
55940 if (Size > 256 && Subtarget.useAVX512Regs())
55941 return SDValue();
55942
55943 EVT N00VT = N0.getOperand(0).getValueType();
55944
55945 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
55946 // those are the only integer compares we have.
55947 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
55948 if (N00VT.isInteger() && ISD::isUnsignedIntSetCC(CC))
55949 return SDValue();
55950
55951 // Only do this combine if the extension will be fully consumed by the setcc.
55952 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
55953 if (Size != MatchingVecType.getSizeInBits())
55954 return SDValue();
55955
55956 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
55957
55958 if (N->getOpcode() == ISD::ZERO_EXTEND)
55959 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
55960
55961 return Res;
55962}
55963
55964 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
55965 TargetLowering::DAGCombinerInfo &DCI,
55966 const X86Subtarget &Subtarget) {
55967 SDValue N0 = N->getOperand(0);
55968 EVT VT = N->getValueType(0);
55969 SDLoc DL(N);
55970
55971 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55972 if (!DCI.isBeforeLegalizeOps() &&
55973 N0.getOpcode() == X86ISD::SETCC_CARRY) {
55974 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
55975 N0->getOperand(1));
55976 bool ReplaceOtherUses = !N0.hasOneUse();
55977 DCI.CombineTo(N, Setcc);
55978 // Replace other uses with a truncate of the widened setcc_carry.
55979 if (ReplaceOtherUses) {
55980 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
55981 N0.getValueType(), Setcc);
55982 DCI.CombineTo(N0.getNode(), Trunc);
55983 }
55984
55985 return SDValue(N, 0);
55986 }
55987
55988 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
55989 return NewCMov;
55990
55991 if (!DCI.isBeforeLegalizeOps())
55992 return SDValue();
55993
55994 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
55995 return V;
55996
55997 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
55998 DAG, DCI, Subtarget))
55999 return V;
56000
56001 if (VT.isVector()) {
56002 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
56003 return R;
56004
56005 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
56006 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
56007 }
56008
56009 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
56010 return NewAdd;
56011
56012 return SDValue();
56013}
56014
56015// Inverting a constant vector is profitable if it can be eliminated and the
56016// inverted vector is already present in DAG. Otherwise, it will be loaded
56017// anyway.
56018//
56019// We determine which of the values can be completely eliminated and invert it.
56020// If both are eliminable, select a vector with the first negative element.
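// For illustration (arbitrary constants): if FMA(a, b, <2.0, 4.0>) is the only
// user of <2.0, 4.0> but <-2.0, -4.0> already exists in the DAG, negating the
// operand lets the FMA absorb the sign (e.g. becoming an FMSUB) and the
// original constant vector dies.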
56023 "ConstantFP build vector expected");
56024 // Check if we can eliminate V. We assume that if a value is only used in
56025 // FMAs, we can eliminate it, since this function is invoked for each FMA
56026 // with this vector.
56027 auto IsNotFMA = [](SDNode *User) {
56028 return User->getOpcode() != ISD::FMA &&
56029 User->getOpcode() != ISD::STRICT_FMA;
56030 };
56031 if (llvm::any_of(V->users(), IsNotFMA))
56032 return SDValue();
56033
56034 SmallVector<SDValue, 8> Ops;
56035 EVT VT = V.getValueType();
56036 EVT EltVT = VT.getVectorElementType();
56037 for (const SDValue &Op : V->op_values()) {
56038 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
56039 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
56040 } else {
56041 assert(Op.isUndef());
56042 Ops.push_back(DAG.getUNDEF(EltVT));
56043 }
56044 }
56045
56046 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
56047 if (!NV)
56048 return SDValue();
56049
56050 // If an inverted version cannot be eliminated, choose it instead of the
56051 // original version.
56052 if (llvm::any_of(NV->users(), IsNotFMA))
56053 return SDValue(NV, 0);
56054
56055 // If the inverted version also can be eliminated, we have to consistently
56056 // prefer one of the values. We prefer a constant with a negative value on
56057 // the first place.
56058 // N.B. We need to skip undefs that may precede a value.
56059 for (const SDValue &Op : V->op_values()) {
56060 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
56061 if (Cst->isNegative())
56062 return SDValue();
56063 break;
56064 }
56065 }
56066 return SDValue(NV, 0);
56067}
56068
56069 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
56070 TargetLowering::DAGCombinerInfo &DCI,
56071 const X86Subtarget &Subtarget) {
56072 SDLoc dl(N);
56073 EVT VT = N->getValueType(0);
56075 bool IsStrict = N->isTargetOpcode()
56076 ? TSI.isTargetStrictFPOpcode(N->getOpcode())
56077 : N->isStrictFPOpcode();
56078
56079 // Let legalize expand this if it isn't a legal type yet.
56080 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56081 if (!TLI.isTypeLegal(VT))
56082 return SDValue();
56083
56084 SDValue A = N->getOperand(IsStrict ? 1 : 0);
56085 SDValue B = N->getOperand(IsStrict ? 2 : 1);
56086 SDValue C = N->getOperand(IsStrict ? 3 : 2);
56087
56088 // If the operation allows fast-math and the target does not support FMA,
56089 // split this into mul+add to avoid libcall(s).
56090 SDNodeFlags Flags = N->getFlags();
56091 if (!IsStrict && Flags.hasAllowReassociation() &&
56092 TLI.isOperationExpand(ISD::FMA, VT)) {
56093 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
56094 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
56095 }
56096
56097 EVT ScalarVT = VT.getScalarType();
56098 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
56099 !Subtarget.hasAnyFMA()) &&
56100 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()) &&
56101 !(ScalarVT == MVT::bf16 && Subtarget.hasAVX10_2()))
56102 return SDValue();
56103
56104 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
56105 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56106 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56107 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
56108 CodeSize)) {
56109 V = NegV;
56110 return true;
56111 }
56112 // Look through extract_vector_elts. If it comes from an FNEG, create a
56113 // new extract from the FNEG input.
56114 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56115 isNullConstant(V.getOperand(1))) {
56116 SDValue Vec = V.getOperand(0);
56117 if (SDValue NegV = TLI.getCheaperNegatedExpression(
56118 Vec, DAG, LegalOperations, CodeSize)) {
56119 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
56120 NegV, V.getOperand(1));
56121 return true;
56122 }
56123 }
56124 // Lookup if there is an inverted version of constant vector V in DAG.
56125 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
56126 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
56127 V = NegV;
56128 return true;
56129 }
56130 }
56131 return false;
56132 };
56133
56134 // Do not convert the passthru input of scalar intrinsics.
56135 // FIXME: We could allow negations of the lower element only.
56136 bool NegA = invertIfNegative(A);
56137 // Create a dummy use for A so that in the process of negating B or C
56138 // recursively, it is not deleted.
56139 HandleSDNode NegAHandle(A);
56140 bool NegB = invertIfNegative(B);
56141 // Similar to A, get a handle on B.
56142 HandleSDNode NegBHandle(B);
56143 bool NegC = invertIfNegative(C);
56144
56145 if (!NegA && !NegB && !NegC)
56146 return SDValue();
56147
56148 unsigned NewOpcode =
56149 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
56150
56151 // Propagate fast-math-flags to new FMA node.
56152 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
56153 if (IsStrict) {
56154 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
56155 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
56156 {N->getOperand(0), A, B, C});
56157 } else {
56158 if (N->getNumOperands() == 4)
56159 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
56160 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
56161 }
56162}
56163
56164// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
56165// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
56166 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
56167 TargetLowering::DAGCombinerInfo &DCI) {
56168 SDLoc dl(N);
56169 EVT VT = N->getValueType(0);
56170 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56171 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56172 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56173
56174 SDValue N2 = N->getOperand(2);
56175
56176 SDValue NegN2 =
56177 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
56178 if (!NegN2)
56179 return SDValue();
56180 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
56181
56182 if (N->getNumOperands() == 4)
56183 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56184 NegN2, N->getOperand(3));
56185 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56186 NegN2);
56187}
56188
56189// Try to widen the build vector and bitcast it to the type of zext.
56190 // This is a special case for the 128-bit vector types. The intention is to remove
56191 // the zext and replace it with a bitcast to the wider type. While lowering,
56192 // the bitcast is removed and the extra computation due to the zext is avoided.
56193// For example:
56194 // zext v4i16 ( v4i8 build_vector (x, y, z, w)) -> bitcast v4i16 ( v8i8
56195 // build_vector (x, 0, y, 0, z, 0, w, 0))
56196 static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
56197
56198 if (Extend->getOpcode() != ISD::ZERO_EXTEND)
56199 return SDValue();
56200
56201 EVT ExtendVT = Extend->getValueType(0);
56202
56203 SDValue BV = Extend->getOperand(0);
56204 if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
56205 return SDValue();
56206
56207 if (any_of(BV->op_values(), [](SDValue Op) { return Op.isUndef(); })) {
56208 // If the build vector has undef elements, we cannot widen it.
56209 // The widening would create a vector with more undef elements, which
56210 // is not valid.
56211 return SDValue();
56212 }
56213
56214 if (!all_of(BV->op_values(),
56215 [](SDValue Op) { return Op.getOpcode() == ISD::LOAD; })) {
56216 // If the build vector has any element other than an ISD::LOAD, we cannot
56217 // widen it.
56218 return SDValue();
56219 }
56220
56221 SDLoc dl(BV);
56222 EVT VT = BV.getValueType();
56223 EVT EltVT = BV.getOperand(0).getValueType();
56224 unsigned NumElts = VT.getVectorNumElements();
56225
56226 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56227
56228 if (TLI.getTypeAction(*DAG.getContext(), VT) !=
56229 TargetLowering::TypeWidenVector)
56230 return SDValue();
56231
56232 EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
56233 unsigned WidenNumElts = WidenVT.getVectorNumElements();
56234
56235 SmallVector<SDValue, 16> NewOps(BV->op_begin(), BV->op_end());
56236 assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
56237 // Fill the new elements with Zero.
56238 NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
56239 // Compute the step to place the elements in the right place and control the
56240 // iteration.
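// A worked layout example (assuming the v4i8 -> v4i16 case from the comment
// above): NewOps starts as (x, y, z, w, 0, 0, 0, 0) and the swap loop below
// moves the originals to stride 'step', giving (x, 0, y, 0, z, 0, w, 0); on a
// little-endian bitcast to v4i16 each element then reads back as zext of the
// original i8 value.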
56241 unsigned step = WidenNumElts / NumElts;
56242 if (WidenVT.is128BitVector()) {
56243 if (step > 1 && Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
56244 for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
56245 i--, j -= step) {
56246 SDValue temp = NewOps[i];
56247 NewOps[i] = NewOps[j];
56248 NewOps[j] = temp;
56249 }
56250 // Create new build vector with WidenVT and NewOps
56251 SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
56252 // Replace the old build vector with the new one. Bitcast the
56253 // new build vector to the type of the zext.
56254 SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
56255 DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
56256 return NewBV;
56257 }
56258 }
56259 return SDValue();
56260}
56261
56262 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
56263 TargetLowering::DAGCombinerInfo &DCI,
56264 const X86Subtarget &Subtarget) {
56265 SDLoc dl(N);
56266 SDValue N0 = N->getOperand(0);
56267 EVT VT = N->getValueType(0);
56268
56269 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
56270 // FIXME: Is this needed? We don't seem to have any tests for it.
56271 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
56272 N0.getOpcode() == X86ISD::SETCC_CARRY) {
56273 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
56274 N0->getOperand(1));
56275 bool ReplaceOtherUses = !N0.hasOneUse();
56276 DCI.CombineTo(N, Setcc);
56277 // Replace other uses with a truncate of the widened setcc_carry.
56278 if (ReplaceOtherUses) {
56279 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
56280 N0.getValueType(), Setcc);
56281 DCI.CombineTo(N0.getNode(), Trunc);
56282 }
56283
56284 return SDValue(N, 0);
56285 }
56286
56287 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
56288 return NewCMov;
56289
56290 if (DCI.isBeforeLegalizeOps())
56291 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
56292 return V;
56293
56294 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
56295 DAG, DCI, Subtarget))
56296 return V;
56297
56298 if (VT.isVector())
56299 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
56300 return R;
56301
56302 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
56303 return NewAdd;
56304
56305 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
56306 return R;
56307
56308 // TODO: Combine with any target/faux shuffle.
56309 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
56310 VT.getScalarSizeInBits() == 2 * N0.getScalarValueSizeInBits()) {
56311 SDValue N00 = N0.getOperand(0);
56312 SDValue N01 = N0.getOperand(1);
56313 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
56314 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
56315 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
56316 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
56317 return concatSubVectors(N00, N01, DAG, dl);
56318 }
56319 }
56320
56321 if (SDValue V = widenBuildVec(N, DAG))
56322 return V;
56323
56324 return SDValue();
56325}
56326
56327 /// If we have AVX512 but not BWI, and this is a vXi16/vXi8 setcc, just
56328/// pre-promote its result type since vXi1 vectors don't get promoted
56329/// during type legalization.
56330 static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
56331 SDValue RHS, ISD::CondCode CC,
56332 const SDLoc &DL, SelectionDAG &DAG,
56333 const X86Subtarget &Subtarget) {
56334 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
56335 VT.getVectorElementType() == MVT::i1 &&
56336 (OpVT.getVectorElementType() == MVT::i8 ||
56337 OpVT.getVectorElementType() == MVT::i16)) {
56338 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
56339 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
56340 }
56341 return SDValue();
56342}
56343
56344// The pattern (setcc (and (broadcast x), (2^n, 2^{n+1}, ...)), (0, 0, ...),
56345// eq/ne) is generated when using an integer as a mask. Instead of generating a
56346// broadcast + vptest, we can directly move the integer to a mask register.
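// For illustration (arbitrary width): with a v4i32 constant <1, 2, 4, 8>, lane
// i of (and (broadcast x), <1,2,4,8>) is nonzero exactly when bit i of x is
// set, so the eq/ne-with-zero result is just the low 4 bits of x moved into a
// mask register (inverted for the eq case).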
56347 static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
56348 const SDLoc &DL, SelectionDAG &DAG,
56349 const X86Subtarget &Subtarget) {
56350 if (CC != ISD::SETNE && CC != ISD::SETEQ)
56351 return SDValue();
56352
56353 if (!Subtarget.hasAVX512())
56354 return SDValue();
56355
56356 if (Op0.getOpcode() != ISD::AND)
56357 return SDValue();
56358
56359 SDValue Broadcast = Op0.getOperand(0);
56360 if (Broadcast.getOpcode() != X86ISD::VBROADCAST &&
56361 Broadcast.getOpcode() != X86ISD::VBROADCAST_LOAD)
56362 return SDValue();
56363
56364 SDValue Load = Op0.getOperand(1);
56365 EVT LoadVT = Load.getSimpleValueType();
56366
56367 APInt UndefElts;
56368 SmallVector<APInt, 32> EltBits;
56369 if (!getTargetConstantBitsFromNode(Load, LoadVT.getScalarSizeInBits(),
56370 UndefElts, EltBits,
56371 /*AllowWholeUndefs*/ true,
56372 /*AllowPartialUndefs*/ false) ||
56373 UndefElts[0] || !EltBits[0].isPowerOf2() || UndefElts.getBitWidth() > 16)
56374 return SDValue();
56375
56376 // Check if the constant pool contains only powers of 2 starting from some
56377 // 2^N. The table may also contain undefs because of widening of vector
56378 // operands.
56379 unsigned N = EltBits[0].logBase2();
56380 unsigned Len = UndefElts.getBitWidth();
56381 for (unsigned I = 1; I != Len; ++I) {
56382 if (UndefElts[I]) {
56383 if (!UndefElts.extractBits(Len - (I + 1), I + 1).isAllOnes())
56384 return SDValue();
56385 break;
56386 }
56387
56388 if (EltBits[I].getBitWidth() <= N + I || !EltBits[I].isOneBitSet(N + I))
56389 return SDValue();
56390 }
56391
56392 MVT BroadcastOpVT = Broadcast.getSimpleValueType().getVectorElementType();
56393 SDValue BroadcastOp;
56394 if (Broadcast.getOpcode() != X86ISD::VBROADCAST) {
56395 BroadcastOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, BroadcastOpVT,
56396 Broadcast, DAG.getVectorIdxConstant(0, DL));
56397 } else {
56398 BroadcastOp = Broadcast.getOperand(0);
56399 if (BroadcastOp.getValueType().isVector())
56400 return SDValue();
56401 }
56402
56403 SDValue Masked = BroadcastOp;
56404 if (N != 0) {
56405 unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits();
56406 unsigned NumDefinedElts = UndefElts.countTrailingZeros();
56407
56408 if (NumDefinedElts > BroadcastOpBitWidth)
56409 return SDValue();
56410
56411 APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts);
56412 SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp,
56413 DAG.getConstant(N, DL, BroadcastOpVT));
56414 Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue,
56415 DAG.getConstant(Mask, DL, BroadcastOpVT));
56416 }
56417 // We can't extract more than 16 bits using this pattern, because 2^{17} will
56418 // not fit in an i16 and a vXi32 where X > 16 is more than 512 bits.
56419 SDValue Trunc = DAG.getAnyExtOrTrunc(Masked, DL, MVT::i16);
56420 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, MVT::v16i1, Trunc);
56421
56422 if (CC == ISD::SETEQ)
56423 Bitcast = DAG.getNOT(DL, Bitcast, MVT::v16i1);
56424
56425 if (VT != MVT::v16i1)
56426 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Bitcast,
56427 DAG.getVectorIdxConstant(0, DL));
56428
56429 return Bitcast;
56430}
56431
56432 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
56433 TargetLowering::DAGCombinerInfo &DCI,
56434 const X86Subtarget &Subtarget) {
56435 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
56436 const SDValue LHS = N->getOperand(0);
56437 const SDValue RHS = N->getOperand(1);
56438 EVT VT = N->getValueType(0);
56439 EVT OpVT = LHS.getValueType();
56440 SDLoc DL(N);
56441
56442 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56443 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
56444 Subtarget))
56445 return V;
56446 }
56447
56448 if (VT == MVT::i1) {
56449 X86::CondCode X86CC;
56450 if (SDValue V =
56451 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
56452 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
56453 }
56454
56455 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56456 if (OpVT.isScalarInteger()) {
56457 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
56458 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
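// A quick derivation of the fold above: (or X, Y) == X holds exactly when Y
// has no bits set outside X, i.e. (and (not X), Y) == 0, and likewise for the
// not-equal case.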
56459 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
56460 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
56461 if (N0.getOperand(0) == N1)
56462 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56463 N0.getOperand(1));
56464 if (N0.getOperand(1) == N1)
56465 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56466 N0.getOperand(0));
56467 }
56468 return SDValue();
56469 };
56470 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
56471 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56472 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
56473 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56474
56475 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
56476 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
56477 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
56478 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
56479 if (N0.getOperand(0) == N1)
56480 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56481 DAG.getNOT(DL, N0.getOperand(1), OpVT));
56482 if (N0.getOperand(1) == N1)
56483 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56484 DAG.getNOT(DL, N0.getOperand(0), OpVT));
56485 }
56486 return SDValue();
56487 };
56488 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
56489 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56490 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
56491 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56492
56493 // cmpeq(trunc(x),C) --> cmpeq(x,C)
56494 // cmpne(trunc(x),C) --> cmpne(x,C)
56495 // iff x upper bits are zero.
56496 if (LHS.getOpcode() == ISD::TRUNCATE &&
56497 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
56499 EVT SrcVT = LHS.getOperand(0).getValueType();
56500 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
56501 OpVT.getScalarSizeInBits());
56502 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56503 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
56504 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
56505 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
56506 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
56507 }
56508
56509 // With C as a power of 2 and C != 0 and C != INT_MIN:
56510 // icmp eq Abs(X) C ->
56511 // (icmp eq A, C) | (icmp eq A, -C)
56512 // icmp ne Abs(X) C ->
56513 // (icmp ne A, C) & (icmp ne A, -C)
56514 // Both of these patterns can be better optimized in
56515 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
56516 // integers which is checked above.
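// For illustration (arbitrary constant): abs(X) == 8 becomes
// (X == 8) | (X == -8). C == INT_MIN is excluded because -C would wrap back
// to INT_MIN and both compares would collapse into the same test.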
56517 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
56518 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
56519 const APInt &CInt = C->getAPIntValue();
56520 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
56521 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
56522 SDValue BaseOp = LHS.getOperand(0);
56523 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
56524 SDValue SETCC1 = DAG.getSetCC(
56525 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
56526 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
56527 SETCC0, SETCC1);
56528 }
56529 }
56530 }
56531 }
56532 }
56533
56534 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
56535 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
56536 // Using temporaries to avoid messing up operand ordering for later
56537 // transformations if this doesn't work.
56538 SDValue Op0 = LHS;
56539 SDValue Op1 = RHS;
56540 ISD::CondCode TmpCC = CC;
56541 // Put build_vector on the right.
56542 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
56543 std::swap(Op0, Op1);
56544 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
56545 }
56546
56547 bool IsSEXT0 =
56548 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
56549 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
56550 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
56551
56552 if (IsSEXT0 && IsVZero1) {
56553 assert(VT == Op0.getOperand(0).getValueType() &&
56554 "Unexpected operand type");
56555 if (TmpCC == ISD::SETGT)
56556 return DAG.getConstant(0, DL, VT);
56557 if (TmpCC == ISD::SETLE)
56558 return DAG.getConstant(1, DL, VT);
56559 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
56560 return DAG.getNOT(DL, Op0.getOperand(0), VT);
56561
56562 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
56563 "Unexpected condition code!");
56564 return Op0.getOperand(0);
56565 }
56566
56567 if (IsVZero1)
56568 if (SDValue V =
56569 combineAVX512SetCCToKMOV(VT, Op0, TmpCC, DL, DAG, Subtarget))
56570 return V;
56571 }
56572
56573 // Try to make an unsigned vector comparison signed. On pre-AVX512 targets
56574 // there are only signed comparisons (`PCMPGT`), and on AVX512 it's often
56575 // better to use `PCMPGT` if the result is meant to stay in a vector (and if
56576 // it's going to a mask, there are signed AVX512 comparisons).
56577 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
56578 bool CanMakeSigned = false;
56579 if (ISD::isUnsignedIntSetCC(CC)) {
56580 KnownBits CmpKnown =
56581 DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
56582 // If we know LHS/RHS share the same sign bit at each element we can
56583 // make this signed.
56584 // NOTE: `computeKnownBits` on a vector type aggregates common bits
56585 // across all lanes. So a pattern where the sign varies from lane to
56586 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
56587 // missed. We could get around this by demanding each lane
56588 // independently, but this isn't the most important optimization and
56589 // that may eat into compile time.
56590 CanMakeSigned =
56591 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
56592 }
56593 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
56594 SDValue LHSOut = LHS;
56595 SDValue RHSOut = RHS;
56596 ISD::CondCode NewCC = CC;
56597 switch (CC) {
56598 case ISD::SETGE:
56599 case ISD::SETUGE:
56600 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
56601 /*NSW*/ true))
56602 LHSOut = NewLHS;
56603 else if (SDValue NewRHS = incDecVectorConstant(
56604 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
56605 RHSOut = NewRHS;
56606 else
56607 break;
56608
56609 [[fallthrough]];
56610 case ISD::SETUGT:
56611 NewCC = ISD::SETGT;
56612 break;
56613
56614 case ISD::SETLE:
56615 case ISD::SETULE:
56616 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
56617 /*NSW*/ true))
56618 LHSOut = NewLHS;
56619 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
56620 /*NSW*/ true))
56621 RHSOut = NewRHS;
56622 else
56623 break;
56624
56625 [[fallthrough]];
56626 case ISD::SETULT:
56627 // Will be swapped to SETGT in LowerVSETCC*.
56628 NewCC = ISD::SETLT;
56629 break;
56630 default:
56631 break;
56632 }
56633 if (NewCC != CC) {
56634 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
56635 NewCC, DL, DAG, Subtarget))
56636 return R;
56637 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
56638 }
56639 }
56640 }
56641
56642 if (SDValue R =
56643 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
56644 return R;
56645
56646 // In the middle end transforms:
56647 // `(or (icmp eq X, C), (icmp eq X, C+1))`
56648 // -> `(icmp ult (add x, -C), 2)`
56649 // Likewise inverted cases with `ugt`.
56650 //
56651 // Since x86, pre avx512, doesn't have unsigned vector compares, this results
56652 // in worse codegen. So, undo the middle-end transform and go back to `(or
56653 // (icmp eq), (icmp eq))` form.
56654 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
56655 // the xmm approach.
56656 //
56657 // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
56658 // ne))` as it doesn't end up being a net instruction-count win.
56659 // TODO: We might want to do this for avx512 as well if we `sext` the result.
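// A worked example of the undo (arbitrary constant): the middle end turns
// (x == 5) | (x == 6) into (add x, -5) u< 2; below, C0 = -AddC = 5 and
// C1 = C0 - (-1) = 6 recover the two equality compares, which lower to PCMPEQ
// without needing an unsigned vector compare.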
56660 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
56661 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
56662 !Subtarget.hasAVX512() &&
56663 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
56664 Subtarget.hasAVX2()) &&
56665 LHS.hasOneUse()) {
56666
56667 APInt CmpC;
56668 SDValue AddC = LHS.getOperand(1);
56669 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
56671 // See which form we have depending on the constant/condition.
56672 SDValue C0 = SDValue();
56673 SDValue C1 = SDValue();
56674
56675       // If we had `(add x, -1)` and can lower with `umin`, don't transform, as
56676       // we would end up generating an additional constant. Keeping the
56677       // current form has a slight latency cost, but it is probably worth saving
56678       // a constant.
56681 // Pass
56682 }
56683 // Normal Cases
56684 else if ((CC == ISD::SETULT && CmpC == 2) ||
56685 (CC == ISD::SETULE && CmpC == 1)) {
56686 // These will constant fold.
56687 C0 = DAG.getNegative(AddC, DL, OpVT);
56688 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
56689 DAG.getAllOnesConstant(DL, OpVT));
56690 }
56691 // Inverted Cases
56692 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
56693 (CC == ISD::SETUGE && (-CmpC) == 2)) {
56694 // These will constant fold.
56695 C0 = DAG.getNOT(DL, AddC, OpVT);
56696 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
56697 DAG.getAllOnesConstant(DL, OpVT));
56698 }
56699 if (C0 && C1) {
56700 SDValue NewLHS =
56701 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
56702 SDValue NewRHS =
56703 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
56704 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
56705 }
56706 }
56707 }
56708
56709 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
56710 // to avoid scalarization via legalization because v4i32 is not a legal type.
56711 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
56712 LHS.getValueType() == MVT::v4f32)
56713 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
56714
56715 // X pred 0.0 --> X pred -X
56716 // If the negation of X already exists, use it in the comparison. This removes
56717 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
56718 // instructions in patterns with a 'select' node.
56720 SDVTList FNegVT = DAG.getVTList(OpVT);
56721 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
56722 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
56723 }
56724
56725 return SDValue();
56726}
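// A minimal standalone sketch (hypothetical helpers, not referenced by the
// lowering code) of two scalar identities the combines above rely on:
//  (1) when two values share the same sign bit, unsigned and signed comparison
//      agree, which is what justifies the unsigned->signed rewrite;
//  (2) (x - C) <u 2 holds exactly for x == C or x == C + 1, which is the
//      middle-end range-check form being undone. Two's-complement narrowing
//      conversions are assumed.
static bool sketchSameSignBitCompare() {
  for (unsigned A = 0; A != 256; ++A)
    for (unsigned B = 0; B != 256; ++B) {
      if ((A & 0x80) != (B & 0x80))
        continue; // Only claimed when the sign bits match.
      bool UnsignedLT = A < B;
      bool SignedLT = (signed char)A < (signed char)B;
      if (UnsignedLT != SignedLT)
        return false;
    }
  return true;
}
static bool sketchRangeCheckUndo() {
  for (unsigned X = 0; X != 256; ++X)
    for (unsigned C = 0; C != 256; ++C) {
      bool Folded = (unsigned char)(X - C) < 2u;
      bool Unfolded = X == C || X == ((C + 1) & 0xFF);
      if (Folded != Unfolded)
        return false;
    }
  return true;
}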
56727
56728 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
56729                              TargetLowering::DAGCombinerInfo &DCI,
56730                              const X86Subtarget &Subtarget) {
56731 SDValue Src = N->getOperand(0);
56732 MVT SrcVT = Src.getSimpleValueType();
56733 MVT VT = N->getSimpleValueType(0);
56734 unsigned NumBits = VT.getScalarSizeInBits();
56735 unsigned NumElts = SrcVT.getVectorNumElements();
56736 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
56737 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
56738
56739 // Perform constant folding.
56740 APInt UndefElts;
56741 SmallVector<APInt, 32> EltBits;
56742 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
56743 /*AllowWholeUndefs*/ true,
56744 /*AllowPartialUndefs*/ true)) {
56745 APInt Imm(32, 0);
56746 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
56747 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56748 Imm.setBit(Idx);
56749
56750 return DAG.getConstant(Imm, SDLoc(N), VT);
56751 }
56752
56753 // Look through int->fp bitcasts that don't change the element width.
56754 unsigned EltWidth = SrcVT.getScalarSizeInBits();
56755 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
56756 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
56757 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
56758
56759 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
56760 // with scalar comparisons.
56761 if (SDValue NotSrc = IsNOT(Src, DAG)) {
56762 SDLoc DL(N);
56763 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56764 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
56765 return DAG.getNode(ISD::XOR, DL, VT,
56766 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
56767 DAG.getConstant(NotMask, DL, VT));
56768 }
56769
56770 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
56771 // results with scalar comparisons.
56772 if (Src.getOpcode() == X86ISD::PCMPGT &&
56773 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
56774 SDLoc DL(N);
56775 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56776 return DAG.getNode(ISD::XOR, DL, VT,
56777 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
56778 DAG.getConstant(NotMask, DL, VT));
56779 }
56780
56781 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
56782 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
56783 // iff pow2splat(c1).
56784 // Use KnownBits to determine if only a single bit is non-zero
56785 // in each element (pow2 or zero), and shift that bit to the msb.
56786 if (Src.getOpcode() == X86ISD::PCMPEQ) {
56787 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
56788 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
56789 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
56790 if (KnownLHS.countMaxPopulation() == 1 &&
56791 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
56792 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
56793 SDLoc DL(N);
56794 MVT ShiftVT = SrcVT;
56795 SDValue ShiftLHS = Src.getOperand(0);
56796 SDValue ShiftRHS = Src.getOperand(1);
56797 if (ShiftVT.getScalarType() == MVT::i8) {
56798 // vXi8 shifts - we only care about the signbit so can use PSLLW.
56799 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
56800 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
56801 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
56802 }
56803 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56804 ShiftLHS, ShiftAmt, DAG);
56805 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56806 ShiftRHS, ShiftAmt, DAG);
56807 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
56808 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
56809 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
56810 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
56811 }
56812 }
56813
56814 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
56815 if (N->isOnlyUserOf(Src.getNode())) {
56816     SDValue SrcBC = peekThroughOneUseBitcasts(Src);
56817     if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
56818 APInt UndefElts;
56819 SmallVector<APInt, 32> EltBits;
56820 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
56821 UndefElts, EltBits)) {
56822 APInt Mask = APInt::getZero(NumBits);
56823 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
56824 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56825 Mask.setBit(Idx);
56826 }
56827 SDLoc DL(N);
56828 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
56829 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
56830 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
56831 DAG.getConstant(Mask, DL, VT));
56832 }
56833 }
56834 }
56835
56836 // Simplify the inputs.
56837 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56838 APInt DemandedMask(APInt::getAllOnes(NumBits));
56839 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56840 return SDValue(N, 0);
56841
56842 return SDValue();
56843}
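// A minimal standalone sketch (hypothetical helper) of the scalar model behind
// the MOVMSK folds above: MOVMSK packs one sign bit per element, so inverting
// every element is the same as XORing the packed result with the low-NumElts
// bit mask -- the movmsk(not(x)) -> not(movmsk(x)) fold.
static bool sketchMovMskNotFold() {
  const unsigned NumElts = 16;
  for (unsigned Seed = 0; Seed != 1024; ++Seed) {
    unsigned char Elts[NumElts];
    // Derive a deterministic pseudo-random byte vector from the seed.
    for (unsigned I = 0; I != NumElts; ++I)
      Elts[I] = (unsigned char)(Seed * 131u + I * 29u + 7u);
    unsigned Msk = 0, NotMsk = 0;
    for (unsigned I = 0; I != NumElts; ++I) {
      Msk |= ((Elts[I] >> 7) & 1u) << I;
      NotMsk |= (((unsigned char)~Elts[I] >> 7) & 1u) << I;
    }
    if (NotMsk != (Msk ^ ((1u << NumElts) - 1)))
      return false;
  }
  return true;
}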
56844
56847 const X86Subtarget &Subtarget) {
56848 MVT VT = N->getSimpleValueType(0);
56849 unsigned NumBits = VT.getScalarSizeInBits();
56850
56851 // Simplify the inputs.
56852 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56853 APInt DemandedMask(APInt::getAllOnes(NumBits));
56854 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56855 return SDValue(N, 0);
56856
56857 return SDValue();
56858}
56859
56863 SDValue Mask = MemOp->getMask();
56864
56865 // With vector masks we only demand the upper bit of the mask.
56866 if (Mask.getScalarValueSizeInBits() != 1) {
56867 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56868 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56869 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56870 if (N->getOpcode() != ISD::DELETED_NODE)
56871 DCI.AddToWorklist(N);
56872 return SDValue(N, 0);
56873 }
56874 }
56875
56876 return SDValue();
56877}
56878
56879 static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
56880                                     SDValue Index, SDValue Base, SDValue Scale,
56881 SelectionDAG &DAG) {
56882 SDLoc DL(GorS);
56883
56884 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
56885 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
56886 Gather->getMask(), Base, Index, Scale } ;
56887 return DAG.getMaskedGather(Gather->getVTList(),
56888 Gather->getMemoryVT(), DL, Ops,
56889 Gather->getMemOperand(),
56890 Gather->getIndexType(),
56891 Gather->getExtensionType());
56892 }
56893 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
56894 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
56895 Scatter->getMask(), Base, Index, Scale };
56896 return DAG.getMaskedScatter(Scatter->getVTList(),
56897 Scatter->getMemoryVT(), DL,
56898 Ops, Scatter->getMemOperand(),
56899 Scatter->getIndexType(),
56900 Scatter->isTruncatingStore());
56901}
56902
56903 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
56904                                     TargetLowering::DAGCombinerInfo &DCI) {
56905   SDLoc DL(N);
56906 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
56907 SDValue Index = GorS->getIndex();
56908 SDValue Base = GorS->getBasePtr();
56909 SDValue Scale = GorS->getScale();
56910 EVT IndexVT = Index.getValueType();
56911 EVT IndexSVT = IndexVT.getVectorElementType();
56912 unsigned IndexWidth = Index.getScalarValueSizeInBits();
56913 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56914 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
56915
56916 if (DCI.isBeforeLegalize()) {
56917     // Attempt to move a shifted index into the address scale; this allows
56918     // further index truncation below.
56919 if (Index.getOpcode() == ISD::SHL && IndexSVT == PtrVT &&
56920 isa<ConstantSDNode>(Scale)) {
56921 unsigned ScaleAmt = Scale->getAsZExtVal();
56922 assert(isPowerOf2_32(ScaleAmt) && "Scale must be a power of 2");
56923 unsigned Log2ScaleAmt = Log2_32(ScaleAmt);
56924 unsigned MaskBits = IndexWidth - Log2ScaleAmt;
56925 APInt DemandedBits = APInt::getLowBitsSet(IndexWidth, MaskBits);
56926 if (TLI.SimplifyDemandedBits(Index, DemandedBits, DCI)) {
56927 if (N->getOpcode() != ISD::DELETED_NODE)
56928 DCI.AddToWorklist(N);
56929 return SDValue(N, 0);
56930 }
56931 if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
56932 if (*MinShAmt >= 1 && Log2ScaleAmt < 3 &&
56933 DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
56934 SDValue ShAmt = Index.getOperand(1);
56935 SDValue NewShAmt =
56936 DAG.getNode(ISD::SUB, DL, ShAmt.getValueType(), ShAmt,
56937 DAG.getConstant(1, DL, ShAmt.getValueType()));
56938 SDValue NewIndex = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
56939 Index.getOperand(0), NewShAmt);
56940 SDValue NewScale =
56941 DAG.getConstant(ScaleAmt * 2, DL, Scale.getValueType());
56942 return rebuildGatherScatter(GorS, NewIndex, Base, NewScale, DAG);
56943 }
56944 }
56945 }
56946
56947 // Shrink indices if they are larger than 32-bits.
56948 // Only do this before legalize types since v2i64 could become v2i32.
56949 // FIXME: We could check that the type is legal if we're after legalize
56950 // types, but then we would need to construct test cases where that happens.
56951 if (IndexWidth > 32 && DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
56952 EVT NewVT = IndexVT.changeVectorElementType(MVT::i32);
56953
56954 // FIXME: We could support more than just constant fold, but we need to
56955     // be careful with costing. A truncate that can be optimized out would be
56956 // fine. Otherwise we might only want to create a truncate if it avoids
56957 // a split.
56958 if (SDValue TruncIndex =
56959 DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, NewVT, Index))
56960 return rebuildGatherScatter(GorS, TruncIndex, Base, Scale, DAG);
56961
56962     // Shrink any sign/zero extends from a 32-bit or smaller type to a type
56963     // larger than 32 bits if there are sufficient sign bits. Only do this
56964     // before legalize types to avoid creating illegal types in truncate.
56965 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
56966 Index.getOpcode() == ISD::ZERO_EXTEND) &&
56967 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
56968 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56969 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56970 }
56971
56972 // Shrink if we remove an illegal type.
56973 if (!TLI.isTypeLegal(Index.getValueType()) && TLI.isTypeLegal(NewVT)) {
56974 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56975 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56976 }
56977 }
56978 }
56979
56980   // Try to move splat adders from the index operand to the base pointer
56981   // operand, taking care to multiply by the scale. We can only do this when
56982   // the index element type is the same as the pointer type.
56983 // Otherwise we need to be sure the math doesn't wrap before the scale.
56984 if (Index.getOpcode() == ISD::ADD && IndexSVT == PtrVT &&
56985 isa<ConstantSDNode>(Scale)) {
56986 uint64_t ScaleAmt = Scale->getAsZExtVal();
56987
56988 for (unsigned I = 0; I != 2; ++I)
56989 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(I))) {
56990 BitVector UndefElts;
56991 if (SDValue Splat = BV->getSplatValue(&UndefElts)) {
56992 if (UndefElts.none()) {
56993 // If the splat value is constant we can add the scaled splat value
56994 // to the existing base.
56995 if (auto *C = dyn_cast<ConstantSDNode>(Splat)) {
56996 APInt Adder = C->getAPIntValue() * ScaleAmt;
56997 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
56998 DAG.getConstant(Adder, DL, PtrVT));
56999 SDValue NewIndex = Index.getOperand(1 - I);
57000 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
57001 }
57002 // For non-constant cases, limit this to non-scaled cases.
57003 if (ScaleAmt == 1) {
57004 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base, Splat);
57005 SDValue NewIndex = Index.getOperand(1 - I);
57006 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
57007 }
57008 }
57009 }
57010 // It's also possible base is just a constant. In that case, just
57011 // replace it with 0 and move the displacement into the index.
57012 if (ScaleAmt == 1 && BV->isConstant() && isa<ConstantSDNode>(Base)) {
57013 SDValue Splat = DAG.getSplatBuildVector(IndexVT, DL, Base);
57014 // Combine the constant build_vector and the constant base.
57015 Splat =
57016 DAG.getNode(ISD::ADD, DL, IndexVT, Index.getOperand(I), Splat);
57017 // Add to the other half of the original Index add.
57018 SDValue NewIndex = DAG.getNode(ISD::ADD, DL, IndexVT,
57019 Index.getOperand(1 - I), Splat);
57020 SDValue NewBase = DAG.getConstant(0, DL, PtrVT);
57021 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
57022 }
57023 }
57024 }
57025
57026 if (DCI.isBeforeLegalizeOps()) {
57027 // Make sure the index is either i32 or i64
57028 if (IndexWidth != 32 && IndexWidth != 64) {
57029 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
57030 IndexVT = IndexVT.changeVectorElementType(EltVT);
57031 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
57032 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
57033 }
57034 }
57035
57036 // With vector masks we only demand the upper bit of the mask.
57037 SDValue Mask = GorS->getMask();
57038 if (Mask.getScalarValueSizeInBits() != 1) {
57039 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
57040 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
57041 if (N->getOpcode() != ISD::DELETED_NODE)
57042 DCI.AddToWorklist(N);
57043 return SDValue(N, 0);
57044 }
57045 }
57046
57047 return SDValue();
57048}
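// A minimal standalone sketch (hypothetical helper, plain scalar arithmetic)
// of the per-lane address algebra the gather/scatter combines above rely on,
// assuming none of the operations wrap (the code checks sign bits and
// pointer-sized indices before rewriting):
//   Base + ((Index << 1) * Scale)    == Base + (Index * (Scale * 2))
//   Base + ((Index + Splat) * Scale) == (Base + Splat * Scale) + Index * Scale
static bool sketchGatherAddressAlgebra() {
  for (long long Base = -4; Base != 5; ++Base)
    for (long long Index = 0; Index != 65; ++Index)
      for (long long Scale = 1; Scale <= 4; Scale *= 2) {
        const long long Splat = 12;
        if (Base + (Index << 1) * Scale != Base + Index * (Scale * 2))
          return false;
        if (Base + (Index + Splat) * Scale !=
            (Base + Splat * Scale) + Index * Scale)
          return false;
      }
  return true;
}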
57049
57050// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
57051 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
57052                                const X86Subtarget &Subtarget) {
57053 SDLoc DL(N);
57054 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
57055 SDValue EFLAGS = N->getOperand(1);
57056
57057 // Try to simplify the EFLAGS and condition code operands.
57058 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
57059 return getSETCC(CC, Flags, DL, DAG);
57060
57061 return SDValue();
57062}
57063
57064/// Optimize branch condition evaluation.
57065 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
57066                              const X86Subtarget &Subtarget) {
57067 SDLoc DL(N);
57068 SDValue EFLAGS = N->getOperand(3);
57069 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
57070
57071 // Try to simplify the EFLAGS and condition code operands.
57072 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
57073 // RAUW them under us.
57074 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
57075 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
57076 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
57077 N->getOperand(1), Cond, Flags);
57078 }
57079
57080 return SDValue();
57081}
57082
57083// TODO: Could we move this to DAGCombine?
57084 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
57085                                                   SelectionDAG &DAG) {
57086 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
57087 // to optimize away operation when it's from a constant.
57088 //
57089 // The general transformation is:
57090 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
57091 // AND(VECTOR_CMP(x,y), constant2)
57092 // constant2 = UNARYOP(constant)
57093
57094 // Early exit if this isn't a vector operation, the operand of the
57095 // unary operation isn't a bitwise AND, or if the sizes of the operations
57096 // aren't the same.
57097 EVT VT = N->getValueType(0);
57098 bool IsStrict = N->isStrictFPOpcode();
57099 unsigned NumEltBits = VT.getScalarSizeInBits();
57100 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57101 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
57102 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
57103 VT.getSizeInBits() != Op0.getValueSizeInBits())
57104 return SDValue();
57105
57106 // Now check that the other operand of the AND is a constant. We could
57107 // make the transformation for non-constant splats as well, but it's unclear
57108 // that would be a benefit as it would not eliminate any operations, just
57109 // perform one more step in scalar code before moving to the vector unit.
57110 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
57111 // Bail out if the vector isn't a constant.
57112 if (!BV->isConstant())
57113 return SDValue();
57114
57115 // Everything checks out. Build up the new and improved node.
57116 SDLoc DL(N);
57117 EVT IntVT = BV->getValueType(0);
57118 // Create a new constant of the appropriate type for the transformed
57119 // DAG.
57120 SDValue SourceConst;
57121 if (IsStrict)
57122 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
57123 {N->getOperand(0), SDValue(BV, 0)});
57124 else
57125 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
57126 // The AND node needs bitcasts to/from an integer vector type around it.
57127 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
57128 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
57129 MaskConst);
57130 SDValue Res = DAG.getBitcast(VT, NewAnd);
57131 if (IsStrict)
57132 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
57133 return Res;
57134 }
57135
57136 return SDValue();
57137}
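#include <cstring> // std::memcpy, used only by the sketch below.
// A minimal standalone sketch (hypothetical helper) of the per-lane reasoning
// behind the fold above: when a mask lane is all-zeros or all-ones, converting
// (mask & C) to float gives the same bit pattern as masking the converted
// constant, because sitofp(0) == +0.0 has an all-zero encoding.
// Two's-complement int <-> unsigned conversion is assumed.
static bool sketchCmpMaskUnaryOpFold(int C) {
  const unsigned Masks[2] = {0u, 0xFFFFFFFFu};
  for (unsigned M : Masks) {
    float Direct = (float)(int)(M & (unsigned)C); // UNARYOP(AND(mask, C))
    float ConvertedC = (float)C;                  // UNARYOP(C)
    unsigned DirectBits, ConstBits;
    std::memcpy(&DirectBits, &Direct, sizeof(float));
    std::memcpy(&ConstBits, &ConvertedC, sizeof(float));
    if (DirectBits != (ConstBits & M))            // AND(mask, UNARYOP(C))
      return false;
  }
  return true;
}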
57138
57139/// If we are converting a value to floating-point, try to replace scalar
57140/// truncate of an extracted vector element with a bitcast. This tries to keep
57141/// the sequence on XMM registers rather than moving between vector and GPRs.
57142 static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
57143   // TODO: This is currently only used by combineSIntToFP, but it is generalized
57144 // to allow being called by any similar cast opcode.
57145 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
57146 SDValue Trunc = N->getOperand(0);
57147 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
57148 return SDValue();
57149
57150 SDValue ExtElt = Trunc.getOperand(0);
57151 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
57152 !isNullConstant(ExtElt.getOperand(1)))
57153 return SDValue();
57154
57155 EVT TruncVT = Trunc.getValueType();
57156 EVT SrcVT = ExtElt.getValueType();
57157 unsigned DestWidth = TruncVT.getSizeInBits();
57158 unsigned SrcWidth = SrcVT.getSizeInBits();
57159 if (SrcWidth % DestWidth != 0)
57160 return SDValue();
57161
57162 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
57163 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
57164 unsigned VecWidth = SrcVecVT.getSizeInBits();
57165 unsigned NumElts = VecWidth / DestWidth;
57166 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
57167 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
57168 SDLoc DL(N);
57169 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
57170 BitcastVec, ExtElt.getOperand(1));
57171 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
57172}
57173
57174 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
57175                                const X86Subtarget &Subtarget) {
57176 bool IsStrict = N->isStrictFPOpcode();
57177 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57178 EVT VT = N->getValueType(0);
57179 EVT InVT = Op0.getValueType();
57180
57181 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57182   // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
57183 // if hasFP16 support:
57184 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
57185 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
57186 // else
57187 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57188 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
57189 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57190 unsigned ScalarSize = InVT.getScalarSizeInBits();
57191 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57192 ScalarSize >= 64)
57193 return SDValue();
57194 SDLoc dl(N);
57195 EVT DstVT =
57196         EVT::getVectorVT(*DAG.getContext(),
57197                          (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57198 : ScalarSize < 32 ? MVT::i32
57199 : MVT::i64,
57200 InVT.getVectorNumElements());
57201 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57202 if (IsStrict)
57203 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57204 {N->getOperand(0), P});
57205 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57206 }
57207
57208 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
57209 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
57210 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
57211 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57212 VT.getScalarType() != MVT::f16) {
57213 SDLoc dl(N);
57214 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57215 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57216
57217 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
57218 if (IsStrict)
57219 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57220 {N->getOperand(0), P});
57221 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57222 }
57223
57224 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
57225 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
57226 // the optimization here.
57227 SDNodeFlags Flags = N->getFlags();
57228 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
57229 if (IsStrict)
57230 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
57231 {N->getOperand(0), Op0});
57232 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
57233 }
57234
57235 return SDValue();
57236}
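// A minimal standalone sketch (hypothetical helper) of why the final fold
// above is safe: when the sign bit of the input is known to be zero, the
// signed and unsigned interpretations are the same integer, so SINT_TO_FP and
// UINT_TO_FP must produce identical results.
static bool sketchNonNegUIntToFP() {
  for (unsigned X = 0; X <= 0x7FFFFFFFu; X += 65537u)
    if ((double)X != (double)(int)X) // unsigned vs. signed conversion
      return false;
  return true;
}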
57237
57238 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
57239                                TargetLowering::DAGCombinerInfo &DCI,
57240                                const X86Subtarget &Subtarget) {
57241 // First try to optimize away the conversion entirely when it's
57242 // conditionally from a constant. Vectors only.
57243 bool IsStrict = N->isStrictFPOpcode();
57244   if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
57245     return Res;
57246
57247 // Now move on to more general possibilities.
57248 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57249 EVT VT = N->getValueType(0);
57250 EVT InVT = Op0.getValueType();
57251
57252 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57253   // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
57254 // if hasFP16 support:
57255 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
57256 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
57257 // else
57258   //   SINT_TO_FP(vXi1~31)  -> SINT_TO_FP(SEXT(vXi1~31  to vXi32))
57259 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
57260 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57261 unsigned ScalarSize = InVT.getScalarSizeInBits();
57262 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57263 ScalarSize >= 64)
57264 return SDValue();
57265 SDLoc dl(N);
57266 EVT DstVT =
57267         EVT::getVectorVT(*DAG.getContext(),
57268                          (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57269 : ScalarSize < 32 ? MVT::i32
57270 : MVT::i64,
57271 InVT.getVectorNumElements());
57272 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57273 if (IsStrict)
57274 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57275 {N->getOperand(0), P});
57276 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57277 }
57278
57279 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
57280 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
57281 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
57282 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57283 VT.getScalarType() != MVT::f16) {
57284 SDLoc dl(N);
57285 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57286 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57287 if (IsStrict)
57288 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57289 {N->getOperand(0), P});
57290 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57291 }
57292
57293 // Without AVX512DQ we only support i64 to float scalar conversion. For both
57294 // vectors and scalars, see if we know that the upper bits are all the sign
57295 // bit, in which case we can truncate the input to i32 and convert from that.
57296 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
57297 unsigned BitWidth = InVT.getScalarSizeInBits();
57298 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
57299 if (NumSignBits >= (BitWidth - 31)) {
57300 EVT TruncVT = MVT::i32;
57301 if (InVT.isVector())
57302 TruncVT = InVT.changeVectorElementType(TruncVT);
57303 SDLoc dl(N);
57304 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
57305 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
57306 if (IsStrict)
57307 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57308 {N->getOperand(0), Trunc});
57309 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
57310 }
57311 // If we're after legalize and the type is v2i32 we need to shuffle and
57312 // use CVTSI2P.
57313 assert(InVT == MVT::v2i64 && "Unexpected VT!");
57314 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
57315 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
57316 { 0, 2, -1, -1 });
57317 if (IsStrict)
57318 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
57319 {N->getOperand(0), Shuf});
57320 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
57321 }
57322 }
57323
57324 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
57325 // a 32-bit target where SSE doesn't support i64->FP operations.
57326 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
57327 Op0.getOpcode() == ISD::LOAD) {
57328 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
57329
57330 // This transformation is not supported if the result type is f16 or f128.
57331 if (VT == MVT::f16 || VT == MVT::f128)
57332 return SDValue();
57333
57334 // If we have AVX512DQ we can use packed conversion instructions unless
57335 // the VT is f80.
57336 if (Subtarget.hasDQI() && VT != MVT::f80)
57337 return SDValue();
57338
57339 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
57340 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
57341 std::pair<SDValue, SDValue> Tmp =
57342 Subtarget.getTargetLowering()->BuildFILD(
57343 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
57344 Ld->getPointerInfo(), Ld->getBaseAlign(), DAG);
57345 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
57346 return Tmp.first;
57347 }
57348 }
57349
57350 if (IsStrict)
57351 return SDValue();
57352
57353 if (SDValue V = combineToFPTruncExtElt(N, DAG))
57354 return V;
57355
57356 return SDValue();
57357}
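// A minimal standalone sketch (hypothetical helper) of the "truncate to i32"
// step above: when a 64-bit input has at least 33 sign bits it is exactly
// representable in i32, so converting the truncated value gives the same
// floating-point result as converting the original.
static bool sketchTruncatedSIntToFP() {
  for (long long X = -1000000; X <= 1000000; X += 977) {
    double FromI64 = (double)X;
    double FromI32 = (double)(int)X; // lossless: X fits in i32 here
    if (FromI64 != FromI32)
      return false;
  }
  return true;
}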
57358
57360 const X86Subtarget &Subtarget) {
57361 EVT VT = N->getValueType(0);
57362 SDValue Src = N->getOperand(0);
57363 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::FRINT &&
57364 VT.getScalarType() == MVT::i32 && Src.hasOneUse())
57365 return DAG.getNode(ISD::LRINT, SDLoc(N), VT, Src.getOperand(0));
57366
57367 return SDValue();
57368}
57369
57370// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS
57372 const X86Subtarget &Subtarget) {
57373 if (!Subtarget.hasAVX10_2())
57374 return SDValue();
57375
57376 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
57377 EVT SrcVT = N->getOperand(0).getValueType();
57378 EVT DstVT = N->getValueType(0);
57379 SDLoc dl(N);
57380
57381 if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) {
57382 SDValue V2F32Value = DAG.getUNDEF(SrcVT);
57383
57384 // Concatenate the original v2f32 input and V2F32Value to create v4f32
57385 SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
57386 N->getOperand(0), V2F32Value);
57387
57388 // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node
57389 if (IsSigned)
57390 return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc);
57391
57392 return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc);
57393 }
57394 return SDValue();
57395}
57396
57397 static bool needCarryOrOverflowFlag(SDValue Flags) {
57398   assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57399
57400 for (const SDNode *User : Flags->users()) {
57401 X86::CondCode CC;
57402 switch (User->getOpcode()) {
57403 default:
57404 // Be conservative.
57405 return true;
57406 case X86ISD::SETCC:
57407     case X86ISD::SETCC_CARRY:
57408       CC = (X86::CondCode)User->getConstantOperandVal(0);
57409 break;
57410 case X86ISD::BRCOND:
57411 case X86ISD::CMOV:
57412 CC = (X86::CondCode)User->getConstantOperandVal(2);
57413 break;
57414 }
57415
57416 switch (CC) {
57417 // clang-format off
57418 default: break;
57419 case X86::COND_A: case X86::COND_AE:
57420 case X86::COND_B: case X86::COND_BE:
57421 case X86::COND_O: case X86::COND_NO:
57422 case X86::COND_G: case X86::COND_GE:
57423 case X86::COND_L: case X86::COND_LE:
57424 return true;
57425 // clang-format on
57426 }
57427 }
57428
57429 return false;
57430}
57431
57432static bool onlyZeroFlagUsed(SDValue Flags) {
57433 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57434
57435 for (const SDNode *User : Flags->users()) {
57436 unsigned CCOpNo;
57437 switch (User->getOpcode()) {
57438 default:
57439 // Be conservative.
57440 return false;
57441 case X86ISD::SETCC:
57442     case X86ISD::SETCC_CARRY:
57443       CCOpNo = 0;
57444 break;
57445 case X86ISD::BRCOND:
57446 case X86ISD::CMOV:
57447 CCOpNo = 2;
57448 break;
57449 }
57450
57451 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
57452 if (CC != X86::COND_E && CC != X86::COND_NE)
57453 return false;
57454 }
57455
57456 return true;
57457}
57458
57459 static SDValue combineCMP(SDNode *N, SelectionDAG &DAG,
57460                           TargetLowering::DAGCombinerInfo &DCI,
57461                           const X86Subtarget &Subtarget) {
57462 // Only handle test patterns.
57463 if (!isNullConstant(N->getOperand(1)))
57464 return SDValue();
57465
57466 // If we have a CMP of a truncated binop, see if we can make a smaller binop
57467 // and use its flags directly.
57468 // TODO: Maybe we should try promoting compares that only use the zero flag
57469 // first if we can prove the upper bits with computeKnownBits?
57470 SDLoc dl(N);
57471 SDValue Op = N->getOperand(0);
57472 EVT VT = Op.getValueType();
57473 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57474
57475 if (SDValue CMP =
57476 combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget))
57477 return CMP;
57478
57479 // If we have a constant logical shift that's only used in a comparison
57480 // against zero turn it into an equivalent AND. This allows turning it into
57481 // a TEST instruction later.
57482 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
57483 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
57484 onlyZeroFlagUsed(SDValue(N, 0))) {
57485 unsigned BitWidth = VT.getSizeInBits();
57486 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
57487 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
57488 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
57489 APInt Mask = Op.getOpcode() == ISD::SRL
57490 ? APInt::getHighBitsSet(BitWidth, MaskBits)
57491 : APInt::getLowBitsSet(BitWidth, MaskBits);
57492 if (Mask.isSignedIntN(32)) {
57493 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
57494 DAG.getConstant(Mask, dl, VT));
57495 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57496 DAG.getConstant(0, dl, VT));
57497 }
57498 }
57499 }
57500
57501   // If we're extracting from an AVX512 bool vector and comparing against zero,
57502 // then try to just bitcast the vector to an integer to use TEST/BT directly.
57503 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
57504 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
57505 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
57506 SDValue Src = Op.getOperand(0);
57507 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
57508 isNullConstant(Src.getOperand(1)) &&
57509 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
57510 SDValue BoolVec = Src.getOperand(0);
57511 unsigned ShAmt = 0;
57512 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
57513 ShAmt = BoolVec.getConstantOperandVal(1);
57514 BoolVec = BoolVec.getOperand(0);
57515 }
57516 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
57517 EVT VecVT = BoolVec.getValueType();
57518 unsigned BitWidth = VecVT.getVectorNumElements();
57519 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
57520 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
57521 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
57522 Op = DAG.getBitcast(BCVT, BoolVec);
57523 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
57524 DAG.getConstant(Mask, dl, BCVT));
57525 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57526 DAG.getConstant(0, dl, BCVT));
57527 }
57528 }
57529 }
57530
57531 // Peek through any zero-extend if we're only testing for a zero result.
57532 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
57533 SDValue Src = Op.getOperand(0);
57534 EVT SrcVT = Src.getValueType();
57535 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
57536 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
57537 DAG.getConstant(0, dl, SrcVT));
57538 }
57539
57540 // Look for a truncate.
57541 if (Op.getOpcode() != ISD::TRUNCATE)
57542 return SDValue();
57543
57544 SDValue Trunc = Op;
57545 Op = Op.getOperand(0);
57546
57547 // See if we can compare with zero against the truncation source,
57548 // which should help using the Z flag from many ops. Only do this for
57549 // i32 truncated op to prevent partial-reg compares of promoted ops.
57550 EVT OpVT = Op.getValueType();
57551 APInt UpperBits =
57552       APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
57553   if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
57554 onlyZeroFlagUsed(SDValue(N, 0))) {
57555 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57556 DAG.getConstant(0, dl, OpVT));
57557 }
57558
57559 // After this the truncate and arithmetic op must have a single use.
57560 if (!Trunc.hasOneUse() || !Op.hasOneUse())
57561 return SDValue();
57562
57563 unsigned NewOpc;
57564 switch (Op.getOpcode()) {
57565 default: return SDValue();
57566 case ISD::AND:
57567     // Skip AND with a constant. We have special handling for AND with an
57568     // immediate during isel to generate test instructions.
57569 if (isa<ConstantSDNode>(Op.getOperand(1)))
57570 return SDValue();
57571 NewOpc = X86ISD::AND;
57572 break;
57573 case ISD::OR: NewOpc = X86ISD::OR; break;
57574 case ISD::XOR: NewOpc = X86ISD::XOR; break;
57575 case ISD::ADD:
57576 // If the carry or overflow flag is used, we can't truncate.
57577     if (needCarryOrOverflowFlag(SDValue(N, 0)))
57578       return SDValue();
57579 NewOpc = X86ISD::ADD;
57580 break;
57581 case ISD::SUB:
57582 // If the carry or overflow flag is used, we can't truncate.
57583     if (needCarryOrOverflowFlag(SDValue(N, 0)))
57584       return SDValue();
57585 NewOpc = X86ISD::SUB;
57586 break;
57587 }
57588
57589 // We found an op we can narrow. Truncate its inputs.
57590 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
57591 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
57592
57593 // Use a X86 specific opcode to avoid DAG combine messing with it.
57594 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57595 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
57596
57597 // For AND, keep a CMP so that we can match the test pattern.
57598 if (NewOpc == X86ISD::AND)
57599 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57600 DAG.getConstant(0, dl, VT));
57601
57602 // Return the flags.
57603 return Op.getValue(1);
57604}
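// A minimal standalone sketch (hypothetical helper) of the shift-vs-mask
// identity used above when only the zero flag is consumed, checked over all
// 8-bit values and shift amounts:
//   (x >>u c) == 0  <=>  (x & HighBits) == 0, HighBits = bits [c, 7]
//   (x <<  c) == 0  <=>  (x & LowBits)  == 0, LowBits  = bits [0, 7 - c]
static bool sketchShiftCompareToTest() {
  for (unsigned X = 0; X != 256; ++X)
    for (unsigned C = 0; C != 8; ++C) {
      unsigned HighMask = (0xFFu << C) & 0xFFu;
      unsigned LowMask = 0xFFu >> C;
      bool SrlIsZero = (X >> C) == 0;
      bool ShlIsZero = ((X << C) & 0xFFu) == 0;
      if (SrlIsZero != ((X & HighMask) == 0))
        return false;
      if (ShlIsZero != ((X & LowMask) == 0))
        return false;
    }
  return true;
}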
57605
57606 static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
57607                                 TargetLowering::DAGCombinerInfo &DCI,
57608                                 const X86Subtarget &ST) {
57609 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
57610 "Expected X86ISD::ADD or X86ISD::SUB");
57611
57612 SDLoc DL(N);
57613 SDValue LHS = N->getOperand(0);
57614 SDValue RHS = N->getOperand(1);
57615 MVT VT = LHS.getSimpleValueType();
57616 bool IsSub = X86ISD::SUB == N->getOpcode();
57617 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
57618
57619 if (IsSub && isOneConstant(RHS) && !N->hasAnyUseOfValue(0))
57620 if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST))
57621 return CMP;
57622
57623 // If we don't use the flag result, simplify back to a generic ADD/SUB.
57624 if (!N->hasAnyUseOfValue(1)) {
57625 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
57626 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
57627 }
57628
57629 // Fold any similar generic ADD/SUB opcodes to reuse this node.
57630 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
57631 SDValue Ops[] = {N0, N1};
57632 SDVTList VTs = DAG.getVTList(N->getValueType(0));
57633 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
57634 SDValue Op(N, 0);
57635 if (Negate) {
57636 // Bail if this is only used by a user of the x86 add/sub.
57637 if (GenericAddSub->hasOneUse() &&
57638 GenericAddSub->user_begin()->isOnlyUserOf(N))
57639 return;
57640 Op = DAG.getNegative(Op, DL, VT);
57641 }
57642 DCI.CombineTo(GenericAddSub, Op);
57643 }
57644 };
57645 MatchGeneric(LHS, RHS, false);
57646 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
57647
57648 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
57649 // EFLAGS result doesn't change.
57650 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
57651 /*ZeroSecondOpOnly*/ true);
57652}
57653
57655 SDValue LHS = N->getOperand(0);
57656 SDValue RHS = N->getOperand(1);
57657 SDValue BorrowIn = N->getOperand(2);
57658
57659 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
57660 MVT VT = N->getSimpleValueType(0);
57661 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57662 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
57663 }
57664
57665 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
57666 // iff the flag result is dead.
57667 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
57668 !N->hasAnyUseOfValue(1))
57669 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57670 LHS.getOperand(1), BorrowIn);
57671
57672 return SDValue();
57673}
57674
57675// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
57676 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
57677                           TargetLowering::DAGCombinerInfo &DCI) {
57678   SDValue LHS = N->getOperand(0);
57679 SDValue RHS = N->getOperand(1);
57680 SDValue CarryIn = N->getOperand(2);
57681 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
57682 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
57683
57684 // Canonicalize constant to RHS.
57685 if (LHSC && !RHSC)
57686 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
57687 CarryIn);
57688
57689 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
57690 // the result is either zero or one (depending on the input carry bit).
57691 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
57692 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
57693 // We don't have a good way to replace an EFLAGS use, so only do this when
57694 // dead right now.
57695 SDValue(N, 1).use_empty()) {
57696 SDLoc DL(N);
57697 EVT VT = N->getValueType(0);
57698 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
57699 SDValue Res1 = DAG.getNode(
57700 ISD::AND, DL, VT,
57701         DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
57702                     DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
57703 DAG.getConstant(1, DL, VT));
57704 return DCI.CombineTo(N, Res1, CarryOut);
57705 }
57706
57707 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
57708 // iff the flag result is dead.
57709 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
57710 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
57711 SDLoc DL(N);
57712 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
57713 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
57714 DAG.getConstant(0, DL, LHS.getValueType()),
57715 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
57716 }
57717
57718 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
57719 MVT VT = N->getSimpleValueType(0);
57720 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57721 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
57722 }
57723
57724 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
57725 // iff the flag result is dead.
57726 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
57727 !N->hasAnyUseOfValue(1))
57728 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57729 LHS.getOperand(1), CarryIn);
57730
57731 return SDValue();
57732}
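// A minimal standalone sketch (hypothetical helper) of the ADC re-associations
// above: when the flag output is dead, ADC is simply A + B + CarryIn, so two
// constants can be pre-summed and an inner ADD can be absorbed.
static bool sketchAdcReassociation() {
  for (unsigned long long A = 0; A != 64; ++A)
    for (unsigned long long B = 0; B != 64; ++B)
      for (unsigned long long Carry = 0; Carry != 2; ++Carry) {
        unsigned long long Adc = A + B + Carry;  // ADC(A, B, Carry)
        if (Adc != 0 + (A + B) + Carry)          // ADC(0, A+B, Carry)
          return false;
        if ((A + B) + 0 + Carry != Adc)          // ADC(ADD(A, B), 0, Carry)
          return false;
      }
  return true;
}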
57733
57734 static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
57735                             const SDLoc &DL, EVT VT,
57736 const X86Subtarget &Subtarget) {
57737 using namespace SDPatternMatch;
57738
57739 // Example of pattern we try to detect:
57740 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
57741 //(add (build_vector (extract_elt t, 0),
57742 // (extract_elt t, 2),
57743 // (extract_elt t, 4),
57744 // (extract_elt t, 6)),
57745 // (build_vector (extract_elt t, 1),
57746 // (extract_elt t, 3),
57747 // (extract_elt t, 5),
57748 // (extract_elt t, 7)))
57749
57750 if (!Subtarget.hasSSE2())
57751 return SDValue();
57752
57753 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57754 VT.getVectorNumElements() < 4 ||
57755       !isPowerOf2_32(VT.getVectorNumElements()))
57756     return SDValue();
57757
57758 SDValue Op0, Op1, Accum;
57763 m_Value(Op1))))))
57764 return SDValue();
57765
57766 // Check if one of Op0,Op1 is of the form:
57767 // (build_vector (extract_elt Mul, 0),
57768 // (extract_elt Mul, 2),
57769 // (extract_elt Mul, 4),
57770 // ...
57771 // the other is of the form:
57772 // (build_vector (extract_elt Mul, 1),
57773 // (extract_elt Mul, 3),
57774 // (extract_elt Mul, 5),
57775 // ...
57776 // and identify Mul.
57777 SDValue Mul;
57778 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
57779 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
57780 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
57781 // TODO: Be more tolerant to undefs.
57782 APInt Idx0L, Idx0H, Idx1L, Idx1H;
57783 SDValue Vec0L, Vec0H, Vec1L, Vec1H;
57784 if (!sd_match(Op0L, m_ExtractElt(m_Value(Vec0L), m_ConstInt(Idx0L))) ||
57785 !sd_match(Op0H, m_ExtractElt(m_Value(Vec0H), m_ConstInt(Idx0H))) ||
57786 !sd_match(Op1L, m_ExtractElt(m_Value(Vec1L), m_ConstInt(Idx1L))) ||
57787 !sd_match(Op1H, m_ExtractElt(m_Value(Vec1H), m_ConstInt(Idx1H))))
57788 return SDValue();
57789 // Commutativity of mul allows factors of a product to reorder.
57790 if (Idx0L.getZExtValue() > Idx1L.getZExtValue())
57791 std::swap(Idx0L, Idx1L);
57792 if (Idx0H.getZExtValue() > Idx1H.getZExtValue())
57793 std::swap(Idx0H, Idx1H);
57794 // Commutativity of add allows pairs of factors to reorder.
57795 if (Idx0L.getZExtValue() > Idx0H.getZExtValue()) {
57796 std::swap(Idx0L, Idx0H);
57797 std::swap(Idx1L, Idx1H);
57798 }
57799 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
57800 Idx1H != 2 * i + 3)
57801 return SDValue();
57802 if (!Mul) {
57803 // First time an extract_elt's source vector is visited. Must be a MUL
57804     // with 2X the number of vector elements of the BUILD_VECTOR.
57805 // Both extracts must be from same MUL.
57806 Mul = Vec0L;
57807 if (Mul.getOpcode() != ISD::MUL ||
57808 Mul.getValueType().getVectorNumElements() != 2 * e)
57809 return SDValue();
57810 }
57811 // Check that the extract is from the same MUL previously seen.
57812 if (Mul != Vec0L || Mul != Vec1L || Mul != Vec0H || Mul != Vec1H)
57813 return SDValue();
57814 }
57815
57816 // Check if the Mul source can be safely shrunk.
57817   ShrinkMode Mode;
57818   if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
57819       Mode == ShrinkMode::MULU16)
57820     return SDValue();
57821
57822 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57823 VT.getVectorNumElements() * 2);
57824 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
57825 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
57826
57827 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57828                          ArrayRef<SDValue> Ops) {
57829     EVT InVT = Ops[0].getValueType();
57830 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
57831 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57832 InVT.getVectorNumElements() / 2);
57833 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57834 };
57835 SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDBuilder);
57836 if (Accum)
57837 R = DAG.getNode(ISD::ADD, DL, VT, R, Accum);
57838 return R;
57839}
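// A minimal standalone sketch (hypothetical helper, assumes 16-bit short and
// 32-bit int) of the scalar reference behaviour of VPMADDWD that the pattern
// above reassembles: each i32 lane is a[2i]*b[2i] + a[2i+1]*b[2i+1] with the
// i16 inputs sign-extended, i.e. exactly the sum of the even and odd products
// of the widened multiply.
static void sketchPMADDWDReference(const short *A, const short *B, int *Out,
                                   unsigned NumI32Lanes) {
  for (unsigned I = 0; I != NumI32Lanes; ++I) {
    long long Lo = (long long)A[2 * I] * B[2 * I];
    long long Hi = (long long)A[2 * I + 1] * B[2 * I + 1];
    // The only sum that exceeds i32 is 2 * (-32768 * -32768); the instruction
    // wraps that to INT32_MIN as well.
    Out[I] = (int)(unsigned)(Lo + Hi);
  }
}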
57840
57841// Attempt to turn this pattern into PMADDWD.
57842// (add (mul (sext (build_vector)), (sext (build_vector))),
57843// (mul (sext (build_vector)), (sext (build_vector)))
57844 static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N,
57845                               const SDLoc &DL, EVT VT,
57846 const X86Subtarget &Subtarget) {
57847 using namespace SDPatternMatch;
57848
57849 if (!Subtarget.hasSSE2())
57850 return SDValue();
57851
57852 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57853 VT.getVectorNumElements() < 4 ||
57854       !isPowerOf2_32(VT.getVectorNumElements()))
57855     return SDValue();
57856
57857 // All inputs need to be sign extends.
57858 // TODO: Support ZERO_EXTEND from known positive?
57859 SDValue N00, N01, N10, N11;
57860 if (!sd_match(N, m_Add(m_Mul(m_SExt(m_Value(N00)), m_SExt(m_Value(N01))),
57861 m_Mul(m_SExt(m_Value(N10)), m_SExt(m_Value(N11))))))
57862 return SDValue();
57863
57864 // Must be extending from vXi16.
57865 EVT InVT = N00.getValueType();
57866 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
57867 N10.getValueType() != InVT || N11.getValueType() != InVT)
57868 return SDValue();
57869
57870 // All inputs should be build_vectors.
57871 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
57872 N01.getOpcode() != ISD::BUILD_VECTOR ||
57873 N10.getOpcode() != ISD::BUILD_VECTOR ||
57874       N11.getOpcode() != ISD::BUILD_VECTOR)
57875     return SDValue();
57876
57877 // For each element, we need to ensure we have an odd element from one vector
57878 // multiplied by the odd element of another vector and the even element from
57879 // one of the same vectors being multiplied by the even element from the
57880 // other vector. So we need to make sure for each element i, this operator
57881 // is being performed:
57882 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
57883 SDValue In0, In1;
57884 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
57885 SDValue N00Elt = N00.getOperand(i);
57886 SDValue N01Elt = N01.getOperand(i);
57887 SDValue N10Elt = N10.getOperand(i);
57888 SDValue N11Elt = N11.getOperand(i);
57889 // TODO: Be more tolerant to undefs.
57890 SDValue N00In, N01In, N10In, N11In;
57891 APInt IdxN00, IdxN01, IdxN10, IdxN11;
57892 if (!sd_match(N00Elt, m_ExtractElt(m_Value(N00In), m_ConstInt(IdxN00))) ||
57893 !sd_match(N01Elt, m_ExtractElt(m_Value(N01In), m_ConstInt(IdxN01))) ||
57894 !sd_match(N10Elt, m_ExtractElt(m_Value(N10In), m_ConstInt(IdxN10))) ||
57895 !sd_match(N11Elt, m_ExtractElt(m_Value(N11In), m_ConstInt(IdxN11))))
57896 return SDValue();
57897 // Add is commutative so indices can be reordered.
57898 if (IdxN00.getZExtValue() > IdxN10.getZExtValue()) {
57899 std::swap(IdxN00, IdxN10);
57900 std::swap(IdxN01, IdxN11);
57901 }
57902     // N0 indices must be the even element. N1 indices must be the next odd element.
57903 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
57904 IdxN11 != 2 * i + 1)
57905 return SDValue();
57906
57907 // First time we find an input capture it.
57908 if (!In0) {
57909 In0 = N00In;
57910 In1 = N01In;
57911
57912 // The input vectors must be at least as wide as the output.
57913     // If they are larger than the output, we extract a subvector below.
57914 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
57915 In1.getValueSizeInBits() < VT.getSizeInBits())
57916 return SDValue();
57917 }
57918 // Mul is commutative so the input vectors can be in any order.
57919 // Canonicalize to make the compares easier.
57920 if (In0 != N00In)
57921 std::swap(N00In, N01In);
57922 if (In0 != N10In)
57923 std::swap(N10In, N11In);
57924 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
57925 return SDValue();
57926 }
57927
57928 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57929                          ArrayRef<SDValue> Ops) {
57930     EVT OpVT = Ops[0].getValueType();
57931 assert(OpVT.getScalarType() == MVT::i16 &&
57932 "Unexpected scalar element type");
57933 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
57934 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57935 OpVT.getVectorNumElements() / 2);
57936 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57937 };
57938
57939 // If the output is narrower than an input, extract the low part of the input
57940 // vector.
57941 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57942 VT.getVectorNumElements() * 2);
57943 if (OutVT16.bitsLT(In0.getValueType())) {
57944 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
57945 DAG.getVectorIdxConstant(0, DL));
57946 }
57947 if (OutVT16.bitsLT(In1.getValueType())) {
57948 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
57949 DAG.getVectorIdxConstant(0, DL));
57950 }
57951 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
57952 PMADDBuilder);
57953}
57954
57955// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
57956// If the upper element in each pair of both VPMADDWDs is zero, then we can
57957// merge the operand elements and use the implicit add of VPMADDWD.
57958// TODO: Add support for VPMADDUBSW (which isn't commutable).
57959 static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
57960                                    const SDLoc &DL, EVT VT) {
57961 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
57962 return SDValue();
57963
57964 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
57965 if (VT.getSizeInBits() > 128)
57966 return SDValue();
57967
57968 unsigned NumElts = VT.getVectorNumElements();
57969 MVT OpVT = N0.getOperand(0).getSimpleValueType();
57970   APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
57971   APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
57972
57973 bool Op0HiZero =
57974 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
57975 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
57976 bool Op1HiZero =
57977 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
57978 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
57979
57980 // TODO: Check for zero lower elements once we have actual codegen that
57981 // creates them.
57982 if (!Op0HiZero || !Op1HiZero)
57983 return SDValue();
57984
57985 // Create a shuffle mask packing the lower elements from each VPMADDWD.
57986 SmallVector<int> Mask;
57987 for (int i = 0; i != (int)NumElts; ++i) {
57988 Mask.push_back(2 * i);
57989 Mask.push_back(2 * (i + NumElts));
57990 }
57991
57992 SDValue LHS =
57993 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
57994 SDValue RHS =
57995 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
57996 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
57997}
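// A minimal standalone sketch (hypothetical helper) of the merge above: if the
// upper i16 of every pair is zero, each VPMADDWD lane reduces to a single
// product, so the two separate multiply-adds can be packed into one VPMADDWD
// whose pairs interleave the low elements and reuse its implicit add.
static bool sketchMergedPMADDWDLane(short X, short Y, short Z, short W) {
  int Separate = ((int)X * Y + 0 * 0) + ((int)Z * W + 0 * 0); // two VPMADDWDs
  int Merged = (int)X * Y + (int)Z * W; // one VPMADDWD pair {X,Z} . {Y,W}
  return Separate == Merged;
}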
57998
57999/// CMOV of constants requires materializing constant operands in registers.
58000/// Try to fold those constants into an 'add' instruction to reduce instruction
58001/// count. We do this with CMOV rather than the generic 'select' because there are
58002/// earlier folds that may be used to turn select-of-constants into logic hacks.
58003 static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
58004                                        SelectionDAG &DAG,
58005 const X86Subtarget &Subtarget) {
58006 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
58007 // better because we eliminate 1-2 instructions. This transform is still
58008 // an improvement without zero operands because we trade 2 move constants and
58009 // 1 add for 2 adds (LEA) as long as the constants can be represented as
58010 // immediate asm operands (fit in 32-bits).
58011 auto isSuitableCmov = [](SDValue V) {
58012 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
58013 return false;
58014 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
58015 !isa<ConstantSDNode>(V.getOperand(1)))
58016 return false;
58017 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
58018 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
58019 V.getConstantOperandAPInt(1).isSignedIntN(32));
58020 };
58021
58022 // Match an appropriate CMOV as the first operand of the add.
58023 SDValue Cmov = N->getOperand(0);
58024 SDValue OtherOp = N->getOperand(1);
58025 if (!isSuitableCmov(Cmov))
58026 std::swap(Cmov, OtherOp);
58027 if (!isSuitableCmov(Cmov))
58028 return SDValue();
58029
58030 // Don't remove a load folding opportunity for the add. That would neutralize
58031 // any improvements from removing constant materializations.
58032 if (X86::mayFoldLoad(OtherOp, Subtarget))
58033 return SDValue();
58034
58035 EVT VT = N->getValueType(0);
58036 SDValue FalseOp = Cmov.getOperand(0);
58037 SDValue TrueOp = Cmov.getOperand(1);
58038
58039 // We will push the add through the select, but we can potentially do better
58040 // if we know there is another add in the sequence and this is pointer math.
58041 // In that case, we can absorb an add into the trailing memory op and avoid
58042 // a 3-operand LEA which is likely slower than a 2-operand LEA.
58043 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
58044 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
58045 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
58046 all_of(N->users(), [&](SDNode *Use) {
58047 auto *MemNode = dyn_cast<MemSDNode>(Use);
58048 return MemNode && MemNode->getBasePtr().getNode() == N;
58049 })) {
58050 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
58051 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
58052 // it is possible that choosing op1 might be better.
58053 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
58054 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
58055 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
58056 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
58057 Cmov.getOperand(2), Cmov.getOperand(3));
58058 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
58059 }
58060
58061 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
58062 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
58063 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
58064 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
58065 Cmov.getOperand(3));
58066}
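// A minimal standalone sketch (hypothetical helper) of the rewrite above: an
// add distributes over a conditional move of constants, so the constant
// materializations are traded for two (often LEA-friendly) adds.
static bool sketchAddOfCmov(bool Cond, unsigned long long C1,
                            unsigned long long C2, unsigned long long Other) {
  unsigned long long Before = (Cond ? C1 : C2) + Other;          // add(cmov, Other)
  unsigned long long After = Cond ? (Other + C1) : (Other + C2); // cmov of adds
  return Before == After;
}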
58067
58068// Attempt to turn ADD(MUL(x, y), acc) -> VPMADD52L
58069// when the upper 12 bits of x, y and MUL(x, y) are known to be 0.
58071 EVT VT, const X86Subtarget &Subtarget) {
58072 using namespace SDPatternMatch;
58073 if (!VT.isVector() || VT.getScalarSizeInBits() != 64 ||
58074 (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()))
58075 return SDValue();
58076
58077 // Need AVX-512VL vector length extensions if operating on XMM/YMM registers
58078 if (!Subtarget.hasAVXIFMA() && !Subtarget.hasVLX() &&
58079 VT.getSizeInBits() < 512)
58080 return SDValue();
58081
58082 const auto TotalSize = VT.getSizeInBits();
58083 if (TotalSize < 128 || !isPowerOf2_64(TotalSize))
58084 return SDValue();
58085
58086 SDValue X, Y, Acc;
58087 if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
58088 return SDValue();
58089
58090 KnownBits KnownX = DAG.computeKnownBits(X);
58091 if (KnownX.countMinLeadingZeros() < 12)
58092 return SDValue();
58093 KnownBits KnownY = DAG.computeKnownBits(Y);
58094 if (KnownY.countMinLeadingZeros() < 12)
58095 return SDValue();
58096 KnownBits KnownMul = KnownBits::mul(KnownX, KnownY);
58097 if (KnownMul.countMinLeadingZeros() < 12)
58098 return SDValue();
58099
58100 auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL,
58101 ArrayRef<SDValue> SubOps) {
58102 EVT SubVT = SubOps[0].getValueType();
58103 assert(SubVT.getScalarSizeInBits() == 64 &&
58104 "Unexpected element size, only supports 64bit size");
58105 return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[1] /*X*/,
58106 SubOps[2] /*Y*/, SubOps[0] /*Acc*/);
58107 };
58108
58109 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder,
58110 /*CheckBWI*/ false,
58111 /*AllowAVX512*/ Subtarget.hasIFMA());
58112}
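// Illustrative sketch: if both multiplicands were masked to at most 26
// significant bits (e.g. and v, splat(0x3FFFFFF)), then x, y and x*y all fit
// in 52 bits, the known-bits checks above succeed, and add(mul(x, y), acc)
// becomes VPMADD52L (the low 52 bits of the 104-bit product added to acc).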
58113
58114 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
58115                           TargetLowering::DAGCombinerInfo &DCI,
58116                           const X86Subtarget &Subtarget) {
58117 using namespace SDPatternMatch;
58118 EVT VT = N->getValueType(0);
58119 SDValue Op0 = N->getOperand(0);
58120 SDValue Op1 = N->getOperand(1);
58121 SDLoc DL(N);
58122
58123 if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
58124 return Select;
58125
58126 if (SDValue MAdd = matchPMADDWD(DAG, N, DL, VT, Subtarget))
58127 return MAdd;
58128 if (SDValue MAdd = matchPMADDWD_2(DAG, N, DL, VT, Subtarget))
58129 return MAdd;
58130 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
58131 return MAdd;
58132
58133 // Try to synthesize horizontal adds from adds of shuffles.
58134 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58135 return V;
58136
58137   // Prefer VSHLI to reduce uses; X86FixupInstTunings may revert this depending
58138   // on the scheduler model. Limit multiple users to AVX+ targets to prevent
58139   // introducing extra register moves.
58140 if (Op0 == Op1 && supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL))
58141 if (Subtarget.hasAVX() || N->isOnlyUserOf(Op0.getNode()))
58143 Op0, 1, DAG);
58144
58145 // Canonicalize hidden LEA pattern:
58146 // Fold (add (sub (shl x, c), y), z) -> (sub (add (shl x, c), z), y)
58147 // iff c < 4
58148 if (VT == MVT::i32 || VT == MVT::i64) {
58149 SDValue Y, Z, Shift;
58150 APInt Amt;
58151 if (sd_match(
58153 m_Shl(m_Value(), m_ConstInt(Amt))),
58154 m_Value(Y))),
58155 m_Value(Z))) &&
58156 Amt.ult(4) && !isa<ConstantSDNode>(Z)) {
58157 return DAG.getNode(ISD::SUB, DL, VT,
58158 DAG.getNode(ISD::ADD, DL, VT, Shift, Z), Y);
58159 }
58160 }
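  // Worked example (illustrative): (add (sub (shl x, 2), y), z) is rewritten as
  // (sub (add (shl x, 2), z), y); the inner add then matches a single
  // LEA [z + x*4], leaving one plain SUB. The c < 4 guard exists because LEA
  // scale factors only cover shifts of 0..3 (scales 1/2/4/8).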
58161
58162 SDValue X, Y;
58163
58164 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
58165 // iff X and Y won't overflow.
58166 if (sd_match(Op0, m_c_BinOp(X86ISD::PSADBW, m_Value(X), m_Zero())) &&
58167       sd_match(Op1, m_c_BinOp(X86ISD::PSADBW, m_Value(Y), m_Zero())) &&
58168       DAG.willNotOverflowAdd(/*IsSigned=*/false, X, Y)) {
58169 MVT OpVT = X.getSimpleValueType();
58170 SDValue Sum = DAG.getNode(ISD::ADD, DL, OpVT, X, Y);
58171 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
58172 getZeroVector(OpVT, Subtarget, DAG, DL));
58173 }
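  // Why this is legal (illustrative): psadbw(V, 0) sums the unsigned byte
  // values of V within each 64-bit lane, so as long as X + Y cannot overflow
  // any byte (checked via willNotOverflowAdd above), sum(X) + sum(Y) equals
  // sum(X + Y) and a single PSADBW suffices.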
58174
58175 if (VT.isVector()) {
58176 EVT BoolVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
58177                                   VT.getVectorElementCount());
58178
58179 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
58180 // (sub Y, (sext (vXi1 X))).
58181 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y)
58182 // in generic DAG combine without a legal type check, but adding this there
58183 // caused regressions.
58184 if (DAG.getTargetLoweringInfo().isTypeLegal(BoolVT) &&
58186 m_Value(Y)))) {
58187 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, X);
58188 return DAG.getNode(ISD::SUB, DL, VT, Y, SExt);
58189 }
58190
58191 // Fold (add X, (srl Y, 7)) -> (sub X, (icmp_sgt 0, Y)) to undo instcombine
58192 // canonicalisation as we don't have good vXi8 shifts.
58193 if (VT.getScalarType() == MVT::i8 &&
58195 SDValue Cmp =
58196 DAG.getSetCC(DL, BoolVT, DAG.getConstant(0, DL, VT), Y, ISD::SETGT);
58197 return DAG.getNode(ISD::SUB, DL, VT, X, DAG.getSExtOrTrunc(Cmp, DL, VT));
58198 }
58199 }
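  // Why the srl-by-7 fold above is safe (illustrative): a logical vXi8 shift
  // right by 7 leaves only the sign bit, i.e. 1 for negative bytes and 0
  // otherwise, while setgt(0, Y) yields -1 for negative bytes and 0 otherwise;
  // subtracting that 0/-1 mask adds the same 0/1 without needing a vXi8 shift.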
58200
58201   // Peephole for 512-bit VPDPWSSD on non-VLX targets.
58202 // TODO: Should this be part of matchPMADDWD/matchPMADDWD_2?
58203 if (Subtarget.hasVNNI() && Subtarget.useAVX512Regs() && VT == MVT::v16i32) {
58204 SDValue Accum, Lo0, Lo1, Hi0, Hi1;
58205 if (sd_match(N, m_Add(m_Value(Accum),
58208 m_Value(Lo1)),
58210 m_Value(Hi1)))))) {
58211 return DAG.getNode(X86ISD::VPDPWSSD, DL, VT, Accum,
58212 concatSubVectors(Lo0, Hi0, DAG, DL),
58213 concatSubVectors(Lo1, Hi1, DAG, DL));
58214 }
58215 }
58216
58217 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
58218 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
58219 X86::isZeroNode(Op0.getOperand(1))) {
58220 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
58221 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
58222 Op0.getOperand(0), Op0.getOperand(2));
58223 }
58224
58225 if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget))
58226 return IFMA52;
58227
58228 return combineAddOrSubToADCOrSBB(N, DL, DAG);
58229}
58230
58231// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
58232// condition comes from the subtract node that produced -X. This matches the
58233// cmov expansion for absolute value. By swapping the operands we convert abs
58234// to nabs.
58235static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1,
58236 SelectionDAG &DAG) {
58237 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
58238 return SDValue();
58239
58240 SDValue Cond = N1.getOperand(3);
58241 if (Cond.getOpcode() != X86ISD::SUB)
58242 return SDValue();
58243 assert(Cond.getResNo() == 1 && "Unexpected result number");
58244
58245 SDValue FalseOp = N1.getOperand(0);
58246 SDValue TrueOp = N1.getOperand(1);
58247   X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
58248
58249 // ABS condition should come from a negate operation.
58250 if ((CC == X86::COND_S || CC == X86::COND_NS) &&
58251 isNullConstant(Cond.getOperand(0))) {
58252 // Get the X and -X from the negate.
58253 SDValue NegX = Cond.getValue(0);
58254 SDValue X = Cond.getOperand(1);
58255
58256 // Cmov operands should be X and NegX. Order doesn't matter.
58257 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
58258 return SDValue();
58259
58260 // Build a new CMOV with the operands swapped.
58261 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
58262 N1.getOperand(2), Cond);
58263 // Convert sub to add.
58264 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
58265 }
58266
58267 // Handle ABD special case:
58268 // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
58269 // ABD condition should come from a pair of matching subtracts.
58270 if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) &&
58271 (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) &&
58272 (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) &&
58273 (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) &&
58274 (TrueOp.getOperand(0) == FalseOp.getOperand(1)) &&
58275 (TrueOp.getOperand(1) == FalseOp.getOperand(0))) {
58276 // Build a new CMOV with the operands swapped.
58277 return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2),
58278 Cond);
58279 }
58280
58281 return SDValue();
58282}
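// Worked example (illustrative): y - abs(x), expanded as sub(y, cmov) where
// the cmov selects between x and -x on the flags of the 0 - x negate, becomes
// add(y, cmov) with the cmov operands swapped, since y - abs(x) == y + nabs(x);
// no extra negation is required.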
58283
58284 static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
58285   SDValue Op0 = N->getOperand(0);
58286 SDValue Op1 = N->getOperand(1);
58287
58288 // (sub C (zero_extend (setcc)))
58289 // =>
58290 // (add (zero_extend (setcc inverted) C-1)) if C is a nonzero immediate
58291 // Don't disturb (sub 0 setcc), which is easily done with neg.
58292 EVT VT = N->getValueType(0);
58293 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
58294 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
58295 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
58296 Op1.getOperand(0).hasOneUse()) {
58297 SDValue SetCC = Op1.getOperand(0);
58298     X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
58299     X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
58300     APInt NewImm = Op0C->getAPIntValue() - 1;
58301 SDLoc DL(Op1);
58302 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
58303 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
58304 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
58305 DAG.getConstant(NewImm, DL, VT));
58306 }
58307
58308 return SDValue();
58309}
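// Worked example (illustrative): sub(5, zext(setcc cc)) becomes
// add(zext(setcc !cc), 4), since zext(setcc cc) == 1 - zext(setcc !cc) and
// therefore C - zext(cc) == (C - 1) + zext(!cc). Folding the constant into an
// ADD keeps the setcc and avoids a subtract with an immediate LHS.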
58310
58311 static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) {
58312   if (N->getConstantOperandVal(3) != X86::COND_NE)
58313 return SDValue();
58314
58315 SDValue Sub = N->getOperand(4);
58316 if (Sub.getOpcode() != X86ISD::SUB)
58317 return SDValue();
58318
58319 SDValue Op1 = Sub.getOperand(1);
58320
58321 if (!X86::isZeroNode(Sub.getOperand(0)))
58322 return SDValue();
58323
58324 SDLoc DL(N);
58325 SmallVector<SDValue, 5> Ops(N->op_values());
58326 if (Op1.getOpcode() == X86ISD::SETCC) {
58327 // res, flags2 = sub 0, (setcc cc, flag)
58328 // cload/cstore ..., cond_ne, flag2
58329 // ->
58330 // cload/cstore cc, flag
58331 Ops[3] = Op1.getOperand(0);
58332 Ops[4] = Op1.getOperand(1);
58333 } else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) {
58334 SDValue Src = Op1;
58335 SDValue Op10 = Op1.getOperand(0);
58336 if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1))) {
58337 // res, flags2 = sub 0, (and (xor X, -1), Y)
58338 // cload/cstore ..., cond_ne, flag2
58339 // ->
58340 // res, flags2 = sub 0, (and X, Y)
58341 // cload/cstore ..., cond_e, flag2
58342 Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0),
58343 Op1.getOperand(1));
58344 Ops[3] = DAG.getTargetConstant(X86::COND_E, DL, MVT::i8);
58345 }
58346 // res, flags2 = sub 0, (and X, Y)
58347 // cload/cstore ..., cc, flag2
58348 // ->
58349 // res, flags2 = cmp (and X, Y), 0
58350 // cload/cstore ..., cc, flag2
58351 Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Src, Sub.getOperand(0));
58352 } else {
58353 return SDValue();
58354 }
58355
58356 return DAG.getMemIntrinsicNode(N->getOpcode(), DL, N->getVTList(), Ops,
58357 cast<MemSDNode>(N)->getMemoryVT(),
58358 cast<MemSDNode>(N)->getMemOperand());
58359}
58360
58361 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
58362                           TargetLowering::DAGCombinerInfo &DCI,
58363                           const X86Subtarget &Subtarget) {
58364 EVT VT = N->getValueType(0);
58365 SDValue Op0 = N->getOperand(0);
58366 SDValue Op1 = N->getOperand(1);
58367 SDLoc DL(N);
58368
58369 auto IsNonOpaqueConstant = [&](SDValue Op) {
58370     return DAG.isConstantIntBuildVectorOrConstantInt(Op,
58371                                                      /*AllowOpaques*/ false);
58372 };
58373
58374 // X86 can't encode an immediate LHS of a sub. See if we can push the
58375 // negation into a preceding instruction. If the RHS of the sub is a XOR with
58376 // one use and a constant, invert the immediate, saving one register.
58377 // However, ignore cases where C1 is 0, as those will become a NEG.
58378 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
58379 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
58380 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
58381 Op1->hasOneUse()) {
58382 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
58383 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
58384 SDValue NewAdd =
58385 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
58386 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
58387 }
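  // The identity behind the fold above (illustrative): in two's complement,
  // C1 - (X ^ C2) == C1 + ~(X ^ C2) + 1 == (X ^ ~C2) + (C1 + 1), because
  // ~(X ^ C2) == X ^ ~C2; both constants remain foldable and no register is
  // needed to hold C1 as the LHS of a SUB.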
58388
58389 if (SDValue V = combineSubABS(VT, DL, Op0, Op1, DAG))
58390 return V;
58391
58392 // Try to synthesize horizontal subs from subs of shuffles.
58393 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58394 return V;
58395
58396 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
58397 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
58398 X86::isZeroNode(Op1.getOperand(1))) {
58399 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58400 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
58401 Op1.getOperand(0), Op1.getOperand(2));
58402 }
58403
58404 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
58405 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
58406 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
58407 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
58408 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58409 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
58410 Op1.getOperand(1), Op1.getOperand(2));
58411 return DAG.getNode(ISD::SUB, DL, VT, ADC.getValue(0), Op1.getOperand(0));
58412 }
58413
58414 if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
58415 return V;
58416
58417 if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
58418 return V;
58419
58420 return combineSubSetcc(N, DAG);
58421}
58422
58423 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
58424                                     const X86Subtarget &Subtarget) {
58425 unsigned Opcode = N->getOpcode();
58426 assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) &&
58427 "Unknown PCMP opcode");
58428
58429 SDValue LHS = N->getOperand(0);
58430 SDValue RHS = N->getOperand(1);
58431 MVT VT = N->getSimpleValueType(0);
58432 unsigned EltBits = VT.getScalarSizeInBits();
58433 unsigned NumElts = VT.getVectorNumElements();
58434 SDLoc DL(N);
58435
58436 if (LHS == RHS)
58437 return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT)
58438 : DAG.getConstant(0, DL, VT);
58439
58440 // Constant Folding.
58441 // PCMPEQ(X,UNDEF) -> UNDEF
58442 // PCMPGT(X,UNDEF) -> 0
58443 // PCMPGT(UNDEF,X) -> 0
58444 APInt LHSUndefs, RHSUndefs;
58445 SmallVector<APInt> LHSBits, RHSBits;
58446 if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) &&
58447 getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) {
58448 APInt Ones = APInt::getAllOnes(EltBits);
58449 APInt Zero = APInt::getZero(EltBits);
58450 SmallVector<APInt> Results(NumElts);
58451 for (unsigned I = 0; I != NumElts; ++I) {
58452 if (Opcode == X86ISD::PCMPEQ) {
58453 Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero;
58454 } else {
58455 bool AnyUndef = LHSUndefs[I] || RHSUndefs[I];
58456 Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero;
58457 }
58458 }
58459 if (Opcode == X86ISD::PCMPEQ)
58460 return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL);
58461 return getConstVector(Results, VT, DAG, DL);
58462 }
58463
58464 return SDValue();
58465}
58466
58467// Helper to determine if we can convert an integer comparison to a float
58468 // comparison by casting the operands.
58469static std::optional<unsigned>
58470CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
58471 unsigned NumSignificantBitsRHS) {
58472 MVT SVT = VT.getScalarType();
58473 assert(SVT == MVT::f32 && "Only tested for float so far");
58474 const fltSemantics &Sem = SVT.getFltSemantics();
58475 assert((CC == ISD::SETEQ || CC == ISD::SETGT) &&
58476 "Only PCMPEQ/PCMPGT currently supported");
58477
58478 // TODO: Handle bitcastable integers.
58479
58480 // For cvt + signed compare we need lhs and rhs to be exactly representable as
58481 // a fp value.
58482 unsigned FPPrec = APFloat::semanticsPrecision(Sem);
58483 if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS)
58484 return ISD::SINT_TO_FP;
58485
58486 return std::nullopt;
58487}
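// Example (illustrative): IEEE f32 has a 24-bit significand, so i32 operands
// with at most 24 significant bits convert exactly via SINT_TO_FP; an ordered
// float EQ/GT then gives the same result as the integer PCMPEQ/PCMPGT.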
58488
58489/// Helper that combines an array of subvector ops as if they were the operands
58490/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
58491/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
58492 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
58493                                       ArrayRef<SDValue> Ops, SelectionDAG &DAG,
58494                                       const X86Subtarget &Subtarget,
58495 unsigned Depth) {
58496 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
58497 unsigned EltSizeInBits = VT.getScalarSizeInBits();
58498
58499 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
58500 return DAG.getUNDEF(VT);
58501
58502 if (llvm::all_of(Ops, [](SDValue Op) {
58503 return Op.isUndef() || ISD::isBuildVectorAllZeros(Op.getNode());
58504 }))
58505 return getZeroVector(VT, Subtarget, DAG, DL);
58506
58507   if (Depth >= SelectionDAG::MaxRecursionDepth)
58508     return SDValue(); // Limit search depth.
58509
58510 SDValue Op0 = Ops[0];
58511 bool IsSplat = llvm::all_equal(Ops);
58512 unsigned NumOps = Ops.size();
58513 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58514 LLVMContext &Ctx = *DAG.getContext();
58515
58516 // Repeated subvectors.
58517 if (IsSplat &&
58518 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
58519 // If this broadcast is inserted into both halves, use a larger broadcast.
58520 if (Op0.getOpcode() == X86ISD::VBROADCAST)
58521 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
58522
58523 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
58524 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
58525 (Subtarget.hasAVX2() ||
58527 VT.getScalarType(), Subtarget)))
58528 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
58529 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
58530 Op0.getOperand(0),
58531 DAG.getVectorIdxConstant(0, DL)));
58532
58533 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
58534 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
58535 (Subtarget.hasAVX2() ||
58536 (EltSizeInBits >= 32 &&
58537 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
58538 Op0.getOperand(0).getValueType() == VT.getScalarType())
58539 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
58540
58541 // concat_vectors(extract_subvector(splat(x)),
58542 // extract_subvector(splat(x))) -> splat(x)
58543 // concat_vectors(extract_subvector(subv_broadcast(x)),
58544 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
58545 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58546 Op0.getOperand(0).getValueType() == VT) {
58547 SDValue SrcVec = Op0.getOperand(0);
58548 if (DAG.isSplatValue(SrcVec, /*AllowUndefs*/ false))
58549 return SrcVec;
58550 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
58551 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
58552 return SrcVec;
58553 }
58554
58555 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
58556 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
58557 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
58558 return DAG.getNode(Op0.getOpcode(), DL, VT,
58560 Op0.getOperand(0), Op0.getOperand(0)),
58561 Op0.getOperand(1));
58562 }
58563
58564 // TODO: This should go in combineX86ShufflesRecursively eventually.
58565 if (NumOps == 2) {
58566 SDValue Src0 = peekThroughBitcasts(Ops[0]);
58567 SDValue Src1 = peekThroughBitcasts(Ops[1]);
58568 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58570 EVT SrcVT0 = Src0.getOperand(0).getValueType();
58571 EVT SrcVT1 = Src1.getOperand(0).getValueType();
58572 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
58573 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
58574 const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
58575 const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
58576 // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
58577       // Only concat subvector high halves, which vperm2x128 is best at, or
58578       // cases where it should fold into a subvector broadcast.
58579 if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
58580 SrcVT1.is256BitVector()) {
58581 assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58582 (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
58583 "Bad subvector index");
58584 if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
58585 (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
58586 unsigned Index = 0;
58587 Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
58588 Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
58589 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58590 DAG.getBitcast(VT, Src0.getOperand(0)),
58591 DAG.getBitcast(VT, Src1.getOperand(0)),
58592 DAG.getTargetConstant(Index, DL, MVT::i8));
58593 }
58594 }
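      // VPERM2X128 immediate recap (illustrative): bits [1:0] select the
      // 128-bit lane written to the low half and bits [5:4] the lane written
      // to the high half, with selectors 0/1 addressing the first source and
      // 2/3 the second. Hence 0x00/0x01 above pick a lane of Src0 for the low
      // half and 0x20/0x30 pick a lane of Src1 for the high half.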
58595 // Widen extract_subvector
58596 // concat(extract_subvector(x,lo), extract_subvector(x,hi))
58597 // --> extract_subvector(x,lo)
58598 unsigned NumSubElts0 = Src0.getValueType().getVectorNumElements();
58599 if (Src0.getOperand(0) == Src1.getOperand(0) &&
58600 (SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58601 SrcIdx1 == (SrcIdx0 + NumSubElts0)) {
58602 return DAG.getBitcast(VT,
58604 Src0.getConstantOperandVal(1),
58605 DAG, DL, VT.getSizeInBits()));
58606 }
58607 }
58608 }
58609
58610 // Repeated opcode.
58611 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
58612 // but it currently struggles with different vector widths.
58613 if (llvm::all_of(Ops, [Op0](SDValue Op) {
58614 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
58615 })) {
58616 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58618 for (SDValue SubOp : SubOps)
58619 Subs.push_back(SubOp.getOperand(I));
58620 // Attempt to peek through bitcasts and concat the original subvectors.
58621 EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
58622 if (SubVT.isSimple() && SubVT.isVector()) {
58623 MVT ConcatVT =
58625 SubVT.getVectorElementCount() * Subs.size());
58626 for (SDValue &Sub : Subs)
58627 Sub = DAG.getBitcast(SubVT, Sub);
58628 if (SDValue ConcatSrc = combineConcatVectorOps(DL, ConcatVT, Subs, DAG,
58629 Subtarget, Depth + 1))
58630 return DAG.getBitcast(VT, ConcatSrc);
58631 return DAG.getBitcast(
58632 VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
58633 }
58634 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58635 };
58636 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
58637 bool AllConstants = true;
58638 bool AllSubs = true;
58639 unsigned VecSize = VT.getSizeInBits();
58640 SDValue BC0 = peekThroughBitcasts(SubOps[0].getOperand(Op));
58641 if (isa<LoadSDNode>(BC0) && all_of(SubOps, [&](SDValue SubOp) {
58642 return BC0 == peekThroughBitcasts(SubOp.getOperand(Op));
58643 }))
58644 return true;
58645 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
58646 SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op));
58647 unsigned SubSize = BC.getValueSizeInBits();
58648 unsigned EltSize = BC.getScalarValueSizeInBits();
58649 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58651 AllSubs &= BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58652 BC.getOperand(0).getValueSizeInBits() == VecSize &&
58653 (BC.getConstantOperandVal(1) * EltSize) == (I * SubSize);
58654 }
58655 return AllConstants || AllSubs;
58656 };
58657 auto CombineSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58658 bool AllConstants = true;
58660 for (SDValue SubOp : SubOps) {
58661 SDValue BC = peekThroughBitcasts(SubOp.getOperand(I));
58662 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58664 Subs.push_back(SubOp.getOperand(I));
58665 }
58666 if (AllConstants)
58667 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58668 return combineConcatVectorOps(DL, VT, Subs, DAG, Subtarget, Depth + 1);
58669 };
58670
58671 unsigned Opcode = Op0.getOpcode();
58672 switch (Opcode) {
58673 case ISD::BITCAST: {
58674 // TODO: Support AVX1/AVX2 bitcasts.
58676 for (SDValue SubOp : Ops)
58677 SubOps.push_back(peekThroughBitcasts(SubOp.getOperand(0)));
58678 EVT InnerVT = SubOps[0].getValueType();
58679 unsigned InnerSizeInBits = InnerVT.getScalarSizeInBits();
58680 if (!IsSplat && InnerVT.isSimple() && InnerVT.isVector() &&
58681 (Subtarget.hasBWI() ||
58682 (EltSizeInBits >= 32 && InnerSizeInBits >= 32)) &&
58683 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58684 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58685 llvm::all_of(SubOps, [InnerVT](SDValue Op) {
58686 return Op.getValueType() == InnerVT;
58687 })) {
58688 MVT ConcatSVT = InnerVT.getScalarType().getSimpleVT();
58689 MVT ConcatVT = MVT::getVectorVT(
58690 ConcatSVT, VT.getSizeInBits() / ConcatSVT.getSizeInBits());
58691 if (SDValue ConcatSrc = combineConcatVectorOps(
58692 DL, ConcatVT, SubOps, DAG, Subtarget, Depth + 1))
58693 return DAG.getBitcast(VT, ConcatSrc);
58694 }
58695 break;
58696 }
58697 case ISD::VECTOR_SHUFFLE: {
58698 // TODO: Generalize NumOps support.
58699 if (!IsSplat && NumOps == 2 &&
58700 ((VT.is256BitVector() &&
58701 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58702 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58703 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58704 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58705 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58706 if (Concat0 || Concat1 ||
58707 (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
58708 Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
58709 Subtarget.hasVBMI())) {
58710 int NumSubElts = Op0.getValueType().getVectorNumElements();
58711 SmallVector<int> NewMask;
58712 for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
58713 M = M >= NumSubElts ? M + NumSubElts : M;
58714 NewMask.push_back(M);
58715 }
58716 for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
58717 if (0 <= M)
58718 M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
58719 NewMask.push_back(M);
58720 }
58721 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
58722 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
58723 return DAG.getVectorShuffle(VT, DL, Concat0, Concat1, NewMask);
58724 }
58725 }
58726 break;
58727 }
58728 case X86ISD::VBROADCAST: {
58729 // TODO: 512-bit VBROADCAST concatenation.
58730 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
58731 return Op.getOperand(0).getValueType().is128BitVector();
58732 })) {
58733 if (VT == MVT::v4f64 || VT == MVT::v4i64)
58734 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
58735 ConcatSubOperand(VT, Ops, 0),
58736 ConcatSubOperand(VT, Ops, 0));
58737 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
58738 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
58739 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
58741 DL, VT, ConcatSubOperand(VT, Ops, 0),
58742 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
58743 }
58744 break;
58745 }
58746 case X86ISD::MOVDDUP:
58747 case X86ISD::MOVSHDUP:
58748 case X86ISD::MOVSLDUP: {
58749 if (!IsSplat && (VT.is256BitVector() ||
58750 (VT.is512BitVector() && Subtarget.useAVX512Regs())))
58751 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
58752 break;
58753 }
58754 case X86ISD::SHUFP: {
58755 if (!IsSplat &&
58756 (VT == MVT::v8f32 ||
58757 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) &&
58758 llvm::all_of(Ops, [Op0](SDValue Op) {
58759 return Op.getOperand(2) == Op0.getOperand(2);
58760 })) {
58761 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58762 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58763 if (Concat0 || Concat1)
58764 return DAG.getNode(Opcode, DL, VT,
58765 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58766 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
58767 Op0.getOperand(2));
58768 }
58769 break;
58770 }
58771 case X86ISD::UNPCKH:
58772 case X86ISD::UNPCKL: {
58773 // TODO: UNPCK should use CombineSubOperand
58774 // Don't concatenate build_vector patterns.
58775 if (!IsSplat &&
58776 ((VT.is256BitVector() &&
58777 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58778 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58779 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58780 none_of(Ops, [](SDValue Op) {
58781 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
58783 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
58785 })) {
58786 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58787 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58788 if (Concat0 || Concat1 ||
58789 (Subtarget.hasInt256() && EltSizeInBits == 64))
58790 return DAG.getNode(Opcode, DL, VT,
58791 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58792 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58793 }
58794 break;
58795 }
58796 case X86ISD::PSHUFHW:
58797 case X86ISD::PSHUFLW:
58798 case X86ISD::PSHUFD:
58799 if (!IsSplat &&
58800 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58801 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58802 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58803 llvm::all_of(Ops, [Op0](SDValue Op) {
58804 return Op.getOperand(1) == Op0.getOperand(1);
58805 })) {
58806 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58807 Op0.getOperand(1));
58808 }
58809 [[fallthrough]];
58810 case X86ISD::VPERMILPI:
58811 if (!IsSplat && EltSizeInBits == 32 &&
58812 (VT.is256BitVector() ||
58813 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58814 all_of(Ops, [&Op0](SDValue Op) {
58815 return Op0.getOperand(1) == Op.getOperand(1);
58816 })) {
58817 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
58818 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
58819 Res =
58820 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
58821 return DAG.getBitcast(VT, Res);
58822 }
58823 break;
58824 case X86ISD::VPERMILPV:
58825 if (!IsSplat && (VT.is256BitVector() ||
58826 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
58827 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58828 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58829 if (Concat0 || Concat1)
58830 return DAG.getNode(Opcode, DL, VT,
58831 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58832 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58833 }
58834 break;
58835 case X86ISD::PSHUFB:
58836 case X86ISD::PSADBW:
58837 case X86ISD::VPMADDUBSW:
58838 case X86ISD::VPMADDWD:
58839 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58840 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
58841 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
58842 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
58843 NumOps * SrcVT.getVectorNumElements());
58844 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
58845 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
58846 if (Concat0 || Concat1)
58847 return DAG.getNode(
58848 Opcode, DL, VT,
58849 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
58850 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
58851 }
58852 break;
58853 case X86ISD::VPERMV:
58854 // TODO: Handle 256-bit and NumOps == 4 cases.
58855 if (!IsSplat && NumOps == 2 &&
58856 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58857 MVT OpVT = Op0.getSimpleValueType();
58858 int NumSrcElts = OpVT.getVectorNumElements();
58859 SmallVector<int, 64> ConcatMask;
58860 for (unsigned i = 0; i != NumOps; ++i) {
58861 SmallVector<int, 64> SubMask;
58863 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58864 break;
58865 for (int M : SubMask) {
58866 if (0 <= M)
58867 M += i * NumSrcElts;
58868 ConcatMask.push_back(M);
58869 }
58870 }
58871 if (ConcatMask.size() == (NumOps * NumSrcElts))
58872 return lowerShuffleWithPERMV(DL, VT, ConcatMask,
58873 ConcatSubOperand(VT, Ops, 1),
58874 DAG.getUNDEF(VT), Subtarget, DAG);
58875 }
58876 break;
58877 case X86ISD::VPERMV3:
58878 // TODO: Handle 256-bit and NumOps == 4 cases.
58879 if (!IsSplat && NumOps == 2 &&
58880 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58881 MVT OpVT = Op0.getSimpleValueType();
58882 int NumSrcElts = OpVT.getVectorNumElements();
58883 SmallVector<int, 64> ConcatMask;
58884 for (unsigned i = 0; i != NumOps; ++i) {
58885 SmallVector<int, 64> SubMask;
58887 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58888 break;
58889 for (int M : SubMask) {
58890 if (0 <= M) {
58891 int Src = M < NumSrcElts ? 0 : 2;
58892 M += M < NumSrcElts ? 0 : NumSrcElts;
58893
58894 // Reference the lowest sub if the upper sub is the same.
58895 if (Ops[0].getOperand(Src) != Ops[i].getOperand(Src))
58896 M += i * NumSrcElts;
58897 }
58898 ConcatMask.push_back(M);
58899 }
58900 }
58901 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
58902 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58903 SDValue Concat1 = CombineSubOperand(VT, Ops, 2);
58904 if (Concat0 || Concat1)
58905 return lowerShuffleWithPERMV(
58906 DL, VT, ConcatMask,
58907 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58908 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 2), Subtarget,
58909 DAG);
58910 }
58911 }
58912 break;
58913 case X86ISD::VPERM2X128: {
58914 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
58915 assert(NumOps == 2 && "Bad concat_vectors operands");
58916 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58917 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58918 // TODO: Handle zero'd subvectors.
58919 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
58920         int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3),
58921                        (int)(Imm1 & 0x03), (int)((Imm1 >> 4) & 0x3)};
58922 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
58923 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58924 Ops[0].getOperand(1), DAG, DL);
58925 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58926 Ops[1].getOperand(1), DAG, DL);
58927 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
58928 DAG.getBitcast(ShuffleVT, LHS),
58929 DAG.getBitcast(ShuffleVT, RHS),
58930 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
58931 return DAG.getBitcast(VT, Res);
58932 }
58933 }
58934 break;
58935 }
58936 case X86ISD::SHUF128: {
58937 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
58938 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58939 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58940 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
58941 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
58942 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58943 Ops[0].getOperand(1), DAG, DL);
58944 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58945 Ops[1].getOperand(1), DAG, DL);
58946 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
58947 DAG.getTargetConstant(Imm, DL, MVT::i8));
58948 }
58949 break;
58950 }
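    // SHUF128 immediate recap (illustrative): for the 512-bit vshuf{i,f}64x2
    // each 2-bit field of the immediate selects one 128-bit lane, fields 0-1
    // from the first source and fields 2-3 from the second. The fixed 0x08 and
    // 0x80 bits add 2 to fields 1 and 3 so the high halves of the original
    // 256-bit SHUF128s come from the upper lanes of the concatenated sources.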
58951 case ISD::TRUNCATE:
58952 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
58953 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58954 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
58955 SrcVT == Ops[1].getOperand(0).getValueType() &&
58956 Subtarget.useAVX512Regs() &&
58957 Subtarget.getPreferVectorWidth() >= 512 &&
58958 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
58959 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58960 return DAG.getNode(ISD::TRUNCATE, DL, VT,
58961 ConcatSubOperand(NewSrcVT, Ops, 0));
58962 }
58963 }
58964 break;
58965 case ISD::ANY_EXTEND:
58966 case ISD::SIGN_EXTEND:
58967 case ISD::ZERO_EXTEND:
58968 // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND.
58969 if (!IsSplat && NumOps == 2 &&
58970 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58971 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58972 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58973 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58974 if (SrcVT.isSimple() && SrcVT.is128BitVector() &&
58975 SrcVT == Ops[1].getOperand(0).getValueType()) {
58976 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58977 return DAG.getNode(Opcode, DL, VT,
58978 ConcatSubOperand(NewSrcVT, Ops, 0));
58979 }
58980 }
58981 break;
58982     case ISD::ANY_EXTEND_VECTOR_INREG:
58983     case ISD::SIGN_EXTEND_VECTOR_INREG:
58984     case ISD::ZERO_EXTEND_VECTOR_INREG: {
58985       // TODO: Handle ANY_EXTEND_INREG combos with SIGN/ZERO_EXTEND_INREG.
58986 if (!IsSplat && NumOps == 2 &&
58987 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58988 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58989 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58991 Op0.getOperand(0).getValueType() ==
58992 Ops[0].getOperand(0).getValueType()) {
58993 EVT SrcVT = Op0.getOperand(0).getValueType();
58994 unsigned NumElts = VT.getVectorNumElements();
58995 MVT UnpackSVT =
58996 MVT::getIntegerVT(SrcVT.getScalarSizeInBits() * (NumElts / 2));
58997 MVT UnpackVT =
58998 MVT::getVectorVT(UnpackSVT, 128 / UnpackSVT.getScalarSizeInBits());
58999 SDValue Unpack =
59000 DAG.getNode(X86ISD::UNPCKL, DL, UnpackVT,
59001 DAG.getBitcast(UnpackVT, Ops[0].getOperand(0)),
59002 DAG.getBitcast(UnpackVT, Ops[1].getOperand(0)));
59003 return getEXTEND_VECTOR_INREG(Opcode, DL, VT,
59004 DAG.getBitcast(SrcVT, Unpack), DAG);
59005 }
59006 break;
59007 }
59008 case X86ISD::VSHLI:
59009 case X86ISD::VSRLI:
59010 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
59011 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
59012 llvm::all_of(Ops, [](SDValue Op) {
59013 return Op.getConstantOperandAPInt(1) == 32;
59014 })) {
59015 if (SDValue Res = CombineSubOperand(VT, Ops, 0)) {
59016 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
59017 Res = DAG.getBitcast(MVT::v8i32, Res);
59018 if (Opcode == X86ISD::VSHLI) {
59019 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
59020 {8, 0, 8, 2, 8, 4, 8, 6});
59021 } else {
59022 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
59023 {1, 8, 3, 8, 5, 8, 7, 8});
59024 }
59025 return DAG.getBitcast(VT, Res);
59026 }
59027 }
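      // Rationale for the masks above (illustrative): viewed as v8i32, shifting
      // a 64-bit element left by 32 moves its low 32-bit word into the high
      // word and zeroes the low word, i.e. mask {8,0,8,2,8,4,8,6} against a
      // zero vector (indices >= 8 read the zero vector); the logical right
      // shift by 32 is the mirrored mask {1,8,3,8,5,8,7,8}.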
59028 [[fallthrough]];
59029 case X86ISD::VSRAI:
59030 case X86ISD::VSHL:
59031 case X86ISD::VSRL:
59032 case X86ISD::VSRA:
59033 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
59034 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
59035 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
59036 llvm::all_of(Ops, [Op0](SDValue Op) {
59037 return Op0.getOperand(1) == Op.getOperand(1);
59038 })) {
59039 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59040 Op0.getOperand(1));
59041 }
59042 break;
59043 case X86ISD::VPERMI:
59044 case X86ISD::VROTLI:
59045 case X86ISD::VROTRI:
59046 if (!IsSplat &&
59047 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
59048 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59049 llvm::all_of(Ops, [Op0](SDValue Op) {
59050 return Op0.getOperand(1) == Op.getOperand(1);
59051 })) {
59052 assert(!(Opcode == X86ISD::VPERMI &&
59053 Op0.getValueType().is128BitVector()) &&
59054 "Illegal 128-bit X86ISD::VPERMI nodes");
59055 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59056 Op0.getOperand(1));
59057 }
59058 break;
59059 case ISD::AND:
59060 case ISD::OR:
59061 case ISD::XOR:
59062 case X86ISD::ANDNP:
59063 // TODO: AVX512 targets should only use CombineSubOperand like AVX1/2.
59064 if (!IsSplat && (VT.is256BitVector() ||
59065 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59066 // Don't concatenate root AVX1 NOT patterns.
59067 // TODO: Allow NOT folding if Concat0 succeeds.
59068 if (Opcode == ISD::XOR && Depth == 0 && !Subtarget.hasInt256() &&
59069 llvm::all_of(Ops, [](SDValue X) {
59070 return ISD::isBuildVectorAllOnes(X.getOperand(1).getNode());
59071 }))
59072 break;
59073 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59074 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59075 if (Concat0 || Concat1 || Subtarget.useAVX512Regs())
59076 return DAG.getNode(Opcode, DL, VT,
59077 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59078 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59079 }
59080 break;
59081 case X86ISD::PCMPEQ:
59082 case X86ISD::PCMPGT:
59083 // TODO: 512-bit PCMPEQ/PCMPGT -> VPCMP+VPMOVM2 handling.
59084 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256()) {
59085 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59086 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59087 if (Concat0 || Concat1)
59088 return DAG.getNode(Opcode, DL, VT,
59089 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59090 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59091 break;
59092 }
59093
59094 if (!IsSplat && VT == MVT::v8i32) {
59095 // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
59096 // TODO: Handle v4f64 as well?
59097 unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
59098 for (unsigned I = 0; I != NumOps; ++I) {
59099 MaxSigBitsLHS =
59100 std::max(MaxSigBitsLHS,
59101 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
59102 MaxSigBitsRHS =
59103 std::max(MaxSigBitsRHS,
59104 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
59105 if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
59106 break;
59107 }
59108
59109 ISD::CondCode ICC =
59110 Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
59111 ISD::CondCode FCC =
59113
59114 MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
59115 MVT FpVT = VT.changeVectorElementType(FpSVT);
59116
59117 if (std::optional<unsigned> CastOpc =
59118 CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
59119 SDValue LHS = CombineSubOperand(VT, Ops, 0);
59120 SDValue RHS = CombineSubOperand(VT, Ops, 1);
59121 LHS = LHS ? LHS : ConcatSubOperand(VT, Ops, 0);
59122 RHS = RHS ? RHS : ConcatSubOperand(VT, Ops, 1);
59123 LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
59124 RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);
59125
59126 bool IsAlwaysSignaling;
59127 unsigned FSETCC =
59128 translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
59129 return DAG.getBitcast(
59130 VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
59131 DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
59132 }
59133 }
59134 break;
59135 case ISD::CTPOP:
59136 case ISD::CTTZ:
59137 case ISD::CTLZ:
59140 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59141 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59142 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
59143 }
59144 break;
59145     case X86ISD::GF2P8AFFINEQB:
59146       // TODO: GF2P8AFFINEQB should use CombineSubOperand.
59147 if (!IsSplat &&
59148 (VT.is256BitVector() ||
59149 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59150 llvm::all_of(Ops, [Op0](SDValue Op) {
59151 return Op0.getOperand(2) == Op.getOperand(2);
59152 })) {
59153 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59154 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
59155 }
59156 break;
59157 case ISD::ADD:
59158 case ISD::SUB:
59159 case ISD::MUL:
59160 // TODO: Add more integer binops?
59161 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59162 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
59163 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
59164 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59165 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59166 if (Concat0 || Concat1 || llvm::all_of(Ops, [](SDValue Op) {
59167 return Op.getOperand(0) == Op.getOperand(1);
59168 }))
59169 return DAG.getNode(Opcode, DL, VT,
59170 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59171 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59172 }
59173 break;
59174     // Because VADD, VSUB and VMUL can execute on more ports than VINSERT and
59175     // their latencies are short, we only concatenate them here when doing so
59176     // does not introduce extra VINSERTs.
59177 case ISD::FADD:
59178 case ISD::FSUB:
59179 case ISD::FMUL:
59180 if (!IsSplat && (VT.is256BitVector() ||
59181 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59182 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59183 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59184 if (Concat0 || Concat1)
59185 return DAG.getNode(Opcode, DL, VT,
59186 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59187 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59188 }
59189 break;
59190 // Always prefer to concatenate high latency FDIV instructions.
59191 case ISD::FDIV:
59192 if (!IsSplat && (VT.is256BitVector() ||
59193 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59194 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59195 ConcatSubOperand(VT, Ops, 1));
59196 }
59197 break;
59198 case X86ISD::HADD:
59199 case X86ISD::HSUB:
59200 case X86ISD::FHADD:
59201 case X86ISD::FHSUB:
59202 if (!IsSplat && VT.is256BitVector() &&
59203 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
59204 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59205 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59206 if (Concat0 || Concat1)
59207 return DAG.getNode(Opcode, DL, VT,
59208 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59209 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59210 }
59211 break;
59212 case X86ISD::PACKSS:
59213 case X86ISD::PACKUS:
59214 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59215 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59216 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
59217 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
59218 NumOps * SrcVT.getVectorNumElements());
59219 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
59220 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
59221 if (Concat0 || Concat1)
59222 return DAG.getNode(
59223 Opcode, DL, VT,
59224 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
59225 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
59226 }
59227 break;
59228 case X86ISD::VSHLD:
59229 case X86ISD::VSHRD:
59230 case X86ISD::PALIGNR:
59231 if (!IsSplat &&
59232 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59233 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
59234 llvm::all_of(Ops, [Op0](SDValue Op) {
59235 return Op0.getOperand(2) == Op.getOperand(2);
59236 })) {
59237 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59238 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59239 if (Concat0 || Concat1)
59240 return DAG.getNode(Opcode, DL, VT,
59241 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59242 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59243 Op0.getOperand(2));
59244 }
59245 break;
59246 case X86ISD::BLENDI:
59247 if (VT.is256BitVector() && NumOps == 2 &&
59248 (EltSizeInBits >= 32 ||
59249 (Subtarget.hasInt256() &&
59250 Ops[0].getOperand(2) == Ops[1].getOperand(2)))) {
59251 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59252 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59253 if (Concat0 || Concat1) {
59254 unsigned NumElts = VT.getVectorNumElements();
59255 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59256 Mask.insertBits(getBLENDIBlendMask(Ops[1]), NumElts / 2);
59257 Mask = Mask.zextOrTrunc(8);
59258 return DAG.getNode(Opcode, DL, VT,
59259 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59260 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59261 DAG.getTargetConstant(Mask, DL, MVT::i8));
59262 }
59263 }
59264 // TODO: BWI targets should only use CombineSubOperand.
59265 if (((VT.is256BitVector() && Subtarget.hasVLX()) ||
59266 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59267 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())) {
59268 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59269 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59270 if (Concat0 || Concat1 || Subtarget.useBWIRegs()) {
59271 unsigned NumElts = VT.getVectorNumElements();
59272 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59273 for (unsigned I = 1; I != NumOps; ++I)
59274 Mask.insertBits(getBLENDIBlendMask(Ops[I]), I * (NumElts / NumOps));
59275 unsigned NumMaskBits = NumElts >= 8 ? NumElts : 8;
59276 Mask = Mask.zextOrTrunc(NumMaskBits);
59277 MVT MaskSVT = MVT::getIntegerVT(NumMaskBits);
59278 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumMaskBits);
59279 SDValue Sel =
59280 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
59281 Sel = extractSubVector(Sel, 0, DAG, DL, NumElts);
59282 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
59283 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
59284 return DAG.getSelect(DL, VT, Sel, Concat1, Concat0);
59285 }
59286 }
59287 break;
59288 case ISD::VSELECT:
59289 // TODO: VSELECT should use CombineSubOperand.
59290 if (!IsSplat && Subtarget.hasAVX512() &&
59291 (VT.is256BitVector() ||
59292 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59293 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
59294 EVT SelVT = Ops[0].getOperand(0).getValueType();
59295 if (SelVT.getVectorElementType() == MVT::i1) {
59296 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
59297 NumOps * SelVT.getVectorNumElements());
59298 if (TLI.isTypeLegal(SelVT))
59299 return DAG.getNode(
59300 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59301 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59302 }
59303 }
59304 [[fallthrough]];
59305 case X86ISD::BLENDV:
59306 // TODO: BLENDV should use CombineSubOperand.
59307 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
59308 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
59309 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
59310 EVT SelVT = Ops[0].getOperand(0).getValueType();
59311 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
59312 if (TLI.isTypeLegal(SelVT))
59313 return DAG.getNode(
59314 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59315 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59316 }
59317 break;
59318 }
59319 }
59320
59321 // Fold subvector loads into one.
59322 // If needed, look through bitcasts to get to the load.
59323 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
59324 unsigned Fast;
59325 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
59326 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
59327 *FirstLd->getMemOperand(), &Fast) &&
59328 Fast) {
59329 if (SDValue Ld =
59330 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
59331 return Ld;
59332 }
59333 }
59334
59335 // Attempt to fold target constant loads.
59336 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
59337 SmallVector<APInt> EltBits;
59338 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
59339 for (unsigned I = 0; I != NumOps; ++I) {
59340 APInt OpUndefElts;
59341 SmallVector<APInt> OpEltBits;
59342 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
59343 OpEltBits, /*AllowWholeUndefs*/ true,
59344 /*AllowPartialUndefs*/ false))
59345 break;
59346 EltBits.append(OpEltBits);
59347 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
59348 }
59349 if (EltBits.size() == VT.getVectorNumElements()) {
59350 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
59351 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
59352 SDValue CV = DAG.getConstantPool(C, PVT);
59355 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
59356 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
59358 return Ld;
59359 }
59360 }
59361
59362 // If this simple subvector or scalar/subvector broadcast_load is inserted
59363 // into both halves, use a larger broadcast_load. Update other uses to use
59364 // an extracted subvector.
59365 if (IsSplat &&
59366 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
59367 if (ISD::isNormalLoad(Op0.getNode()) ||
59368         Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
59369         Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59370       auto *Mem = cast<MemSDNode>(Op0);
59371 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
59372                          ? X86ISD::VBROADCAST_LOAD
59373                          : X86ISD::SUBV_BROADCAST_LOAD;
59374       if (SDValue BcastLd =
59375 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
59376 SDValue BcastSrc =
59377 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
59378 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
59379 return BcastLd;
59380 }
59381 }
59382 }
59383
59384 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
59385 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
59386 Subtarget.useAVX512Regs()) {
59387 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
59388 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
59389 Res = DAG.getBitcast(ShuffleVT, Res);
59390 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
59391 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
59392 return DAG.getBitcast(VT, Res);
59393 }
59394
59395 // We can always convert per-lane vXf64 shuffles into VSHUFPD.
59396 if (!IsSplat &&
59397 ((NumOps == 2 && VT == MVT::v4f64) ||
59398 (NumOps == 4 && VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
59399 all_of(Ops, [](SDValue Op) { return Op.hasOneUse(); })) {
59400 // Collect the individual per-lane v2f64/v4f64 shuffles.
59401 MVT OpVT = Ops[0].getSimpleValueType();
59402 unsigned NumOpElts = OpVT.getVectorNumElements();
59405 if (all_of(seq<int>(NumOps), [&](int I) {
59406 return getTargetShuffleInputs(Ops[I], SrcOps[I], SrcMasks[I], DAG,
59407 Depth + 1) &&
59408 !is128BitLaneCrossingShuffleMask(OpVT, SrcMasks[I]) &&
59409 none_of(SrcMasks[I], isUndefOrZero) &&
59410 SrcMasks[I].size() == NumOpElts &&
59411 all_of(SrcOps[I], [&OpVT](SDValue V) {
59412 return V.getValueType() == OpVT;
59413 });
59414 })) {
59415 // Concatenate the shuffle masks into SHUFPD mask and collect subops.
59416 bool Unary = true;
59417 unsigned SHUFPDMask = 0;
59419 for (unsigned I = 0; I != NumOps; ++I) {
59420 LHS[I] = SrcOps[I][SrcMasks[I][0] / NumOpElts];
59421 RHS[I] = SrcOps[I][SrcMasks[I][1] / NumOpElts];
59422 Unary &= LHS[I] == RHS[I];
59423 for (unsigned J = 0; J != NumOpElts; ++J)
59424 SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
59425 }
59426 // Concat SHUFPD LHS/RHS operands - if they match then it will become a
59427 // PERMILPD mask and we can always profitably concatenate them.
59428 SDValue Concat0 =
59429 combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
59430 SDValue Concat1 =
59431 combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
59432 if (Unary || Concat0 || Concat1) {
59433 Concat0 =
59434 Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
59435 Concat1 =
59436 Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
59437 return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
59438 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
59439 }
59440 }
59441 }
59442
59443 return SDValue();
59444}
59445
59446 static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
59447                                      TargetLowering::DAGCombinerInfo &DCI,
59448                                      const X86Subtarget &Subtarget) {
59449 EVT VT = N->getValueType(0);
59450 EVT SrcVT = N->getOperand(0).getValueType();
59451 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59453
59454 if (VT.getVectorElementType() == MVT::i1) {
59455 // Attempt to constant fold.
59456 unsigned SubSizeInBits = SrcVT.getSizeInBits();
59458 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
59460 if (!C) break;
59461 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
59462 if (I == (E - 1)) {
59463 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
59464 if (TLI.isTypeLegal(IntVT))
59465 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
59466 }
59467 }
59468
59469 // Don't do anything else for i1 vectors.
59470 return SDValue();
59471 }
59472
59473 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
59474 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
59475 Subtarget))
59476 return R;
59477 }
59478
59479 return SDValue();
59480}
59481
59482 static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59483                                        TargetLowering::DAGCombinerInfo &DCI,
59484                                        const X86Subtarget &Subtarget) {
59485 if (DCI.isBeforeLegalizeOps())
59486 return SDValue();
59487
59488 MVT OpVT = N->getSimpleValueType(0);
59489
59490 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
59491
59492 SDLoc dl(N);
59493 SDValue Vec = N->getOperand(0);
59494 SDValue SubVec = N->getOperand(1);
59495
59496 uint64_t IdxVal = N->getConstantOperandVal(2);
59497 MVT SubVecVT = SubVec.getSimpleValueType();
59498 int VecNumElts = OpVT.getVectorNumElements();
59499 int SubVecNumElts = SubVecVT.getVectorNumElements();
59500
59501 if (Vec.isUndef() && SubVec.isUndef())
59502 return DAG.getUNDEF(OpVT);
59503
59504 // Inserting undefs/zeros into zeros/undefs is a zero vector.
59505 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
59506 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
59507 return getZeroVector(OpVT, Subtarget, DAG, dl);
59508
59510 // If we're inserting into a zero vector and then into a larger zero vector,
59511 // just insert into the larger zero vector directly.
59512 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59514 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
59515 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59516 getZeroVector(OpVT, Subtarget, DAG, dl),
59517 SubVec.getOperand(1),
59518 DAG.getVectorIdxConstant(IdxVal + Idx2Val, dl));
59519 }
59520
59521   // If we're inserting into a zero vector and our input was extracted from an
59522   // insert into a zero vector of the same type, and the extraction was at
59523   // least as large as the original insertion, just insert the original
59524   // subvector into a zero vector.
59525 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
59526 isNullConstant(SubVec.getOperand(1)) &&
59528 SDValue Ins = SubVec.getOperand(0);
59529 if (isNullConstant(Ins.getOperand(2)) &&
59530 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
59531 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
59532 SubVecVT.getFixedSizeInBits())
59533 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59534 getZeroVector(OpVT, Subtarget, DAG, dl),
59535 Ins.getOperand(1), N->getOperand(2));
59536 }
59537 }
59538
59539 // Stop here if this is an i1 vector.
59540 if (IsI1Vector)
59541 return SDValue();
59542
59543 // Eliminate an intermediate vector widening:
59544 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
59545 // insert_subvector X, Y, Idx
59546 // TODO: This is a more general version of a DAGCombiner fold, can we move it
59547 // there?
59548 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59549 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
59550 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
59551 SubVec.getOperand(1), N->getOperand(2));
59552
59553 // If this is an insert of an extract, combine to a shuffle. Don't do this
59554 // if the insert or extract can be represented with a subregister operation.
59555 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59556 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
59557 (IdxVal != 0 ||
59558 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
59559 SDValue ExtSrc = SubVec.getOperand(0);
59560 int ExtIdxVal = SubVec.getConstantOperandVal(1);
59561 // Create a shuffle mask matching the extraction and insertion.
59562 SmallVector<int, 64> Mask(VecNumElts);
59563 std::iota(Mask.begin(), Mask.end(), 0);
59564 std::iota(Mask.begin() + IdxVal, Mask.begin() + IdxVal + SubVecNumElts,
59565 ExtIdxVal + VecNumElts);
59566 if (ExtIdxVal != 0)
59567 return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask);
59568 // See if we can use a blend instead of extract/insert pair.
59569 SmallVector<int, 64> BlendMask(VecNumElts);
59570 std::iota(BlendMask.begin(), BlendMask.end(), 0);
59571 std::iota(BlendMask.begin() + IdxVal,
59572 BlendMask.begin() + IdxVal + SubVecNumElts, VecNumElts + IdxVal);
59573 if (isShuffleEquivalent(Mask, BlendMask, Vec, ExtSrc) &&
59574 VecNumElts == (2 * SubVecNumElts)) {
59575 assert((IdxVal % SubVecNumElts) == 0 && "Unaligned subvector insertion");
59576 if (OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
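// Blend as v8f32: immediate 0x0F takes the low four f32 elements from ExtSrc
// (inserting into the low half), 0xF0 takes the high four (high half).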
59577 SDValue Blend = DAG.getNode(
59578 X86ISD::BLENDI, dl, MVT::v8f32, DAG.getBitcast(MVT::v8f32, Vec),
59579 DAG.getBitcast(MVT::v8f32, ExtSrc),
59580 DAG.getTargetConstant(IdxVal == 0 ? 0x0F : 0xF0, dl, MVT::i8));
59581 return DAG.getBitcast(OpVT, Blend);
59582 } else if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) {
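// SHUF128 with an identity lane mask keeps the low 256 bits of Lo and the
// high 256 bits of Hi, so ExtSrc supplies whichever half IdxVal targets.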
59583 MVT ShufVT = OpVT.isInteger() ? MVT::v8i64 : MVT::v8f64;
59584 SDValue Lo = DAG.getBitcast(ShufVT, IdxVal == 0 ? ExtSrc : Vec);
59585 SDValue Hi = DAG.getBitcast(ShufVT, IdxVal == 0 ? Vec : ExtSrc);
59586 SDValue Shuffle =
59587 DAG.getNode(X86ISD::SHUF128, dl, ShufVT, Lo, Hi,
59588 getV4X86ShuffleImm8ForMask({0, 1, 2, 3}, dl, DAG));
59589 return DAG.getBitcast(OpVT, Shuffle);
59590 }
59591 }
59592 }
59593
59594 // Match concat_vector style patterns.
59595 SmallVector<SDValue, 2> SubVectorOps;
59596 if (collectConcatOps(N, SubVectorOps, DAG)) {
59597 if (SDValue Fold =
59598 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, Subtarget))
59599 return Fold;
59600
59601 // If we're inserting all zeros into the upper half, change this to
59602 // a concat with zero. We will match this to a move
59603 // with implicit upper bit zeroing during isel.
59604 // We do this here because we don't want combineConcatVectorOps to
59605 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
59606 if (SubVectorOps.size() == 2 &&
59607 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
59608 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59609 getZeroVector(OpVT, Subtarget, DAG, dl),
59610 SubVectorOps[0], DAG.getVectorIdxConstant(0, dl));
59611
59612 // Attempt to recursively combine to a shuffle.
59613 if (all_of(SubVectorOps, [](SDValue SubOp) {
59614 return isTargetShuffle(SubOp.getOpcode());
59615 })) {
59616 SDValue Op(N, 0);
59617 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59618 return Res;
59619 }
59620 }
59621
59622 // If this is a broadcast insert into an upper undef, use a larger broadcast.
59623 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
59624 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
59625
59626 // If this is a broadcast load inserted into an upper undef, use a larger
59627 // broadcast load.
59628 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
59629 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
59630 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
59631 return getBROADCAST_LOAD(X86ISD::VBROADCAST_LOAD, dl, OpVT,
59632 MemIntr->getMemoryVT(), MemIntr, 0, DAG);
59633 }
59634
59635 // If we're splatting the lower half subvector of a full vector load into the
59636 // upper half, attempt to create a subvector broadcast.
59637 if ((int)IdxVal == (VecNumElts / 2) &&
59638 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
59639 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
59640 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
59641 if (VecLd && SubLd &&
59642 DAG.areNonVolatileConsecutiveLoads(
59643 SubLd, VecLd, SubVec.getValueSizeInBits() / 8, 0)) {
59644 SDValue BcastLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT,
59645 SubVecVT, SubLd, 0, DAG);
59646 SDValue NewSubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT,
59647 BcastLd, DAG.getVectorIdxConstant(0, dl));
59648 DCI.CombineTo(SubLd, NewSubVec, BcastLd.getValue(1));
59649 return BcastLd;
59650 }
59651 }
59652
59653 // Attempt to constant fold (if we're not widening).
59654 if (!Vec.isUndef() && !ISD::isBuildVectorAllZeros(Vec.getNode())) {
59655 unsigned EltSizeInBits = OpVT.getScalarSizeInBits();
59656 APInt VecUndefElts, SubUndefElts;
59657 SmallVector<APInt, 16> VecEltBits, SubEltBits;
59658 if (getTargetConstantBitsFromNode(Vec, EltSizeInBits, VecUndefElts,
59659 VecEltBits) &&
59660 getTargetConstantBitsFromNode(SubVec, EltSizeInBits, SubUndefElts,
59661 SubEltBits)) {
59662 VecUndefElts.insertBits(SubUndefElts, IdxVal);
59663 llvm::copy(SubEltBits, VecEltBits.begin() + IdxVal);
59664 return getConstVector(VecEltBits, VecUndefElts, OpVT, DAG, dl);
59665 }
59666 }
59667
59668 // Attempt to recursively combine to a shuffle.
59671 SDValue Op(N, 0);
59672 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59673 return Res;
59674 }
59675
59676 // Match insertion of subvector load that perfectly aliases a base load.
59677 if ((IdxVal % SubVecNumElts) == 0 && ISD::isNormalLoad(Vec.getNode()) &&
59678 ISD::isNormalLoad(SubVec.getNode()) &&
59679 DAG.areNonVolatileConsecutiveLoads(
59680 cast<LoadSDNode>(SubVec), cast<LoadSDNode>(Vec),
59681 SubVec.getValueSizeInBits() / 8, IdxVal / SubVecNumElts))
59682 return Vec;
59683
59684 return SDValue();
59685}
59686
59687/// If we are extracting a subvector of a vector select and the select condition
59688/// is composed of concatenated vectors, try to narrow the select width. This
59689/// is a common pattern for AVX1 integer code because 256-bit selects may be
59690/// legal, but almost no integer math/logic is available for 256-bit vectors.
59691/// This function should only be called with legal types (otherwise, the calls
59692/// to get simple value types will assert).
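/// For example, a 128-bit extract of a 256-bit vselect becomes a 128-bit
/// vselect of the matching 128-bit pieces of the condition and both operands.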
59693static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
59694 SelectionDAG &DAG) {
59695 SDValue Sel = Ext->getOperand(0);
59696 if (Sel.getOpcode() != ISD::VSELECT ||
59697 !isFreeToSplitVector(Sel.getOperand(0), DAG))
59698 return SDValue();
59699
59700 // Note: We assume simple value types because this should only be called with
59701 // legal operations/types.
59702 // TODO: This can be extended to handle extraction to 256-bits.
59703 MVT VT = Ext->getSimpleValueType(0);
59704 if (!VT.is128BitVector())
59705 return SDValue();
59706
59707 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
59708 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
59709 return SDValue();
59710
59711 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
59712 MVT SelVT = Sel.getSimpleValueType();
59713 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
59714 "Unexpected vector type with legal operations");
59715
59716 unsigned SelElts = SelVT.getVectorNumElements();
59717 unsigned CastedElts = WideVT.getVectorNumElements();
59718 unsigned ExtIdx = Ext->getConstantOperandVal(1);
59719 if (SelElts % CastedElts == 0) {
59720 // The select has the same or more (narrower) elements than the extract
59721 // operand. The extraction index gets scaled by that factor.
59722 ExtIdx *= (SelElts / CastedElts);
59723 } else if (CastedElts % SelElts == 0) {
59724 // The select has fewer (wider) elements than the extract operand. Make sure
59725 // that the extraction index can be divided evenly.
59726 unsigned IndexDivisor = CastedElts / SelElts;
59727 if (ExtIdx % IndexDivisor != 0)
59728 return SDValue();
59729 ExtIdx /= IndexDivisor;
59730 } else {
59731 llvm_unreachable("Element count of simple vector types are not divisible?");
59732 }
59733
59734 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
59735 unsigned NarrowElts = SelElts / NarrowingFactor;
59736 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
59737 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
59738 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
59739 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
59740 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
59741 return DAG.getBitcast(VT, NarrowSel);
59742}
59743
59744static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59745 TargetLowering::DAGCombinerInfo &DCI,
59746 const X86Subtarget &Subtarget) {
59747 if (!N->getValueType(0).isSimple())
59748 return SDValue();
59749
59750 MVT VT = N->getSimpleValueType(0);
59751 SDValue InVec = N->getOperand(0);
59752 unsigned IdxVal = N->getConstantOperandVal(1);
59753 EVT InVecVT = InVec.getValueType();
59754 unsigned SizeInBits = VT.getSizeInBits();
59755 unsigned InSizeInBits = InVecVT.getSizeInBits();
59756 unsigned NumSubElts = VT.getVectorNumElements();
59757 unsigned NumInElts = InVecVT.getVectorNumElements();
59758 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59759 SDLoc DL(N);
59760
59761 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
59762 // eventually get combined/lowered into ANDNP) with a concatenated operand,
59763 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
59764 // We let generic combining take over from there to simplify the
59765 // insert/extract and 'not'.
59766 // This pattern emerges during AVX1 legalization. We handle it before lowering
59767 // to avoid complications like splitting constant vector loads.
59768 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) &&
59769 InSizeInBits == 256 && InVec.getOpcode() == ISD::AND) {
59770 auto isConcatenatedNot = [](SDValue V) {
59771 V = peekThroughBitcasts(V);
59772 if (!isBitwiseNot(V))
59773 return false;
59774 SDValue NotOp = V->getOperand(0);
59775 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
59776 };
59777 if (isConcatenatedNot(InVec.getOperand(0)) ||
59778 isConcatenatedNot(InVec.getOperand(1))) {
59779 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
59780 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
59781 splitVectorIntBinary(InVec, DAG, DL),
59782 N->getOperand(1));
59783 }
59784 }
59785
59786 if (DCI.isBeforeLegalizeOps())
59787 return SDValue();
59788
59789 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
59790 return V;
59791
59792 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
59793 return getZeroVector(VT, Subtarget, DAG, DL);
59794
59795 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
59796 if (VT.getScalarType() == MVT::i1)
59797 return DAG.getConstant(1, DL, VT);
59798 return getOnesVector(VT, DAG, DL);
59799 }
59800
59801 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
59802 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
59803
59804 // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1),C2) --> EXTRACT_SUBVECTOR(V,C1+C2)
59805 if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59806 InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
59807 TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
59808 unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
59809 return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
59810 }
59811
59812 // EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2)
59813 // --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
59814 // iff SUB is entirely contained in the extraction.
59815 if (VT.getVectorElementType() != MVT::i1 && TLI.isTypeLegal(VT) &&
59816 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse()) {
59817 SDValue Src = InVec.getOperand(0);
59818 SDValue Sub = InVec.getOperand(1);
59819 EVT SubVT = Sub.getValueType();
59820 uint64_t InsIdx = InVec.getConstantOperandVal(2);
59821 if (IdxVal <= InsIdx &&
59822 (IdxVal + NumSubElts) >= (InsIdx + SubVT.getVectorNumElements())) {
59823 SDValue NewSrc = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
59824 DAG.getVectorIdxConstant(IdxVal, DL));
59825 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewSrc, Sub,
59826 DAG.getVectorIdxConstant(InsIdx - IdxVal, DL));
59827 }
59828 }
59829
59830 // If we're extracting an upper subvector, see if we'd get the same elements if
59831 // we extracted the lowest subvector instead, which should allow
59832 // SimplifyDemandedVectorElts to do more simplifications.
59833 if (IdxVal != 0) {
59834 bool AllEquiv = all_of(seq<unsigned>(NumSubElts), [&](unsigned I) {
59835 return IsElementEquivalent(NumInElts, InVec, InVec, I, I + IdxVal);
59836 });
59837 if (AllEquiv)
59838 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
59839 }
59840
59841 // Check if we're extracting a whole broadcasted subvector.
59842 if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59843 auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
59844 EVT MemVT = MemIntr->getMemoryVT();
59845 if (MemVT == VT) {
59846 // If this is the only use, we can replace with a regular load (this may
59847 // have been missed by SimplifyDemandedVectorElts due to extra uses of the
59848 // memory chain).
59849 if (InVec.hasOneUse()) {
59850 SDValue Ld =
59851 DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(),
59852 MemIntr->getMemOperand());
59853 DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1));
59854 return Ld;
59855 }
59856 }
59857 }
59858
59859 // Attempt to extract from the source of a shuffle vector.
59860 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
59861 SmallVector<int, 32> ShuffleMask;
59862 SmallVector<int, 32> ScaledMask;
59863 SmallVector<SDValue, 2> ShuffleInputs;
59864 unsigned NumSubVecs = InSizeInBits / SizeInBits;
59865 // Decode the shuffle mask and scale it so that it shuffles whole subvectors.
59866 if (getTargetShuffleInputs(InVec, ShuffleInputs, ShuffleMask, DAG) &&
59867 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
59868 unsigned SubVecIdx = IdxVal / NumSubElts;
59869 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
59870 return DAG.getUNDEF(VT);
59871 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
59872 return getZeroVector(VT, Subtarget, DAG, DL);
59873 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
59874 if (Src.getValueSizeInBits() == InSizeInBits) {
59875 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
59876 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
59877 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
59878 DL, SizeInBits);
59879 }
59880 }
59881 }
59882
59883 auto IsExtractFree = [](SDValue V) {
59884 if (V.hasOneUse()) {
59885 V = peekThroughOneUseBitcasts(V);
59886 if (V.getOpcode() == ISD::LOAD)
59887 return true;
59888 }
59889 V = peekThroughBitcasts(V);
59890 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
59891 return true;
59892 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()))
59893 return true;
59894 return V.isUndef();
59895 };
59896
59897 // If we're extracting the lowest subvector and we're the only user,
59898 // we may be able to perform this with a smaller vector width.
59899 unsigned InOpcode = InVec.getOpcode();
59900 if (InVec.hasOneUse()) {
59901 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
59902 // v2f64 CVTDQ2PD(v4i32).
59903 if (InOpcode == ISD::SINT_TO_FP &&
59904 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59905 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
59906 }
59907 // v2f64 CVTUDQ2PD(v4i32).
59908 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
59909 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59910 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
59911 }
59912 // v2f64 CVTPS2PD(v4f32).
59913 if (InOpcode == ISD::FP_EXTEND &&
59914 InVec.getOperand(0).getValueType() == MVT::v4f32) {
59915 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
59916 }
59917 }
59918 // v4i32 CVTPS2DQ(v4f32) / CVTPS2UDQ(v4f32).
59919 // v4f32 CVTDQ2PS(v4i32) / CVTUDQ2PS(v4i32).
59920 if ((InOpcode == ISD::FP_TO_SINT || InOpcode == ISD::SINT_TO_FP ||
59921 ((InOpcode == ISD::FP_TO_UINT || InOpcode == ISD::UINT_TO_FP) &&
59922 Subtarget.hasVLX())) &&
59923 (VT == MVT::v4i32 || VT == MVT::v4f32)) {
59924 SDValue Src = InVec.getOperand(0);
59925 if (Src.getValueType().getScalarSizeInBits() == 32)
59926 return DAG.getNode(InOpcode, DL, VT,
59927 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
59928 }
59929 if (IdxVal == 0 &&
59930 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
59931 (SizeInBits == 128 || SizeInBits == 256) &&
59932 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
59933 SDValue Ext = InVec.getOperand(0);
59934 if (Ext.getValueSizeInBits() > SizeInBits)
59935 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
59936 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
59937 return DAG.getNode(ExtOp, DL, VT, Ext);
59938 }
59939 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
59940 InVec.getOperand(0).getValueType().is256BitVector() &&
59941 InVec.getOperand(1).getValueType().is256BitVector() &&
59942 InVec.getOperand(2).getValueType().is256BitVector()) {
59943 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
59944 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
59945 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
59946 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
59947 }
59948 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
59949 (SizeInBits == 128 || SizeInBits == 256)) {
59950 SDValue InVecSrc = InVec.getOperand(0);
59951 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
59952 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
59953 return DAG.getNode(InOpcode, DL, VT, Ext);
59954 }
59955
59956 if (SizeInBits == 128 || SizeInBits == 256) {
59957 switch (InOpcode) {
59958 case X86ISD::MOVDDUP:
59959 return DAG.getNode(
59960 InOpcode, DL, VT,
59961 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits));
59962 case X86ISD::PSHUFD:
59963 case X86ISD::VPERMILPI:
59964 if (InVec.getOperand(0).hasOneUse()) {
59965 uint64_t M = InVec.getConstantOperandVal(1) & 255;
59966 M = VT.getScalarSizeInBits() < 64 ? M : (M >> IdxVal);
59967 return DAG.getNode(InOpcode, DL, VT,
59968 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59969 DL, SizeInBits),
59970 DAG.getTargetConstant(M, DL, MVT::i8));
59971 }
59972 break;
59973 case X86ISD::PCMPEQ:
59974 case X86ISD::PCMPGT:
59975 case X86ISD::UNPCKH:
59976 case X86ISD::UNPCKL:
59977 if (IsExtractFree(InVec.getOperand(0)) ||
59978 IsExtractFree(InVec.getOperand(1)))
59979 return DAG.getNode(InOpcode, DL, VT,
59980 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59981 DL, SizeInBits),
59982 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59983 DL, SizeInBits));
59984 break;
59985 case X86ISD::CMPP:
59986 if (IsExtractFree(InVec.getOperand(0)) ||
59987 IsExtractFree(InVec.getOperand(1)))
59988 return DAG.getNode(InOpcode, DL, VT,
59989 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59990 DL, SizeInBits),
59991 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59992 DL, SizeInBits),
59993 InVec.getOperand(2));
59994 break;
59995 case X86ISD::BLENDI:
59996 if (IsExtractFree(InVec.getOperand(0)) ||
59997 IsExtractFree(InVec.getOperand(1))) {
59998 uint64_t M = InVec.getConstantOperandVal(2) & 255;
59999 M = VT.getScalarType() == MVT::i16 ? M : (M >> IdxVal);
60000 return DAG.getNode(InOpcode, DL, VT,
60001 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
60002 DL, SizeInBits),
60003 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
60004 DL, SizeInBits),
60005 DAG.getTargetConstant(M, DL, MVT::i8));
60006 }
60007 break;
60008 case X86ISD::VPERMV:
60009 if (IdxVal != 0) {
60010 SDValue Mask = InVec.getOperand(0);
60011 SDValue Src = InVec.getOperand(1);
60012 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
60013 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
60014 DL, InSizeInBits);
60015 SDValue Shuffle = DAG.getNode(InOpcode, DL, InVecVT, Mask, Src);
60016 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
60017 }
60018 break;
60019 case X86ISD::VPERMV3:
60020 if (IdxVal != 0) {
60021 SDValue Src0 = InVec.getOperand(0);
60022 SDValue Mask = InVec.getOperand(1);
60023 SDValue Src1 = InVec.getOperand(2);
60024 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
60025 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
60026 DL, InSizeInBits);
60027 SDValue Shuffle =
60028 DAG.getNode(InOpcode, DL, InVecVT, Src0, Mask, Src1);
60029 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
60030 }
60031 break;
60032 }
60033 }
60034 }
60035
60036 // Always split vXi64 logical shifts where we're extracting the upper 32 bits,
60037 // as this is very likely to fold into a shuffle/truncation.
60038 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
60039 InVecVT.getScalarSizeInBits() == 64 &&
60040 InVec.getConstantOperandAPInt(1) == 32) {
60041 SDValue Ext =
60042 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
60043 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
60044 }
60045
60046 return SDValue();
60047}
60048
60049static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG,
60050 const X86Subtarget &Subtarget) {
60051 using namespace SDPatternMatch;
60052 EVT VT = N->getValueType(0);
60053 SDValue Src = N->getOperand(0);
60054 SDLoc DL(N);
60055
60056 // If this is a scalar_to_vector producing v1i1 from an AND with 1, bypass the and.
60057 // This occurs frequently in our masked scalar intrinsic code and our
60058 // floating point select lowering with AVX512.
60059 // TODO: SimplifyDemandedBits instead?
60060 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
60061 isOneConstant(Src.getOperand(1)))
60062 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
60063
60064 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
60065 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60066 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
60067 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
60068 isNullConstant(Src.getOperand(1)))
60069 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
60070 Src.getOperand(1));
60071
60072 // Reduce v2i64 to v4i32 if we don't need the upper bits or are known zero.
60073 // TODO: Move to DAGCombine/SimplifyDemandedBits?
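// e.g. (v2i64 scalar_to_vector (i64 zext X)) becomes a VZEXT_MOVL of
// (v4i32 scalar_to_vector X), so only the low 32 bits are moved into the vector.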
60074 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
60075 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
60076 if (Op.getValueType() != MVT::i64)
60077 return SDValue();
60078 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
60079 if (Op.getOpcode() == Opc &&
60080 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
60081 return Op.getOperand(0);
60082 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
60083 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
60084 if (Ld->getExtensionType() == Ext &&
60085 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
60086 return Op;
60087 if (IsZeroExt) {
60088 KnownBits Known = DAG.computeKnownBits(Op);
60089 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
60090 return Op;
60091 }
60092 return SDValue();
60093 };
60094
60095 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
60096 return DAG.getBitcast(
60097 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60098 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
60099
60100 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
60101 return DAG.getBitcast(
60102 VT,
60103 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
60104 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60105 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
60106 }
60107
60108 if (Src.getOpcode() == ISD::BITCAST) {
60109 SDValue SrcOp = Src.getOperand(0);
60110 // Combine (v4i32 (scalar_to_vector (i32 (bitcast (float))))) to MOVD.
60111 if (VT == MVT::v4i32 && SrcOp.getValueType() == MVT::f32)
60112 return DAG.getBitcast(
60113 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, SrcOp));
60114 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (double))))) to MOVQ.
60115 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::f64)
60116 return DAG.getBitcast(
60117 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, SrcOp));
60118 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (mmx))))) to MOVQ2DQ.
60119 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::x86mmx)
60120 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp);
60121 }
60122
60123 if (VT == MVT::v4i32) {
60124 SDValue HalfSrc;
60125 // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16))))))
60126 // to remove XMM->GPR->XMM moves.
60127 if (sd_match(Src, m_AnyExt(m_BitCast(
60128 m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc))))))
60129 return DAG.getBitcast(
60130 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc));
60131 }
60132
60133 // See if we're broadcasting the scalar value, in which case just reuse that.
60134 // Ensure the same SDValue from the SDNode use is being used.
60135 if (VT.getScalarType() == Src.getValueType())
60136 for (SDNode *User : Src->users())
60137 if (User->getOpcode() == X86ISD::VBROADCAST &&
60138 Src == User->getOperand(0)) {
60139 unsigned SizeInBits = VT.getFixedSizeInBits();
60140 unsigned BroadcastSizeInBits =
60141 User->getValueSizeInBits(0).getFixedValue();
60142 if (BroadcastSizeInBits == SizeInBits)
60143 return SDValue(User, 0);
60144 if (BroadcastSizeInBits > SizeInBits)
60145 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
60146 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
60147 // coverage.
60148 }
60149
60150 // Check for cases where we've ended up with a scalarized shift, typically
60151 // during type legalization.
60152 switch (Src.getOpcode()) {
60153 case ISD::SHL:
60154 case ISD::SRL:
60155 case ISD::SRA:
60156 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
60157 if (supportedVectorShiftWithImm(VT, Subtarget, Src.getOpcode()) &&
60158 Src.hasOneUse()) {
60159 SDValue SrcVec =
60160 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60161 unsigned Opc = getTargetVShiftUniformOpcode(Src.getOpcode(), false);
60162 return getTargetVShiftByConstNode(Opc, DL, VT.getSimpleVT(), SrcVec,
60163 Amt->getZExtValue(), DAG);
60164 }
60165 }
60166 break;
60167 case ISD::FSHL:
60168 case ISD::FSHR:
60169 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(2))) {
60170 if (supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL) &&
60171 Src.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60172 Src.getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60173 Src.hasOneUse()) {
60174 uint64_t AmtVal =
60175 Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits());
60176 SDValue SrcVec0 =
60177 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60178 SDValue SrcVec1 =
60179 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(1));
60180 return DAG.getNode(Src.getOpcode(), DL, VT, SrcVec0, SrcVec1,
60181 DAG.getConstant(AmtVal, DL, VT));
60182 }
60183 }
60184 break;
60185 }
60186
60187 return SDValue();
60188}
60189
60190// Simplify PMULDQ and PMULUDQ operations.
60191static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
60192 TargetLowering::DAGCombinerInfo &DCI,
60193 const X86Subtarget &Subtarget) {
60194 SDValue LHS = N->getOperand(0);
60195 SDValue RHS = N->getOperand(1);
60196
60197 // Canonicalize constant to RHS.
60198 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
60199 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
60200 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
60201
60202 // Multiply by zero.
60203 // Don't return RHS as it may contain UNDEFs.
60204 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
60205 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
60206
60207 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
60208 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60209 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
60210 return SDValue(N, 0);
60211
60212 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
60213 // convert it to any_extend_invec (due to the LegalOperations check), convert
60214 // it to a vector shuffle manually. This exposes combine opportunities missed
60215 // by combineEXTEND_VECTOR_INREG not calling combineX86ShufflesRecursively on
60216 // SSE4.1 targets.
60217 // FIXME: This is basically a hack around several other issues related to
60218 // ANY_EXTEND_VECTOR_INREG.
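// The {0,-1,1,-1} shuffle places the two source i32 elements in the low half
// of each i64 lane, which is all that PMULDQ/PMULUDQ read.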
60219 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
60220 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60221 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60222 LHS.getOperand(0).getValueType() == MVT::v4i32) {
60223 SDLoc dl(N);
60224 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
60225 LHS.getOperand(0), { 0, -1, 1, -1 });
60226 LHS = DAG.getBitcast(MVT::v2i64, LHS);
60227 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60228 }
60229 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
60230 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60231 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60232 RHS.getOperand(0).getValueType() == MVT::v4i32) {
60233 SDLoc dl(N);
60234 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
60235 RHS.getOperand(0), { 0, -1, 1, -1 });
60236 RHS = DAG.getBitcast(MVT::v2i64, RHS);
60237 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60238 }
60239
60240 return SDValue();
60241}
60242
60243// Simplify VPMADDUBSW/VPMADDWD operations.
60244static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
60245 TargetLowering::DAGCombinerInfo &DCI) {
60246 MVT VT = N->getSimpleValueType(0);
60247 SDValue LHS = N->getOperand(0);
60248 SDValue RHS = N->getOperand(1);
60249 unsigned Opc = N->getOpcode();
60250 bool IsPMADDWD = Opc == X86ISD::VPMADDWD;
60252 "Unexpected PMADD opcode");
60253
60254 // Multiply by zero.
60255 // Don't return LHS/RHS as it may contain UNDEFs.
60256 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
60257 ISD::isBuildVectorAllZeros(RHS.getNode()))
60258 return DAG.getConstant(0, SDLoc(N), VT);
60259
60260 // Constant folding.
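// VPMADDWD sign-extends both 16-bit inputs, multiplies, and adds adjacent
// pairs; VPMADDUBSW zero-extends the LHS bytes, sign-extends the RHS bytes,
// multiplies, and saturating-adds adjacent pairs.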
60261 APInt LHSUndefs, RHSUndefs;
60262 SmallVector<APInt> LHSBits, RHSBits;
60263 unsigned SrcEltBits = LHS.getScalarValueSizeInBits();
60264 unsigned DstEltBits = VT.getScalarSizeInBits();
60265 if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) &&
60266 getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) {
60267 SmallVector<APInt> Result;
60268 for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) {
60269 APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1];
60270 APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1];
60271 LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits);
60272 LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits);
60273 APInt Lo = LHSLo * RHSLo.sext(DstEltBits);
60274 APInt Hi = LHSHi * RHSHi.sext(DstEltBits);
60275 APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi);
60276 Result.push_back(Res);
60277 }
60278 return getConstVector(Result, VT, DAG, SDLoc(N));
60279 }
60280
60281 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60282 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60283 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60284 return SDValue(N, 0);
60285
60286 return SDValue();
60287}
60288
60289// Simplify VPMADD52L/VPMADD52H operations.
60290static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG,
60291 TargetLowering::DAGCombinerInfo &DCI) {
60292 MVT VT = N->getSimpleValueType(0);
60293
60294 bool AddLow = N->getOpcode() == X86ISD::VPMADD52L;
60295 SDValue Op0 = N->getOperand(0);
60296 SDValue Op1 = N->getOperand(1);
60297 SDValue Op2 = N->getOperand(2);
60298 SDLoc DL(N);
60299
60300 APInt C0, C1;
60301 bool HasC0 = X86::isConstantSplat(Op0, C0),
60302 HasC1 = X86::isConstantSplat(Op1, C1);
60303
60304 // lo/hi(C * X) + Z --> lo/hi(X * C) + Z
60305 if (HasC0 && !HasC1)
60306 return DAG.getNode(N->getOpcode(), DL, VT, Op1, Op0, Op2);
60307
60308 // lo(X * 1) + Z --> lo(X) + Z iff X == lo(X)
60309 if (AddLow && HasC1 && C1.trunc(52).isOne()) {
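// With at least 12 leading zero bits, Op0 already fits in 52 bits, so the
// low 52 bits of Op0 * 1 are just Op0.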
60310 KnownBits KnownOp0 = DAG.computeKnownBits(Op0);
60311 if (KnownOp0.countMinLeadingZeros() >= 12)
60312 return DAG.getNode(ISD::ADD, DL, VT, Op0, Op2);
60313 }
60314
60315 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60316 unsigned NumEltBits = VT.getScalarSizeInBits();
60317 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
60318 DCI))
60319 return SDValue(N, 0);
60320
60321 return SDValue();
60322}
60323
60324static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
60325 TargetLowering::DAGCombinerInfo &DCI,
60326 const X86Subtarget &Subtarget) {
60327 EVT VT = N->getValueType(0);
60328 SDValue In = N->getOperand(0);
60329 unsigned Opcode = N->getOpcode();
60330 unsigned InOpcode = In.getOpcode();
60331 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60332 SDLoc DL(N);
60333
60334 // Try to merge vector loads and extend_inreg to an extload.
60335 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
60336 In.hasOneUse()) {
60337 auto *Ld = cast<LoadSDNode>(In);
60338 if (Ld->isSimple()) {
60339 MVT SVT = In.getSimpleValueType().getVectorElementType();
60340 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
60341 ? ISD::SEXTLOAD
60342 : ISD::ZEXTLOAD;
60343 EVT MemVT = VT.changeVectorElementType(SVT);
60344 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
60345 SDValue Load = DAG.getExtLoad(
60346 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
60347 MemVT, Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
60348 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
60349 return Load;
60350 }
60351 }
60352 }
60353
60354 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
60355 if (Opcode == InOpcode)
60356 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
60357
60358 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
60359 // -> EXTEND_VECTOR_INREG(X).
60360 // TODO: Handle non-zero subvector indices.
60361 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
60362 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
60363 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
60364 In.getValueSizeInBits())
60365 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
60366
60367 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
60368 // TODO: Move to DAGCombine?
60369 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
60370 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
60371 In.getValueSizeInBits() == VT.getSizeInBits()) {
60372 unsigned NumElts = VT.getVectorNumElements();
60373 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
60374 EVT EltVT = In.getOperand(0).getValueType();
60375 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
60376 for (unsigned I = 0; I != NumElts; ++I)
60377 Elts[I * Scale] = In.getOperand(I);
60378 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
60379 }
60380
60381 // Attempt to combine as a shuffle on SSE41+ targets.
60382 if (Subtarget.hasSSE41()) {
60383 SDValue Op(N, 0);
60384 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
60385 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
60386 return Res;
60387 }
60388
60389 return SDValue();
60390}
60391
60392static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
60393 TargetLowering::DAGCombinerInfo &DCI) {
60394 EVT VT = N->getValueType(0);
60395 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60396 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
60397 return DAG.getConstant(0, SDLoc(N), VT);
60398
60399 // Fold kshiftr(extract_subvector(X,C1),C2)
60400 // --> extract_subvector(kshiftr(X,C1+C2),0)
60401 // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
60402 if (N->getOpcode() == X86ISD::KSHIFTR) {
60403 SDLoc DL(N);
60404 if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
60405 N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
60406 SDValue Src = N->getOperand(0).getOperand(0);
60407 uint64_t Amt = N->getConstantOperandVal(1) +
60408 N->getOperand(0).getConstantOperandVal(1);
60409 EVT SrcVT = Src.getValueType();
60410 if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
60411 SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
60412 DAG.getTargetConstant(Amt, DL, MVT::i8));
60413 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
60414 DAG.getVectorIdxConstant(0, DL));
60415 }
60416 }
60417 }
60418
60419 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60420 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60421 return SDValue(N, 0);
60422
60423 return SDValue();
60424}
60425
60426 // Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
60427 // Done as a combine because the lowering for fp16_to_fp and fp_to_fp16
60428 // produces extra instructions between the conversions by going to scalar and back.
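// i.e. scalar_to_vector -> CVTPS2PH -> CVTPH2PS -> extract element 0,
// keeping the value in XMM registers throughout.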
60429static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
60430 const X86Subtarget &Subtarget) {
60431 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
60432 return SDValue();
60433
60434 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
60435 return SDValue();
60436
60437 if (N->getValueType(0) != MVT::f32 ||
60438 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
60439 return SDValue();
60440
60441 SDLoc dl(N);
60442 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
60443 N->getOperand(0).getOperand(0));
60444 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
60445 DAG.getTargetConstant(4, dl, MVT::i32));
60446 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
60447 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
60448 DAG.getVectorIdxConstant(0, dl));
60449}
60450
60451static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
60452 TargetLowering::DAGCombinerInfo &DCI,
60453 const X86Subtarget &Subtarget) {
60454 EVT VT = N->getValueType(0);
60455 bool IsStrict = N->isStrictFPOpcode();
60456 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60457 EVT SrcVT = Src.getValueType();
60458
60459 SDLoc dl(N);
60460 if (SrcVT.getScalarType() == MVT::bf16) {
60461 if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
60462 !IsStrict && Src.getOperand(0).getValueType() == VT)
60463 return Src.getOperand(0);
60464
60465 if (!SrcVT.isVector())
60466 return SDValue();
60467
60468 assert(!IsStrict && "Strict FP doesn't support BF16");
60469 if (VT.getVectorElementType() == MVT::f64) {
60470 EVT TmpVT = VT.changeVectorElementType(MVT::f32);
60471 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
60472 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
60473 }
60474 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
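// bf16 is the upper half of an IEEE f32, so the extension is a 16-bit left
// shift of the payload within each 32-bit lane.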
60475 EVT NVT = SrcVT.changeVectorElementType(MVT::i32);
60476 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
60477 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
60478 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
60479 return DAG.getBitcast(VT, Src);
60480 }
60481
60482 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60483 return SDValue();
60484
60485 if (Subtarget.hasFP16())
60486 return SDValue();
60487
60488 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
60489 return SDValue();
60490
60491 if (VT.getVectorElementType() != MVT::f32 &&
60492 VT.getVectorElementType() != MVT::f64)
60493 return SDValue();
60494
60495 unsigned NumElts = VT.getVectorNumElements();
60496 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60497 return SDValue();
60498
60499 // Convert the input to vXi16.
60500 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
60501 Src = DAG.getBitcast(IntVT, Src);
60502
60503 // Widen to at least 8 input elements.
60504 if (NumElts < 8) {
60505 unsigned NumConcats = 8 / NumElts;
60506 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
60507 : DAG.getConstant(0, dl, IntVT);
60508 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
60509 Ops[0] = Src;
60510 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
60511 }
60512
60513 // Destination is vXf32 with at least 4 elements.
60514 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
60515 std::max(4U, NumElts));
60516 SDValue Cvt, Chain;
60517 if (IsStrict) {
60518 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
60519 {N->getOperand(0), Src});
60520 Chain = Cvt.getValue(1);
60521 } else {
60522 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
60523 }
60524
60525 if (NumElts < 4) {
60526 assert(NumElts == 2 && "Unexpected size");
60527 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
60528 DAG.getVectorIdxConstant(0, dl));
60529 }
60530
60531 if (IsStrict) {
60532 // Extend to the original VT if necessary.
60533 if (Cvt.getValueType() != VT) {
60534 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
60535 {Chain, Cvt});
60536 Chain = Cvt.getValue(1);
60537 }
60538 return DAG.getMergeValues({Cvt, Chain}, dl);
60539 }
60540
60541 // Extend to the original VT if necessary.
60542 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
60543}
60544
60545// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract.
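// e.g. if a narrower and a wider broadcast load share the same pointer,
// chain and memory type, the narrower one can be replaced by extracting the
// low bits of the wider node's result.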
60546static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
60547 TargetLowering::DAGCombinerInfo &DCI) {
60548 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
60549 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
60550 "Unknown broadcast load type");
60551
60552 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
60553 SDValue Ptr = MemIntrin->getBasePtr();
60554 SDValue Chain = MemIntrin->getChain();
60555 EVT VT = N->getSimpleValueType(0);
60556 EVT MemVT = MemIntrin->getMemoryVT();
60557
60558 // Look at other users of our base pointer and try to find a wider broadcast.
60559 // The input chain and the size of the memory VT must match.
60560 for (SDNode *User : Ptr->users())
60561 if (User != N && User->getOpcode() == N->getOpcode() &&
60562 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
60563 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
60564 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
60565 MemVT.getSizeInBits() &&
60566 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
60567 assert(cast<MemIntrinsicSDNode>(User)->isSimple() &&
60568 MemIntrin->isSimple() && "Illegal broadcast load type");
60570 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
60571 VT.getSizeInBits());
60572 Extract = DAG.getBitcast(VT, Extract);
60573 Extract = DCI.CombineTo(N, Extract, SDValue(User, 1));
60574 return Extract;
60575 }
60576
60577 return SDValue();
60578}
60579
60580static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
60581 const X86Subtarget &Subtarget) {
60582 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60583 return SDValue();
60584
60585 bool IsStrict = N->isStrictFPOpcode();
60586 EVT VT = N->getValueType(0);
60587 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60588 EVT SrcVT = Src.getValueType();
60589
60590 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
60591 SrcVT.getVectorElementType() != MVT::f32)
60592 return SDValue();
60593
60594 SDLoc dl(N);
60595
60596 SDValue Cvt, Chain;
60597 unsigned NumElts = VT.getVectorNumElements();
60598 if (Subtarget.hasFP16()) {
60599 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64),
60600 // v4f32 (xint_to_fp v4i64))))
60601 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64),
60602 // v8f16 (CVTXI2P v4i64)))
60603 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS &&
60604 Src.getNumOperands() == 2) {
60605 SDValue Cvt0, Cvt1;
60606 SDValue Op0 = Src.getOperand(0);
60607 SDValue Op1 = Src.getOperand(1);
60608 bool IsOp0Strict = Op0->isStrictFPOpcode();
60609 if (Op0.getOpcode() != Op1.getOpcode() ||
60610 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
60611 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
60612 return SDValue();
60613 }
60614 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
60615 if (IsStrict) {
60616 assert(IsOp0Strict && "Op0 must be strict node");
60617 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
60618 ? X86ISD::STRICT_CVTSI2P
60619 : X86ISD::STRICT_CVTUI2P;
60620 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60621 {Op0.getOperand(0), Op0.getOperand(1)});
60622 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60623 {Op1.getOperand(0), Op1.getOperand(1)});
60624 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60625 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
60626 }
60627 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
60628 : X86ISD::CVTUI2P;
60629 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
60630 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
60631 return DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60632 }
60633 return SDValue();
60634 }
60635
60636 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60637 return SDValue();
60638
60639 // Widen to at least 4 input elements.
60640 if (NumElts < 4)
60641 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
60642 DAG.getConstantFP(0.0, dl, SrcVT));
60643
60644 // Destination is v8i16 with at least 8 elements.
60645 EVT CvtVT =
60646 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
60647 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
60648 if (IsStrict) {
60649 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
60650 {N->getOperand(0), Src, Rnd});
60651 Chain = Cvt.getValue(1);
60652 } else {
60653 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
60654 }
60655
60656 // Extract down to real number of elements.
60657 if (NumElts < 8) {
60658 EVT IntVT = VT.changeVectorElementTypeToInteger();
60659 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
60660 DAG.getVectorIdxConstant(0, dl));
60661 }
60662
60663 Cvt = DAG.getBitcast(VT, Cvt);
60664
60665 if (IsStrict)
60666 return DAG.getMergeValues({Cvt, Chain}, dl);
60667
60668 return Cvt;
60669}
60670
60671static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
60672 SDValue Src = N->getOperand(0);
60673
60674 // Turn MOVDQ2Q+simple_load into an mmx load.
60675 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
60676 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
60677
60678 if (LN->isSimple()) {
60679 SDValue NewLd =
60680 DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), LN->getBasePtr(),
60681 LN->getPointerInfo(), LN->getBaseAlign(),
60682 LN->getMemOperand()->getFlags());
60683 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
60684 return NewLd;
60685 }
60686 }
60687
60688 return SDValue();
60689}
60690
60691static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
60692 TargetLowering::DAGCombinerInfo &DCI) {
60693 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
60694 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60695 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
60696 return SDValue(N, 0);
60697
60698 return SDValue();
60699}
60700
60701// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
60702// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
60703// use x86mmx instead.
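// Operands of type v1i64 are bitcast to x86mmx, and a v1i64 result is rebuilt
// as x86mmx and bitcast back to v1i64 for the original users.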
60704static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) {
60705 SDLoc dl(N);
60706
60707 bool MadeChange = false, CastReturnVal = false;
60708 SmallVector<SDValue, 8> Args;
60709 for (const SDValue &Arg : N->op_values()) {
60710 if (Arg.getValueType() == MVT::v1i64) {
60711 MadeChange = true;
60712 Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
60713 } else
60714 Args.push_back(Arg);
60715 }
60716 SDVTList VTs = N->getVTList();
60717 SDVTList NewVTs = VTs;
60718 if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
60719 SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
60720 NewVTArr[0] = MVT::x86mmx;
60721 NewVTs = DAG.getVTList(NewVTArr);
60722 MadeChange = true;
60723 CastReturnVal = true;
60724 }
60725
60726 if (MadeChange) {
60727 SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
60728 if (CastReturnVal) {
60729 SmallVector<SDValue, 2> Returns;
60730 for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
60731 Returns.push_back(Result.getValue(i));
60732 Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
60733 return DAG.getMergeValues(Returns, dl);
60734 }
60735 return Result;
60736 }
60737 return SDValue();
60738}
60739static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG,
60740 TargetLowering::DAGCombinerInfo &DCI) {
60741 if (!DCI.isBeforeLegalize())
60742 return SDValue();
60743
60744 unsigned IntNo = N->getConstantOperandVal(0);
60745 const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
60746
60747 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60748 return FixupMMXIntrinsicTypes(N, DAG);
60749
60750 return SDValue();
60751}
60752
60753static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
60754 TargetLowering::DAGCombinerInfo &DCI) {
60755 if (!DCI.isBeforeLegalize())
60756 return SDValue();
60757
60758 unsigned IntNo = N->getConstantOperandVal(1);
60759 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60760
60761 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60762 return FixupMMXIntrinsicTypes(N, DAG);
60763
60764 return SDValue();
60765}
60766
60767static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
60768 TargetLowering::DAGCombinerInfo &DCI) {
60769 if (!DCI.isBeforeLegalize())
60770 return SDValue();
60771
60772 unsigned IntNo = N->getConstantOperandVal(1);
60773 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60774
60775 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60776 return FixupMMXIntrinsicTypes(N, DAG);
60777
60778 return SDValue();
60779}
60780
60781SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
60782 DAGCombinerInfo &DCI) const {
60783 SelectionDAG &DAG = DCI.DAG;
60784 switch (N->getOpcode()) {
60785 // clang-format off
60786 default: break;
60787 case ISD::SCALAR_TO_VECTOR:
60788 return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
60789 case ISD::EXTRACT_VECTOR_ELT:
60790 case X86ISD::PEXTRW:
60791 case X86ISD::PEXTRB:
60792 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
60793 case ISD::CONCAT_VECTORS:
60794 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
60795 case ISD::INSERT_SUBVECTOR:
60796 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
60797 case ISD::EXTRACT_SUBVECTOR:
60798 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
60799 case ISD::VSELECT:
60800 case ISD::SELECT:
60801 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
60802 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
60803 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
60804 case X86ISD::CMP: return combineCMP(N, DAG, DCI, Subtarget);
60805 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
60806 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
60807 case X86ISD::ADD:
60808 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
60809 case X86ISD::CLOAD:
60810 case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
60811 case X86ISD::SBB: return combineSBB(N, DAG);
60812 case X86ISD::ADC: return combineADC(N, DAG, DCI);
60813 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
60814 case ISD::SHL: return combineShiftLeft(N, DAG, Subtarget);
60815 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
60816 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
60817 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
60818 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
60819 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
60820 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
60821 case ISD::AVGCEILS:
60822 case ISD::AVGCEILU:
60823 case ISD::AVGFLOORS:
60824 case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget);
60825 case X86ISD::BEXTR:
60826 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
60827 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
60828 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
60829 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
60830 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
60831 case X86ISD::VEXTRACT_STORE:
60832 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
60833 case ISD::SINT_TO_FP:
60834 case ISD::STRICT_SINT_TO_FP:
60835 return combineSIntToFP(N, DAG, DCI, Subtarget);
60836 case ISD::UINT_TO_FP:
60837 case ISD::STRICT_UINT_TO_FP:
60838 return combineUIntToFP(N, DAG, Subtarget);
60839 case ISD::FP_TO_SINT: return combineFPToSInt(N, DAG, Subtarget);
60840 case ISD::LRINT:
60841 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
60842 case ISD::FADD:
60843 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
60844 case X86ISD::VFCMULC:
60845 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
60846 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
60847 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
60848 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
60849 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
60850 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
60851 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
60852 case X86ISD::FXOR:
60853 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
60854 case X86ISD::FMIN:
60855 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
60856 case ISD::FMINNUM:
60857 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
60858 case X86ISD::CVTSI2P:
60859 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
60860 case X86ISD::CVTP2SI:
60861 case X86ISD::CVTP2UI:
60863 case X86ISD::CVTTP2SI:
60865 case X86ISD::CVTTP2UI:
60866 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
60867 case X86ISD::STRICT_CVTPH2PS:
60868 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
60869 case X86ISD::BT: return combineBT(N, DAG, DCI);
60870 case ISD::ANY_EXTEND:
60871 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
60872 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
60873 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
60874 case ISD::ANY_EXTEND_VECTOR_INREG:
60875 case ISD::SIGN_EXTEND_VECTOR_INREG:
60876 case ISD::ZERO_EXTEND_VECTOR_INREG:
60877 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
60878 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
60879 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
60880 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
60881 case X86ISD::PACKSS:
60882 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
60883 case X86ISD::HADD:
60884 case X86ISD::HSUB:
60885 case X86ISD::FHADD:
60886 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
60887 case X86ISD::VSHL:
60888 case X86ISD::VSRA:
60889 case X86ISD::VSRL:
60890 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
60891 case X86ISD::VSHLI:
60892 case X86ISD::VSRAI:
60893 case X86ISD::VSRLI:
60894 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
60895 case ISD::INSERT_VECTOR_ELT:
60896 case X86ISD::PINSRB:
60897 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
60898 case X86ISD::SHUFP: // Handle all target specific shuffles
60899 case X86ISD::INSERTPS:
60900 case X86ISD::EXTRQI:
60901 case X86ISD::INSERTQI:
60902 case X86ISD::VALIGN:
60903 case X86ISD::PALIGNR:
60904 case X86ISD::VSHLDQ:
60905 case X86ISD::VSRLDQ:
60906 case X86ISD::BLENDI:
60907 case X86ISD::UNPCKH:
60908 case X86ISD::UNPCKL:
60909 case X86ISD::MOVHLPS:
60910 case X86ISD::MOVLHPS:
60911 case X86ISD::PSHUFB:
60912 case X86ISD::PSHUFD:
60913 case X86ISD::PSHUFHW:
60914 case X86ISD::PSHUFLW:
60915 case X86ISD::MOVSHDUP:
60916 case X86ISD::MOVSLDUP:
60917 case X86ISD::MOVDDUP:
60918 case X86ISD::MOVSS:
60919 case X86ISD::MOVSD:
60920 case X86ISD::MOVSH:
60921 case X86ISD::VBROADCAST:
60922 case X86ISD::VPPERM:
60923 case X86ISD::VPERMI:
60924 case X86ISD::VPERMV:
60925 case X86ISD::VPERMV3:
60926 case X86ISD::VPERMIL2:
60927 case X86ISD::VPERMILPI:
60928 case X86ISD::VPERMILPV:
60929 case X86ISD::VPERM2X128:
60930 case X86ISD::SHUF128:
60931 case X86ISD::VZEXT_MOVL:
60932 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
60933 case X86ISD::FMADD_RND:
60934 case X86ISD::FMSUB:
60935 case X86ISD::STRICT_FMSUB:
60936 case X86ISD::FMSUB_RND:
60937 case X86ISD::FNMADD:
60938 case X86ISD::STRICT_FNMADD:
60939 case X86ISD::FNMADD_RND:
60940 case X86ISD::FNMSUB:
60941 case X86ISD::STRICT_FNMSUB:
60942 case X86ISD::FNMSUB_RND:
60943 case ISD::FMA:
60944 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
60945 case X86ISD::FMADDSUB_RND:
60946 case X86ISD::FMSUBADD_RND:
60947 case X86ISD::FMADDSUB:
60948 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
60949 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
60950 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
60951 case X86ISD::MGATHER:
60952 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
60953 case ISD::MGATHER:
60954 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
60955 case X86ISD::PCMPEQ:
60956 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
60957 case X86ISD::PMULDQ:
60958 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
60959 case X86ISD::VPMADDUBSW:
60960 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
60961 case X86ISD::VPMADD52L:
60962 case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI);
60963 case X86ISD::KSHIFTL:
60964 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
60965 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
60966 case ISD::STRICT_FP_EXTEND:
60967 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget);
60968 case ISD::STRICT_FP_ROUND:
60969 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
60970 case X86ISD::VBROADCAST_LOAD:
60971 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
60972 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
60973 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
60974 case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
60975 case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
60976 case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
60977 case ISD::FP_TO_SINT_SAT:
60978 case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
60979 // clang-format on
60980 }
60981
60982 return SDValue();
60983}
60984
60985bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
60986 return Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64);
60987}
60988
60989// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
60990bool X86TargetLowering::preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
60991 EVT ExtVT) const {
60992 return Subtarget.hasAVX512() || !VT.isVector();
60993}
60994
60995bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
60996 if (!isTypeLegal(VT))
60997 return false;
60998
60999 // There are no vXi8 shifts.
61000 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
61001 return false;
61002
61003 // TODO: Almost no 8-bit ops are desirable because they have no actual
61004 // size/speed advantages vs. 32-bit ops, but they do have a major
61005 // potential disadvantage by causing partial register stalls.
61006 //
61007 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
61008 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
61009 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
61010 // check for a constant operand to the multiply.
61011 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
61012 return false;
61013
61014 // i16 instruction encodings are longer and some i16 instructions are slow,
61015 // so those are not desirable.
61016 if (VT == MVT::i16) {
61017 switch (Opc) {
61018 default:
61019 break;
61020 case ISD::LOAD:
61021 case ISD::SIGN_EXTEND:
61022 case ISD::ZERO_EXTEND:
61023 case ISD::ANY_EXTEND:
61024 case ISD::MUL:
61025 return false;
61026 case ISD::SHL:
61027 case ISD::SRA:
61028 case ISD::SRL:
61029 case ISD::SUB:
61030 case ISD::ADD:
61031 case ISD::AND:
61032 case ISD::OR:
61033 case ISD::XOR:
61034 // NDD instructions never have the "partial register write" issue because the
61035 // destination register's upper bits [63:OSIZE] are zeroed even when
61036 // OSIZE=8/16.
61037 return Subtarget.hasNDD();
61038 }
61039 }
61040
61041 // Any legal type not explicitly accounted for above here is desirable.
61042 return true;
61043}
61044
61046 SDValue Value, SDValue Addr,
61047 int JTI,
61048 SelectionDAG &DAG) const {
61049 const Module *M = DAG.getMachineFunction().getFunction().getParent();
61050 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
61051 if (IsCFProtectionSupported) {
61052 // When control-flow branch protection is enabled, we need to add a
61053 // notrack prefix to the indirect branch.
61054 // To do that we create an NT_BRIND SDNode.
61055 // Upon ISel, the pattern will convert it to a jmp with the NoTrack prefix.
61056 SDValue Chain = Value;
61057 // Jump table debug info is only needed if CodeView is enabled.
61059 Chain = DAG.getJumpTableDebugInfo(JTI, Chain, dl);
61060 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Chain, Addr);
61061 }
61062
61063 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
61064}
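
// A minimal illustration (hypothetical helper, assuming the translation unit is
// built with -fcf-protection=branch so the "cf-protection-branch" module flag is
// set): a dense switch like the one below is typically lowered through a jump
// table, and the indirect dispatch is emitted as an NT_BRIND node, i.e. a jmp
// carrying the notrack prefix, so it is exempt from CET's ENDBR landing-pad
// requirement.
static int classifyExample(int C) {
  switch (C) {
  case 0: return 10;
  case 1: return 21;
  case 2: return 32;
  case 3: return 43;
  case 4: return 54;
  case 5: return 65;
  default: return -1;
  }
}
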
61065
61068 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
61070 EVT VT = LogicOp->getValueType(0);
61071 EVT OpVT = SETCC0->getOperand(0).getValueType();
61072 if (!VT.isInteger())
61074
61075 if (VT.isVector())
61080
61081 // Don't use `NotAnd`: even though `not` is generally shorter code size than
61082 // `add`, `add` can lower to LEA, which can save moves / spills. In any case
61083 // where `NotAnd` applies, `AddAnd` does as well.
61084 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
61085 // if we change that to `andn Y, X`, it may be worth preferring `NotAnd` here.
61087}
61088
61090 EVT VT = Op.getValueType();
61091 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
61092 isa<ConstantSDNode>(Op.getOperand(1));
61093
61094 // i16 is legal, but undesirable since i16 instruction encodings are longer
61095 // and some i16 instructions are slow.
61096 // 8-bit multiply-by-constant can usually be expanded to something cheaper
61097 // using LEA and/or other ALU ops.
61098 if (VT != MVT::i16 && !Is8BitMulByConstant)
61099 return false;
61100
61101 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
61102 if (!Op.hasOneUse())
61103 return false;
61104 SDNode *User = *Op->user_begin();
61106 return false;
61107 auto *Ld = cast<LoadSDNode>(Load);
61108 auto *St = cast<StoreSDNode>(User);
61109 return Ld->getBasePtr() == St->getBasePtr();
61110 };
61111
61112 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
61113 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
61114 return false;
61115 if (!Op.hasOneUse())
61116 return false;
61117 SDNode *User = *Op->user_begin();
61118 if (User->getOpcode() != ISD::ATOMIC_STORE)
61119 return false;
61120 auto *Ld = cast<AtomicSDNode>(Load);
61121 auto *St = cast<AtomicSDNode>(User);
61122 return Ld->getBasePtr() == St->getBasePtr();
61123 };
61124
61125 auto IsFoldableZext = [](SDValue Op) {
61126 if (!Op.hasOneUse())
61127 return false;
61128 SDNode *User = *Op->user_begin();
61129 EVT VT = User->getValueType(0);
61130 return (User->getOpcode() == ISD::ZERO_EXTEND &&
61131 (VT == MVT::i32 || VT == MVT::i64));
61132 };
61133
61134 bool Commute = false;
61135 switch (Op.getOpcode()) {
61136 default: return false;
61137 case ISD::SIGN_EXTEND:
61138 case ISD::ZERO_EXTEND:
61139 case ISD::ANY_EXTEND:
61140 break;
61141 case ISD::SHL:
61142 case ISD::SRA:
61143 case ISD::SRL: {
61144 SDValue N0 = Op.getOperand(0);
61145 // Look out for (store (shl (load), x)).
61146 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
61147 return false;
61148 break;
61149 }
61150 case ISD::MUL:
61151 // When ZU is enabled, we prefer not to promote a MUL by a constant
61152 // when there is an opportunity to fold the zext with imulzu.
61153 if (Subtarget.hasZU() && IsFoldableZext(Op) &&
61154 (isa<ConstantSDNode>(Op.getOperand(0)) ||
61155 isa<ConstantSDNode>(Op.getOperand(1))))
61156 return false;
61157 [[fallthrough]];
61158 case ISD::ADD:
61159 case ISD::AND:
61160 case ISD::OR:
61161 case ISD::XOR:
61162 Commute = true;
61163 [[fallthrough]];
61164 case ISD::SUB: {
61165 SDValue N0 = Op.getOperand(0);
61166 SDValue N1 = Op.getOperand(1);
61167 // Avoid disabling potential load folding opportunities.
61168 if (X86::mayFoldLoad(N1, Subtarget) &&
61169 (!Commute || !isa<ConstantSDNode>(N0) ||
61170 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
61171 return false;
61172 if (X86::mayFoldLoad(N0, Subtarget) &&
61173 ((Commute && !isa<ConstantSDNode>(N1)) ||
61174 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
61175 return false;
61176 if (IsFoldableAtomicRMW(N0, Op) ||
61177 (Commute && IsFoldableAtomicRMW(N1, Op)))
61178 return false;
61179 }
61180 }
61181
61182 PVT = MVT::i32;
61183 return true;
61184}
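
// A minimal illustration (hypothetical helper) of the promotion decided above:
// an i16 add such as this is normally widened to a 32-bit add to avoid the 0x66
// operand-size prefix and partial-register stalls, while cases that would lose
// a load fold, an atomic RMW fold, or an imulzu zero-extend fold are
// deliberately left at i16.
static unsigned short addU16(unsigned short A, unsigned short B) {
  return static_cast<unsigned short>(A + B); // usually performed as a 32-bit add
}
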
61185
61186//===----------------------------------------------------------------------===//
61187// X86 Inline Assembly Support
61188//===----------------------------------------------------------------------===//
61189
61192 .Case("{@cca}", X86::COND_A)
61193 .Case("{@ccae}", X86::COND_AE)
61194 .Case("{@ccb}", X86::COND_B)
61195 .Case("{@ccbe}", X86::COND_BE)
61196 .Case("{@ccc}", X86::COND_B)
61197 .Case("{@cce}", X86::COND_E)
61198 .Case("{@ccz}", X86::COND_E)
61199 .Case("{@ccg}", X86::COND_G)
61200 .Case("{@ccge}", X86::COND_GE)
61201 .Case("{@ccl}", X86::COND_L)
61202 .Case("{@ccle}", X86::COND_LE)
61203 .Case("{@ccna}", X86::COND_BE)
61204 .Case("{@ccnae}", X86::COND_B)
61205 .Case("{@ccnb}", X86::COND_AE)
61206 .Case("{@ccnbe}", X86::COND_A)
61207 .Case("{@ccnc}", X86::COND_AE)
61208 .Case("{@ccne}", X86::COND_NE)
61209 .Case("{@ccnz}", X86::COND_NE)
61210 .Case("{@ccng}", X86::COND_LE)
61211 .Case("{@ccnge}", X86::COND_L)
61212 .Case("{@ccnl}", X86::COND_GE)
61213 .Case("{@ccnle}", X86::COND_G)
61214 .Case("{@ccno}", X86::COND_NO)
61215 .Case("{@ccnp}", X86::COND_NP)
61216 .Case("{@ccns}", X86::COND_NS)
61217 .Case("{@cco}", X86::COND_O)
61218 .Case("{@ccp}", X86::COND_P)
61219 .Case("{@ccs}", X86::COND_S)
61221 return Cond;
61222}
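
// A minimal illustration (hypothetical helper, assuming GCC/Clang extended asm
// on x86) of the '@cc' flag-output constraints parsed above: the named EFLAGS
// condition is materialized for the asm output via SETcc, so the asm body does
// not need its own SETE/LAHF sequence.
static inline bool subSetsZF(unsigned A, unsigned B) {
  bool ZF;
  asm("subl %2, %1" : "=@ccz"(ZF), "+r"(A) : "r"(B));
  return ZF; // true iff the subtraction produced zero
}
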
61223
61224/// Given a constraint letter, return the type of constraint for this target.
61227 if (Constraint.size() == 1) {
61228 switch (Constraint[0]) {
61229 case 'R':
61230 case 'q':
61231 case 'Q':
61232 case 'f':
61233 case 't':
61234 case 'u':
61235 case 'y':
61236 case 'x':
61237 case 'v':
61238 case 'l':
61239 case 'k': // AVX512 masking registers.
61240 return C_RegisterClass;
61241 case 'a':
61242 case 'b':
61243 case 'c':
61244 case 'd':
61245 case 'S':
61246 case 'D':
61247 case 'A':
61248 return C_Register;
61249 case 'I':
61250 case 'J':
61251 case 'K':
61252 case 'N':
61253 case 'G':
61254 case 'L':
61255 case 'M':
61256 return C_Immediate;
61257 case 'C':
61258 case 'e':
61259 case 'Z':
61260 return C_Other;
61261 default:
61262 break;
61263 }
61264 }
61265 else if (Constraint.size() == 2) {
61266 switch (Constraint[0]) {
61267 default:
61268 break;
61269 case 'W':
61270 if (Constraint[1] != 's')
61271 break;
61272 return C_Other;
61273 case 'Y':
61274 switch (Constraint[1]) {
61275 default:
61276 break;
61277 case 'z':
61278 return C_Register;
61279 case 'i':
61280 case 'm':
61281 case 'k':
61282 case 't':
61283 case '2':
61284 return C_RegisterClass;
61285 }
61286 break;
61287 case 'j':
61288 switch (Constraint[1]) {
61289 default:
61290 break;
61291 case 'r':
61292 case 'R':
61293 return C_RegisterClass;
61294 }
61295 }
61296 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61297 return C_Other;
61298 return TargetLowering::getConstraintType(Constraint);
61299}
61300
61301/// Examine constraint type and operand type and determine a weight value.
61302/// This object must already have been set up with the operand type
61303/// and the current alternative constraint selected.
61306 AsmOperandInfo &Info, const char *Constraint) const {
61308 Value *CallOperandVal = Info.CallOperandVal;
61309 // If we don't have a value, we can't do a match,
61310 // but allow it at the lowest weight.
61311 if (!CallOperandVal)
61312 return CW_Default;
61313 Type *Ty = CallOperandVal->getType();
61314 // Look at the constraint type.
61315 switch (*Constraint) {
61316 default:
61318 [[fallthrough]];
61319 case 'R':
61320 case 'q':
61321 case 'Q':
61322 case 'a':
61323 case 'b':
61324 case 'c':
61325 case 'd':
61326 case 'S':
61327 case 'D':
61328 case 'A':
61329 if (CallOperandVal->getType()->isIntegerTy())
61330 Wt = CW_SpecificReg;
61331 break;
61332 case 'f':
61333 case 't':
61334 case 'u':
61335 if (Ty->isFloatingPointTy())
61336 Wt = CW_SpecificReg;
61337 break;
61338 case 'y':
61339 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61340 Wt = CW_SpecificReg;
61341 break;
61342 case 'Y':
61343 if (StringRef(Constraint).size() != 2)
61344 break;
61345 switch (Constraint[1]) {
61346 default:
61347 return CW_Invalid;
61348 // XMM0
61349 case 'z':
61350 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61351 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
61352 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
61353 return CW_SpecificReg;
61354 return CW_Invalid;
61355 // Conditional OpMask regs (AVX512)
61356 case 'k':
61357 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61358 return CW_Register;
61359 return CW_Invalid;
61360 // Any MMX reg
61361 case 'm':
61362 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61363 return CW_SpecificReg;
61364 return CW_Invalid;
61365 // Any SSE reg when ISA >= SSE2, same as 'x'
61366 case 'i':
61367 case 't':
61368 case '2':
61369 if (!Subtarget.hasSSE2())
61370 return CW_Invalid;
61371 break;
61372 }
61373 break;
61374 case 'j':
61375 if (StringRef(Constraint).size() != 2)
61376 break;
61377 switch (Constraint[1]) {
61378 default:
61379 return CW_Invalid;
61380 case 'r':
61381 case 'R':
61382 if (CallOperandVal->getType()->isIntegerTy())
61383 Wt = CW_SpecificReg;
61384 break;
61385 }
61386 break;
61387 case 'v':
61388 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
61389 Wt = CW_Register;
61390 [[fallthrough]];
61391 case 'x':
61392 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61393 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
61394 Wt = CW_Register;
61395 break;
61396 case 'k':
61397 // Enable conditional vector operations using %k<#> registers.
61398 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61399 Wt = CW_Register;
61400 break;
61401 case 'I':
61402 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
61403 if (C->getZExtValue() <= 31)
61404 Wt = CW_Constant;
61405 break;
61406 case 'J':
61407 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61408 if (C->getZExtValue() <= 63)
61409 Wt = CW_Constant;
61410 break;
61411 case 'K':
61412 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61413 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
61414 Wt = CW_Constant;
61415 break;
61416 case 'L':
61417 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61418 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
61419 Wt = CW_Constant;
61420 break;
61421 case 'M':
61422 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61423 if (C->getZExtValue() <= 3)
61424 Wt = CW_Constant;
61425 break;
61426 case 'N':
61427 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61428 if (C->getZExtValue() <= 0xff)
61429 Wt = CW_Constant;
61430 break;
61431 case 'G':
61432 case 'C':
61433 if (isa<ConstantFP>(CallOperandVal))
61434 Wt = CW_Constant;
61435 break;
61436 case 'e':
61437 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61438 if ((C->getSExtValue() >= -0x80000000LL) &&
61439 (C->getSExtValue() <= 0x7fffffffLL))
61440 Wt = CW_Constant;
61441 break;
61442 case 'Z':
61443 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61444 if (C->getZExtValue() <= 0xffffffff)
61445 Wt = CW_Constant;
61446 break;
61447 }
61448 return Wt;
61449}
61450
61451/// Try to replace an X constraint, which matches anything, with another that
61452/// has more specific requirements based on the type of the corresponding
61453/// operand.
61455LowerXConstraint(EVT ConstraintVT) const {
61456 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
61457 // 'f' like normal targets.
61458 if (ConstraintVT.isFloatingPoint()) {
61459 if (Subtarget.hasSSE1())
61460 return "x";
61461 }
61462
61463 return TargetLowering::LowerXConstraint(ConstraintVT);
61464}
61465
61466// Lower @cc targets via setcc.
61468 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
61469 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
61470 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
61471 if (Cond == X86::COND_INVALID)
61472 return SDValue();
61473 // Check that return type is valid.
61474 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
61475 OpInfo.ConstraintVT.getSizeInBits() < 8)
61476 report_fatal_error("Glue output operand is of invalid type");
61477
61478 // Get EFLAGS register. Only update chain when copyfrom is glued.
61479 if (Glue.getNode()) {
61480 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
61481 Chain = Glue.getValue(1);
61482 } else
61483 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
61484 // Extract CC code.
61485 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
61486 // Extend to 32-bits
61487 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
61488
61489 return Result;
61490}
61491
61492/// Lower the specified operand into the Ops vector.
61493/// If it is invalid, don't add anything to Ops.
61495 StringRef Constraint,
61496 std::vector<SDValue> &Ops,
61497 SelectionDAG &DAG) const {
61498 SDValue Result;
61499 char ConstraintLetter = Constraint[0];
61500 switch (ConstraintLetter) {
61501 default: break;
61502 case 'I':
61503 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61504 if (C->getZExtValue() <= 31) {
61505 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61506 Op.getValueType());
61507 break;
61508 }
61509 }
61510 return;
61511 case 'J':
61512 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61513 if (C->getZExtValue() <= 63) {
61514 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61515 Op.getValueType());
61516 break;
61517 }
61518 }
61519 return;
61520 case 'K':
61521 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61522 if (isInt<8>(C->getSExtValue())) {
61523 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61524 Op.getValueType());
61525 break;
61526 }
61527 }
61528 return;
61529 case 'L':
61530 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61531 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
61532 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
61533 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
61534 Op.getValueType());
61535 break;
61536 }
61537 }
61538 return;
61539 case 'M':
61540 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61541 if (C->getZExtValue() <= 3) {
61542 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61543 Op.getValueType());
61544 break;
61545 }
61546 }
61547 return;
61548 case 'N':
61549 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61550 if (C->getZExtValue() <= 255) {
61551 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61552 Op.getValueType());
61553 break;
61554 }
61555 }
61556 return;
61557 case 'O':
61558 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61559 if (C->getZExtValue() <= 127) {
61560 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61561 Op.getValueType());
61562 break;
61563 }
61564 }
61565 return;
61566 case 'e': {
61567 // 32-bit signed value
61568 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61570 C->getSExtValue())) {
61571 // Widen to 64 bits here to get it sign extended.
61572 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
61573 break;
61574 }
61575 // FIXME gcc accepts some relocatable values here too, but only in certain
61576 // memory models; it's complicated.
61577 }
61578 return;
61579 }
61580 case 'W': {
61581 assert(Constraint[1] == 's');
61582 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
61583 // offset.
61584 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
61585 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
61586 BA->getValueType(0)));
61587 } else {
61588 int64_t Offset = 0;
61589 if (Op->getOpcode() == ISD::ADD &&
61590 isa<ConstantSDNode>(Op->getOperand(1))) {
61591 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
61592 Op = Op->getOperand(0);
61593 }
61594 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61595 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
61596 GA->getValueType(0), Offset));
61597 }
61598 return;
61599 }
61600 case 'Z': {
61601 // 32-bit unsigned value
61602 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61604 C->getZExtValue())) {
61605 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61606 Op.getValueType());
61607 break;
61608 }
61609 }
61610 // FIXME gcc accepts some relocatable values here too, but only in certain
61611 // memory models; it's complicated.
61612 return;
61613 }
61614 case 'i': {
61615 // Literal immediates are always ok.
61616 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
61617 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
61618 BooleanContent BCont = getBooleanContents(MVT::i64);
61619 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
61621 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
61622 : CST->getSExtValue();
61623 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
61624 break;
61625 }
61626
61627 // In any sort of PIC mode addresses need to be computed at runtime by
61628 // adding in a register or some sort of table lookup. These can't
61629 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
61630 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
61632 return;
61633
61634 // If we are in non-pic codegen mode, we allow the address of a global (with
61635 // an optional displacement) to be used with 'i'.
61636 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61637 // If we require an extra load to get this address, as in PIC mode, we
61638 // can't accept it.
61640 Subtarget.classifyGlobalReference(GA->getGlobal())))
61641 return;
61642 break;
61643 }
61644 }
61645
61646 if (Result.getNode()) {
61647 Ops.push_back(Result);
61648 return;
61649 }
61650 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
61651}
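
// A minimal illustration (hypothetical helper) of the immediate-range
// constraints validated above: 'e' requires a 32-bit sign-extended constant and
// 'Z' a 32-bit zero-extended constant, i.e. values that fit the imm32 field of
// the 64-bit ALU encodings.
static inline unsigned long addImm32(unsigned long X) {
  asm("addq %1, %0" : "+r"(X) : "e"(0x12345678));
  return X;
}
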
61652
61653/// Check if \p RC is a general purpose register class.
61654 /// I.e., GR* or one of their variants.
61655static bool isGRClass(const TargetRegisterClass &RC) {
61656 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
61657 RC.hasSuperClassEq(&X86::GR16RegClass) ||
61658 RC.hasSuperClassEq(&X86::GR32RegClass) ||
61659 RC.hasSuperClassEq(&X86::GR64RegClass) ||
61660 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
61661}
61662
61663/// Check if \p RC is a vector register class.
61664 /// I.e., FR* / VR* or one of their variants.
61665static bool isFRClass(const TargetRegisterClass &RC) {
61666 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
61667 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
61668 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
61669 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
61670 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
61671 RC.hasSuperClassEq(&X86::VR512RegClass);
61672}
61673
61674/// Check if \p RC is a mask register class.
61675 /// I.e., VK* or one of their variants.
61676static bool isVKClass(const TargetRegisterClass &RC) {
61677 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
61678 RC.hasSuperClassEq(&X86::VK2RegClass) ||
61679 RC.hasSuperClassEq(&X86::VK4RegClass) ||
61680 RC.hasSuperClassEq(&X86::VK8RegClass) ||
61681 RC.hasSuperClassEq(&X86::VK16RegClass) ||
61682 RC.hasSuperClassEq(&X86::VK32RegClass) ||
61683 RC.hasSuperClassEq(&X86::VK64RegClass);
61684}
61685
61686static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) {
61687 return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32();
61688}
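
// A minimal illustration (hypothetical helper, assuming AVX-512 is enabled) of
// the 'k' constraint: the operands are allocated to opmask registers, i.e. one
// of the VK* register classes matched below.
static inline unsigned short maskAnd(unsigned short A, unsigned short B) {
  unsigned short R;
  asm("kandw %2, %1, %0" : "=k"(R) : "k"(A), "k"(B));
  return R;
}
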
61689
61690std::pair<unsigned, const TargetRegisterClass *>
61692 StringRef Constraint,
61693 MVT VT) const {
61694 // First, see if this is a constraint that directly corresponds to an LLVM
61695 // register class.
61696 if (Constraint.size() == 1) {
61697 // GCC Constraint Letters
61698 switch (Constraint[0]) {
61699 default: break;
61700 // 'A' means [ER]AX + [ER]DX.
61701 case 'A':
61702 if (Subtarget.is64Bit())
61703 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
61704 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
61705 "Expecting 64, 32 or 16 bit subtarget");
61706 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
61707
61708 // TODO: Slight differences here in allocation order and leaving
61709 // RIP in the class. Do they matter any more here than they do
61710 // in the normal allocation?
61711 case 'k':
61712 if (Subtarget.hasAVX512()) {
61713 if (VT == MVT::v1i1 || VT == MVT::i1)
61714 return std::make_pair(0U, &X86::VK1RegClass);
61715 if (VT == MVT::v8i1 || VT == MVT::i8)
61716 return std::make_pair(0U, &X86::VK8RegClass);
61717 if (VT == MVT::v16i1 || VT == MVT::i16)
61718 return std::make_pair(0U, &X86::VK16RegClass);
61719 }
61720 if (Subtarget.hasBWI()) {
61721 if (VT == MVT::v32i1 || VT == MVT::i32)
61722 return std::make_pair(0U, &X86::VK32RegClass);
61723 if (VT == MVT::v64i1 || VT == MVT::i64)
61724 return std::make_pair(0U, &X86::VK64RegClass);
61725 }
61726 break;
61727 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
61728 if (Subtarget.is64Bit()) {
61729 if (VT == MVT::i8 || VT == MVT::i1)
61730 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61731 ? &X86::GR8RegClass
61732 : &X86::GR8_NOREX2RegClass);
61733 if (VT == MVT::i16)
61734 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61735 ? &X86::GR16RegClass
61736 : &X86::GR16_NOREX2RegClass);
61737 if (VT == MVT::i32 || VT == MVT::f32)
61738 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61739 ? &X86::GR32RegClass
61740 : &X86::GR32_NOREX2RegClass);
61741 if (VT != MVT::f80 && !VT.isVector())
61742 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61743 ? &X86::GR64RegClass
61744 : &X86::GR64_NOREX2RegClass);
61745 break;
61746 }
61747 [[fallthrough]];
61748 // 32-bit fallthrough
61749 case 'Q': // Q_REGS
61750 if (VT == MVT::i8 || VT == MVT::i1)
61751 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
61752 if (VT == MVT::i16)
61753 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
61754 if (VT == MVT::i32 || VT == MVT::f32 ||
61755 (!VT.isVector() && !Subtarget.is64Bit()))
61756 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
61757 if (VT != MVT::f80 && !VT.isVector())
61758 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
61759 break;
61760 case 'r': // GENERAL_REGS
61761 case 'l': // INDEX_REGS
61762 if (VT == MVT::i8 || VT == MVT::i1)
61763 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61764 ? &X86::GR8RegClass
61765 : &X86::GR8_NOREX2RegClass);
61766 if (VT == MVT::i16)
61767 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61768 ? &X86::GR16RegClass
61769 : &X86::GR16_NOREX2RegClass);
61770 if (VT == MVT::i32 || VT == MVT::f32 ||
61771 (!VT.isVector() && !Subtarget.is64Bit()))
61772 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61773 ? &X86::GR32RegClass
61774 : &X86::GR32_NOREX2RegClass);
61775 if (VT != MVT::f80 && !VT.isVector())
61776 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61777 ? &X86::GR64RegClass
61778 : &X86::GR64_NOREX2RegClass);
61779 break;
61780 case 'R': // LEGACY_REGS
61781 if (VT == MVT::i8 || VT == MVT::i1)
61782 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
61783 if (VT == MVT::i16)
61784 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
61785 if (VT == MVT::i32 || VT == MVT::f32 ||
61786 (!VT.isVector() && !Subtarget.is64Bit()))
61787 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
61788 if (VT != MVT::f80 && !VT.isVector())
61789 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
61790 break;
61791 case 'f': // FP Stack registers.
61792 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
61793 // value to the correct fpstack register class.
61794 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
61795 return std::make_pair(0U, &X86::RFP32RegClass);
61796 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
61797 return std::make_pair(0U, &X86::RFP64RegClass);
61798 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
61799 return std::make_pair(0U, &X86::RFP80RegClass);
61800 break;
61801 case 'y': // MMX_REGS if MMX allowed.
61802 if (!Subtarget.hasMMX()) break;
61803 return std::make_pair(0U, &X86::VR64RegClass);
61804 case 'v':
61805 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
61806 if (!Subtarget.hasSSE1()) break;
61807 bool VConstraint = (Constraint[0] == 'v');
61808
61809 switch (VT.SimpleTy) {
61810 default: break;
61811 // Scalar SSE types.
61812 case MVT::f16:
61813 if (VConstraint && Subtarget.hasFP16())
61814 return std::make_pair(0U, &X86::FR16XRegClass);
61815 break;
61816 case MVT::f32:
61817 case MVT::i32:
61818 if (VConstraint && Subtarget.hasVLX())
61819 return std::make_pair(0U, &X86::FR32XRegClass);
61820 return std::make_pair(0U, &X86::FR32RegClass);
61821 case MVT::f64:
61822 case MVT::i64:
61823 if (VConstraint && Subtarget.hasVLX())
61824 return std::make_pair(0U, &X86::FR64XRegClass);
61825 return std::make_pair(0U, &X86::FR64RegClass);
61826 case MVT::i128:
61827 if (Subtarget.is64Bit()) {
61828 if (VConstraint && Subtarget.hasVLX())
61829 return std::make_pair(0U, &X86::VR128XRegClass);
61830 return std::make_pair(0U, &X86::VR128RegClass);
61831 }
61832 break;
61833 // Vector types and fp128.
61834 case MVT::v8f16:
61835 if (!Subtarget.hasFP16())
61836 break;
61837 if (VConstraint)
61838 return std::make_pair(0U, &X86::VR128XRegClass);
61839 return std::make_pair(0U, &X86::VR128RegClass);
61840 case MVT::v8bf16:
61841 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61842 break;
61843 if (VConstraint)
61844 return std::make_pair(0U, &X86::VR128XRegClass);
61845 return std::make_pair(0U, &X86::VR128RegClass);
61846 case MVT::f128:
61847 if (!Subtarget.is64Bit())
61848 break;
61849 [[fallthrough]];
61850 case MVT::v16i8:
61851 case MVT::v8i16:
61852 case MVT::v4i32:
61853 case MVT::v2i64:
61854 case MVT::v4f32:
61855 case MVT::v2f64:
61856 if (VConstraint && Subtarget.hasVLX())
61857 return std::make_pair(0U, &X86::VR128XRegClass);
61858 return std::make_pair(0U, &X86::VR128RegClass);
61859 // AVX types.
61860 case MVT::v16f16:
61861 if (!Subtarget.hasFP16())
61862 break;
61863 if (VConstraint)
61864 return std::make_pair(0U, &X86::VR256XRegClass);
61865 return std::make_pair(0U, &X86::VR256RegClass);
61866 case MVT::v16bf16:
61867 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61868 break;
61869 if (VConstraint)
61870 return std::make_pair(0U, &X86::VR256XRegClass);
61871 return std::make_pair(0U, &X86::VR256RegClass);
61872 case MVT::v32i8:
61873 case MVT::v16i16:
61874 case MVT::v8i32:
61875 case MVT::v4i64:
61876 case MVT::v8f32:
61877 case MVT::v4f64:
61878 if (VConstraint && Subtarget.hasVLX())
61879 return std::make_pair(0U, &X86::VR256XRegClass);
61880 if (Subtarget.hasAVX())
61881 return std::make_pair(0U, &X86::VR256RegClass);
61882 break;
61883 case MVT::v32f16:
61884 if (!Subtarget.hasFP16())
61885 break;
61886 if (VConstraint)
61887 return std::make_pair(0U, &X86::VR512RegClass);
61888 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61889 case MVT::v32bf16:
61890 if (!Subtarget.hasBF16())
61891 break;
61892 if (VConstraint)
61893 return std::make_pair(0U, &X86::VR512RegClass);
61894 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61895 case MVT::v64i8:
61896 case MVT::v32i16:
61897 case MVT::v8f64:
61898 case MVT::v16f32:
61899 case MVT::v16i32:
61900 case MVT::v8i64:
61901 if (!Subtarget.hasAVX512()) break;
61902 if (VConstraint)
61903 return std::make_pair(0U, &X86::VR512RegClass);
61904 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61905 }
61906 break;
61907 }
61908 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
61909 switch (Constraint[1]) {
61910 default:
61911 break;
61912 case 'i':
61913 case 't':
61914 case '2':
61915 return getRegForInlineAsmConstraint(TRI, "x", VT);
61916 case 'm':
61917 if (!Subtarget.hasMMX()) break;
61918 return std::make_pair(0U, &X86::VR64RegClass);
61919 case 'z':
61920 if (!Subtarget.hasSSE1()) break;
61921 switch (VT.SimpleTy) {
61922 default: break;
61923 // Scalar SSE types.
61924 case MVT::f16:
61925 if (!Subtarget.hasFP16())
61926 break;
61927 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
61928 case MVT::f32:
61929 case MVT::i32:
61930 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
61931 case MVT::f64:
61932 case MVT::i64:
61933 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
61934 case MVT::v8f16:
61935 if (!Subtarget.hasFP16())
61936 break;
61937 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61938 case MVT::v8bf16:
61939 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61940 break;
61941 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61942 case MVT::f128:
61943 case MVT::v16i8:
61944 case MVT::v8i16:
61945 case MVT::v4i32:
61946 case MVT::v2i64:
61947 case MVT::v4f32:
61948 case MVT::v2f64:
61949 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61950 // AVX types.
61951 case MVT::v16f16:
61952 if (!Subtarget.hasFP16())
61953 break;
61954 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61955 case MVT::v16bf16:
61956 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61957 break;
61958 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61959 case MVT::v32i8:
61960 case MVT::v16i16:
61961 case MVT::v8i32:
61962 case MVT::v4i64:
61963 case MVT::v8f32:
61964 case MVT::v4f64:
61965 if (Subtarget.hasAVX())
61966 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61967 break;
61968 case MVT::v32f16:
61969 if (!Subtarget.hasFP16())
61970 break;
61971 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61972 case MVT::v32bf16:
61973 if (!Subtarget.hasBF16())
61974 break;
61975 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61976 case MVT::v64i8:
61977 case MVT::v32i16:
61978 case MVT::v8f64:
61979 case MVT::v16f32:
61980 case MVT::v16i32:
61981 case MVT::v8i64:
61982 if (Subtarget.hasAVX512())
61983 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61984 break;
61985 }
61986 break;
61987 case 'k':
61988 // This register class doesn't allocate k0 for masked vector operations.
61989 if (Subtarget.hasAVX512()) {
61990 if (VT == MVT::v1i1 || VT == MVT::i1)
61991 return std::make_pair(0U, &X86::VK1WMRegClass);
61992 if (VT == MVT::v8i1 || VT == MVT::i8)
61993 return std::make_pair(0U, &X86::VK8WMRegClass);
61994 if (VT == MVT::v16i1 || VT == MVT::i16)
61995 return std::make_pair(0U, &X86::VK16WMRegClass);
61996 }
61997 if (Subtarget.hasBWI()) {
61998 if (VT == MVT::v32i1 || VT == MVT::i32)
61999 return std::make_pair(0U, &X86::VK32WMRegClass);
62000 if (VT == MVT::v64i1 || VT == MVT::i64)
62001 return std::make_pair(0U, &X86::VK64WMRegClass);
62002 }
62003 break;
62004 }
62005 } else if (Constraint.size() == 2 && Constraint[0] == 'j') {
62006 switch (Constraint[1]) {
62007 default:
62008 break;
62009 case 'r':
62010 if (VT == MVT::i8 || VT == MVT::i1)
62011 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
62012 if (VT == MVT::i16)
62013 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
62014 if (VT == MVT::i32 || VT == MVT::f32)
62015 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
62016 if (VT != MVT::f80 && !VT.isVector())
62017 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
62018 break;
62019 case 'R':
62020 if (VT == MVT::i8 || VT == MVT::i1)
62021 return std::make_pair(0U, &X86::GR8RegClass);
62022 if (VT == MVT::i16)
62023 return std::make_pair(0U, &X86::GR16RegClass);
62024 if (VT == MVT::i32 || VT == MVT::f32)
62025 return std::make_pair(0U, &X86::GR32RegClass);
62026 if (VT != MVT::f80 && !VT.isVector())
62027 return std::make_pair(0U, &X86::GR64RegClass);
62028 break;
62029 }
62030 }
62031
62032 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
62033 return std::make_pair(0U, &X86::GR32RegClass);
62034
62035 // Use the default implementation in TargetLowering to convert the register
62036 // constraint into a member of a register class.
62037 std::pair<Register, const TargetRegisterClass*> Res;
62039
62040 // Not found as a standard register?
62041 if (!Res.second) {
62042 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
62043 // to/from f80.
62044 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
62045 // Map st(0) through st(7) to FP0 through FP7.
62046 if (Constraint.size() == 7 && Constraint[0] == '{' &&
62047 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
62048 Constraint[3] == '(' &&
62049 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
62050 Constraint[5] == ')' && Constraint[6] == '}') {
62051 // st(7) is not allocatable and thus not a member of RFP80. Return
62052 // singleton class in cases where we have a reference to it.
62053 if (Constraint[4] == '7')
62054 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
62055 return std::make_pair(X86::FP0 + Constraint[4] - '0',
62056 &X86::RFP80RegClass);
62057 }
62058
62059 // GCC allows "st(0)" to be called just plain "st".
62060 if (StringRef("{st}").equals_insensitive(Constraint))
62061 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
62062 }
62063
62064 // flags -> EFLAGS
62065 if (StringRef("{flags}").equals_insensitive(Constraint))
62066 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
62067
62068 // dirflag -> DF
62069 // Only allow for clobber.
62070 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
62071 VT == MVT::Other)
62072 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
62073
62074 // fpsr -> FPSW
62075 // Only allow for clobber.
62076 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
62077 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
62078
62079 return Res;
62080 }
62081
62082 // Make sure it isn't a register that requires 64-bit mode.
62083 if (!Subtarget.is64Bit() &&
62084 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
62085 TRI->getEncodingValue(Res.first) >= 8) {
62086 // Register requires REX prefix, but we're in 32-bit mode.
62087 return std::make_pair(0, nullptr);
62088 }
62089
62090 // Make sure it isn't a register that requires AVX512.
62091 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
62092 TRI->getEncodingValue(Res.first) & 0x10) {
62093 // Register requires EVEX prefix.
62094 return std::make_pair(0, nullptr);
62095 }
62096
62097 // Otherwise, check to see if this is a register class of the wrong value
62098 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
62099 // turn into {ax},{dx}.
62100 // MVT::Other is used to specify clobber names.
62101 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
62102 return Res; // Correct type already, nothing to do.
62103
62104 // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
62105 // return "eax". This should even work for things like getting 64-bit integer
62106 // registers when given an f64 type.
62107 const TargetRegisterClass *Class = Res.second;
62108 // The generic code will match the first register class that contains the
62109 // given register. Thus, based on the ordering of the tablegened file,
62110 // the "plain" GR classes might not come first.
62111 // Therefore, use a helper method.
62112 if (isGRClass(*Class)) {
62113 unsigned Size = VT.getSizeInBits();
62114 if (Size == 1) Size = 8;
62115 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
62116 return std::make_pair(0, nullptr);
62117 Register DestReg = getX86SubSuperRegister(Res.first, Size);
62118 if (DestReg.isValid()) {
62119 bool is64Bit = Subtarget.is64Bit();
62120 const TargetRegisterClass *RC =
62121 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
62122 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
62123 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
62124 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
62125 if (Size == 64 && !is64Bit) {
62126 // Model GCC's behavior here and select a fixed pair of 32-bit
62127 // registers.
62128 switch (DestReg) {
62129 case X86::RAX:
62130 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
62131 case X86::RDX:
62132 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
62133 case X86::RCX:
62134 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
62135 case X86::RBX:
62136 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
62137 case X86::RSI:
62138 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
62139 case X86::RDI:
62140 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
62141 case X86::RBP:
62142 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
62143 default:
62144 return std::make_pair(0, nullptr);
62145 }
62146 }
62147 if (RC && RC->contains(DestReg))
62148 return std::make_pair(DestReg, RC);
62149 return Res;
62150 }
62151 // No register found/type mismatch.
62152 return std::make_pair(0, nullptr);
62153 } else if (isFRClass(*Class)) {
62154 // Handle references to XMM physical registers that got mapped into the
62155 // wrong class. This can happen with constraints like {xmm0} where the
62156 // target independent register mapper will just pick the first match it can
62157 // find, ignoring the required type.
62158
62159 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
62160 if (VT == MVT::f16)
62161 Res.second = &X86::FR16XRegClass;
62162 else if (VT == MVT::f32 || VT == MVT::i32)
62163 Res.second = &X86::FR32XRegClass;
62164 else if (VT == MVT::f64 || VT == MVT::i64)
62165 Res.second = &X86::FR64XRegClass;
62166 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
62167 Res.second = &X86::VR128XRegClass;
62168 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
62169 Res.second = &X86::VR256XRegClass;
62170 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
62171 Res.second = &X86::VR512RegClass;
62172 else {
62173 // Type mismatch and not a clobber: Return an error;
62174 Res.first = 0;
62175 Res.second = nullptr;
62176 }
62177 } else if (isVKClass(*Class)) {
62178 if (VT == MVT::v1i1 || VT == MVT::i1)
62179 Res.second = &X86::VK1RegClass;
62180 else if (VT == MVT::v8i1 || VT == MVT::i8)
62181 Res.second = &X86::VK8RegClass;
62182 else if (VT == MVT::v16i1 || VT == MVT::i16)
62183 Res.second = &X86::VK16RegClass;
62184 else if (VT == MVT::v32i1 || VT == MVT::i32)
62185 Res.second = &X86::VK32RegClass;
62186 else if (VT == MVT::v64i1 || VT == MVT::i64)
62187 Res.second = &X86::VK64RegClass;
62188 else {
62189 // Type mismatch and not a clobber: Return an error;
62190 Res.first = 0;
62191 Res.second = nullptr;
62192 }
62193 }
62194
62195 return Res;
62196}
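
// A minimal illustration (hypothetical helper and typedef, assuming AVX is
// enabled) of the 'x' vs. 'v' register-class choice made above: 'x' limits the
// operand to xmm0-xmm15 (the VR128/VR256 classes), while 'v' also admits the
// EVEX-only xmm16-xmm31 (the VR*X classes) when AVX-512VL is available.
typedef float V4SF __attribute__((vector_size(16)));
static inline V4SF addPS(V4SF A, V4SF B) {
  V4SF R;
  asm("vaddps %2, %1, %0" : "=v"(R) : "v"(A), "x"(B));
  return R;
}
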
62197
62198bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
62199 // Integer division on x86 is expensive. However, when aggressively optimizing
62200 // for code size, we prefer to use a div instruction, as it is usually smaller
62201 // than the alternative sequence.
62202 // The exception to this is vector division. Since x86 doesn't have vector
62203 // integer division, leaving the division as-is is a loss even in terms of
62204 // size, because it will have to be scalarized, while the alternative code
62205 // sequence can be performed in vector form.
62206 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
62207 return OptSize && !VT.isVector();
62208}
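
// A minimal illustration (hypothetical helper, Clang's minsize attribute
// assumed): under minsize the scalar udiv below is kept as a single 'div'
// rather than being expanded into a multiply-by-magic-constant sequence,
// whereas a vector divide would still be expanded since x86 has no vector
// integer divide instruction.
__attribute__((minsize)) static unsigned divByTen(unsigned X) { return X / 10u; }
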
62209
62210void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
62211 if (!Subtarget.is64Bit())
62212 return;
62213
62214 // Update IsSplitCSR in X86MachineFunctionInfo.
62216 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
62217 AFI->setIsSplitCSR(true);
62218}
62219
62220void X86TargetLowering::insertCopiesSplitCSR(
62221 MachineBasicBlock *Entry,
62222 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
62223 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
62224 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
62225 if (!IStart)
62226 return;
62227
62228 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
62229 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
62230 MachineBasicBlock::iterator MBBI = Entry->begin();
62231 for (const MCPhysReg *I = IStart; *I; ++I) {
62232 const TargetRegisterClass *RC = nullptr;
62233 if (X86::GR64RegClass.contains(*I))
62234 RC = &X86::GR64RegClass;
62235 else
62236 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
62237
62238 Register NewVR = MRI->createVirtualRegister(RC);
62239 // Create copy from CSR to a virtual register.
62240 // FIXME: this currently does not emit CFI pseudo-instructions, it works
62241 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
62242 // nounwind. If we want to generalize this later, we may need to emit
62243 // CFI pseudo-instructions.
62244 assert(
62245 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
62246 "Function should be nounwind in insertCopiesSplitCSR!");
62247 Entry->addLiveIn(*I);
62248 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
62249 .addReg(*I);
62250
62251 // Insert the copy-back instructions right before the terminator.
62252 for (auto *Exit : Exits)
62253 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
62254 TII->get(TargetOpcode::COPY), *I)
62255 .addReg(NewVR);
62256 }
62257}
62258
62260 return Subtarget.is64Bit();
62261}
62262
62266 const TargetInstrInfo *TII) const {
62267 assert(MBBI->isCall() && MBBI->getCFIType() &&
62268 "Invalid call instruction for a KCFI check");
62269
62270 MachineFunction &MF = *MBB.getParent();
62271 // If the call target is a memory operand, unfold it and use R11 for the
62272 // call, so KCFI_CHECK won't have to recompute the address.
62273 switch (MBBI->getOpcode()) {
62274 case X86::CALL64m:
62275 case X86::CALL64m_NT:
62276 case X86::TAILJMPm64:
62277 case X86::TAILJMPm64_REX: {
62280 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
62281 /*UnfoldStore=*/false, NewMIs))
62282 report_fatal_error("Failed to unfold memory operand for a KCFI check");
62283 for (auto *NewMI : NewMIs)
62284 MBBI = MBB.insert(OrigCall, NewMI);
62285 assert(MBBI->isCall() &&
62286 "Unexpected instruction after memory operand unfolding");
62287 if (OrigCall->shouldUpdateAdditionalCallInfo())
62288 MF.moveAdditionalCallInfo(&*OrigCall, &*MBBI);
62289 MBBI->setCFIType(MF, OrigCall->getCFIType());
62290 OrigCall->eraseFromParent();
62291 break;
62292 }
62293 default:
62294 break;
62295 }
62296
62297 MachineOperand &Target = MBBI->getOperand(0);
62298 Register TargetReg;
62299 switch (MBBI->getOpcode()) {
62300 case X86::CALL64r:
62301 case X86::CALL64r_ImpCall:
62302 case X86::CALL64r_NT:
62303 case X86::TAILJMPr64:
62304 case X86::TAILJMPr64_REX:
62305 assert(Target.isReg() && "Unexpected target operand for an indirect call");
62306 Target.setIsRenamable(false);
62307 TargetReg = Target.getReg();
62308 break;
62309 case X86::CALL64pcrel32:
62310 case X86::TAILJMPd64:
62311 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
62312 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
62313 // 64-bit indirect thunk calls.
62314 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
62315 "Unexpected register for an indirect thunk call");
62316 TargetReg = X86::R11;
62317 break;
62318 default:
62319 llvm_unreachable("Unexpected CFI call opcode");
62320 break;
62321 }
62322
62323 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
62324 .addReg(TargetReg)
62325 .addImm(MBBI->getCFIType())
62326 .getInstr();
62327}
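
// A minimal illustration (hypothetical helper): when a translation unit is
// built with -fsanitize=kcfi, an indirect call like the one below carries a CFI
// type id, and EmitKCFICheck() above emits the KCFI_CHECK against the call's
// target register (unfolding memory-operand calls through R11 first).
static int callThroughPointer(int (*Fn)(int), int V) {
  return Fn(V); // lowered as a type-checked indirect call under KCFI
}
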
62328
62329/// Returns true if stack probing through a function call is requested.
62333
62334/// Returns true if stack probing through inline assembly is requested.
62336
62337 // No inline stack probes for Windows or UEFI; they have their own mechanism.
62338 if (Subtarget.isOSWindows() || Subtarget.isUEFI() ||
62339 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62340 return false;
62341
62342 // If the function specifically requests inline stack probes, emit them.
62343 if (MF.getFunction().hasFnAttribute("probe-stack"))
62344 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
62345 "inline-asm";
62346
62347 return false;
62348}
62349
62350/// Returns the name of the symbol used to emit stack probes or the empty
62351/// string if not applicable.
62354 // Inline stack probes disable the stack probe call.
62355 if (hasInlineStackProbe(MF))
62356 return "";
62357
62358 // If the function specifically requests stack probes, emit them.
62359 if (MF.getFunction().hasFnAttribute("probe-stack"))
62360 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
62361
62362 // Generally, if we aren't on Windows, the platform ABI does not include
62363 // support for stack probes, so don't emit them.
62364 if ((!Subtarget.isOSWindows() && !Subtarget.isUEFI()) ||
62365 Subtarget.isTargetMachO() ||
62366 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62367 return "";
62368
62369 // We need a stack probe to conform to the Windows ABI. Choose the right
62370 // symbol.
62371 if (Subtarget.is64Bit())
62372 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
62373 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
62374}
62375
62376unsigned
62378 // The default stack probe size is 4096 if the function has no
62379 // "stack-probe-size" attribute.
62380 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
62381 4096);
62382}
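
// A minimal illustration (hypothetical helper, attribute values chosen for the
// example): the probing behaviour queried above is driven by IR function
// attributes, e.g. a function carrying
//   "probe-stack"="inline-asm" "stack-probe-size"="8192"
// gets inline probes emitted per 8192 bytes of allocation instead of a call to
// the chkstk routine named by getStackProbeSymbolName().
static void largeLocalArray() {
  volatile char Buffer[16384] = {}; // large enough to require stack probing
  (void)Buffer;
}
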
62383
62385 if (ML && ML->isInnermost() &&
62386 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
62389}
unsigned const MachineRegisterInfo * MRI
#define Success
static SDValue Widen(SelectionDAG *CurDAG, SDValue N)
return SDValue()
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
#define NODE_NAME_CASE(node)
constexpr LLT F64
constexpr LLT S1
AMDGPU Register Bank Select
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
#define EXPAND(Op)
Function Alias Analysis Results
BitTracker BT
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_ATTRIBUTE_UNUSED
Definition Compiler.h:298
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isSigned(unsigned int Opcode)
Hexagon Common GEP
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static int matchShuffleAsBitRotate(ArrayRef< int > Mask, int NumSubElts)
Try to lower a vector shuffle as a bit rotation.
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
static Value * LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctlz of V before the specified instruction IP.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
Live Register Matrix
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget)
Dispatching routine to lower various 128-bit LoongArch vector shuffles.
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, unsigned Depth)
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL)
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget)
Dispatching routine to lower various 256-bit LoongArch vector shuffles.
static void computeZeroableShuffleElements(ArrayRef< int > Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero)
Compute whether each element of a shuffle is zeroable.
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Attempts to match vector shuffle as byte rotation.
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
Attempts to match a shuffle mask against the VBSLL, VBSRL, VSLLI and VSRLI instruction.
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each sub-lane.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc)
Return true if node is an ISD::AND or ISD::OR of two M68k::SETcc nodes each of which has no other use...
static bool hasNonFlagsUse(SDValue Op)
return true if Op has a use that doesn't just read flags.
static bool isCMOVPseudo(MachineInstr &MI)
static SDValue combineCarryThroughADD(SDValue CCR)
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG)
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
#define G(x, y, z)
Definition MD5.cpp:56
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
#define T1
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
static constexpr MCPhysReg SPReg
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc)
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
Try to map an integer comparison with size > XLEN to vector instructions before type legalization spl...
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
Contains matchers for matching SelectionDAG nodes and values.
static bool isSimple(Instruction *I)
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
unsigned OpIndex
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file defines the SmallVector class.
static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
static bool Enabled
Definition Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
static const char LUT[]
static llvm::Type * getVectorElementType(llvm::Type *Ty)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
static KnownBits computeKnownBitsForHorizontalOperation(const Operator *I, const APInt &DemandedElts, const SimplifyQuery &Q, unsigned Depth, const function_ref< KnownBits(const KnownBits &, const KnownBits &)> KnownBitsFunc)
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &DL, unsigned VectorWidth)
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static unsigned getSUBriOpcode(bool IsLP64)
static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If both input operands of a logic op are being cast from floating-point types or FP compares,...
static bool isNoopOrBroadcastShuffleMask(ArrayRef< int > Mask)
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask)
static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget)
Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer t...
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, SDValue X, SDValue Y, SelectionDAG &DAG, bool ZeroSecondOpOnly=false)
If this is an add or subtract where one operand is produced by a cmp+setcc, then try to convert it to...
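A minimal standalone sketch (addViaCarry is hypothetical, not the LLVM helper) of the source-level shape this combine targets: one operand of the add/sub is the 0/1 result of a comparison, which x86 can fold into the carry flag via ADC/SBB instead of materializing it with setcc.
#include <cstdint>

// X plus the boolean (A < B): a candidate for a "cmp A, B; adc $0, X"-style
// lowering rather than setcc + zero-extend + add.
static uint64_t addViaCarry(uint64_t X, uint64_t A, uint64_t B) {
  return X + (A < B ? 1 : 0);
}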
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, SmallVectorImpl< SDValue > &SrcOps, SmallVectorImpl< APInt > *SrcMask=nullptr)
Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...)) style scalarized (associative) ...
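A hypothetical C++ illustration (V4i32 and orReduce are made up for this sketch) of the scalarized-reduction shape named above: the same source vector feeds a chain of one associative binop applied to its extracted elements.
#include <cstdint>

struct V4i32 { uint32_t Elt[4]; };  // stand-in for a v4i32 value

// OR(EXTRACTELT(X,0), OR(EXTRACTELT(X,1), OR(EXTRACTELT(X,2), EXTRACTELT(X,3))))
static uint32_t orReduce(const V4i32 &X) {
  return X.Elt[0] | (X.Elt[1] | (X.Elt[2] | X.Elt[3]));
}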
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1, bool &IsAlwaysSignaling)
Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
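For reference, a standalone sketch (FPCond and sseCmpImm are invented for illustration, not the LLVM routine) of the kind of mapping described here, using the classic 3-bit SSE CMPPS/CMPPD predicate encoding:
#include <cstdint>

enum FPCond { EQ, LT, LE, UNORD, NEQ, NLT, NLE, ORD };

static uint8_t sseCmpImm(FPCond C) {
  switch (C) {
  case EQ:    return 0; // CMPEQPS
  case LT:    return 1; // CMPLTPS
  case LE:    return 2; // CMPLEPS
  case UNORD: return 3; // CMPUNORDPS
  case NEQ:   return 4; // CMPNEQPS
  case NLT:   return 5; // CMPNLTPS
  case NLE:   return 6; // CMPNLEPS
  case ORD:   return 7; // CMPORDPS
  }
  return 0; // unreachable for valid predicates
}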
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC)
static bool useEGPRInlineAsm(const X86Subtarget &Subtarget)
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If a value is a scalar FP zero or a vector FP zero (potentially including undefined elements),...
static bool matchBinaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static SDValue combineSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isGRClass(const TargetRegisterClass &RC)
Check if RC is a general purpose register class.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl< SDValue > &Ops, SmallVectorImpl< int > &Mask, bool &IsUnary)
Calculates the shuffle mask corresponding to the target-specific opcode.
static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast operation that is extracted from a vector, try to vectorize the cast op followed ...
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG)
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1)
This is a helper function of LowerToHorizontalOp().
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG)
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, SelectionDAG &DAG, bool UseConcat=false)
Given the output values from getHalfShuffleMask(), create a half width shuffle of extracted vectors f...
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFPToSInt(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, int ShAmtIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle vector element shifts by a splat shift amount.
@ ConstantBit
@ NotConstantBit
@ NotShiftBit
static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc, bool NSW)
Given a buildvector constant, return a new vector constant with each element incremented or decrement...
static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode)
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane 32-bit floating point shuffles.
static MachineBasicBlock * emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII)
Utility function to emit xbegin specifying the start of an RTM region.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef< SDValue > Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
Given the initializing elements 'Elts' of a vector of type 'VT', see if the elements can be replaced ...
static bool scaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts, SmallVectorImpl< int > &ScaledMask)
static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LoadGlobalBaseReg=false, bool LocalDynamic=false)
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > BrMergingCcmpBias("x86-br-merging-ccmp-bias", cl::init(6), cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target " "supports conditional compare instructions."), cl::Hidden)
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low-bits mask, replace the 'and' with a shift-right.
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef< SDValue > Ops, ArrayRef< int > Mask, ArrayRef< const SDNode * > SrcNodes, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting an PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.
static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert i1-subvector to i1-vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a shuffle of whole 128-bit subvectors.
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDNodeFlags Flags=SDNodeFlags())
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or ...
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128 bits into a vector wider than 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256 bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG)
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
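For context, a scalar sketch (subus and uleViaSubus are illustrative names, assuming the PSUBUS-then-compare idea) of why saturating subtraction helps with unsigned comparisons: because psubus clamps at zero, X <=u Y exactly when the saturating difference is zero.
#include <cstdint>

// Saturating unsigned subtraction of one byte, as PSUBUSB does per lane.
static uint8_t subus(uint8_t X, uint8_t Y) {
  return X > Y ? uint8_t(X - Y) : uint8_t(0);
}

// X <=u Y  <=>  subus(X, Y) == 0, so an unsigned-less-equal compare can be
// lowered as PSUBUS followed by a compare against zero.
static bool uleViaSubus(uint8_t X, uint8_t Y) { return subus(X, Y) == 0; }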
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static APInt getBLENDIBlendMask(SDValue V)
Get the expanded blend mask from a BLENDI node.
static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, unsigned X86CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< unsigned > CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS, unsigned NumSignificantBitsRHS)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISDPAC...
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static SDValue getSHUFPDImmForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips sign of FP value.
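A small standalone illustration (fnegViaXor is hypothetical, not the DAG matcher) of the bit pattern this helper looks through: negating an IEEE-754 float is an XOR of its representation with the sign-bit mask.
#include <cstdint>
#include <cstring>

static float fnegViaXor(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  Bits ^= 0x80000000u;                 // flip only the sign bit
  float Result;
  std::memcpy(&Result, &Bits, sizeof(Result));
  return Result;
}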
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > OriginalMask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single - truncated - integer element, coming from a scalar_to_vector/buil...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd, bool &HasAllowContract)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub operations followed by a concat_vector.
static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, unsigned Amt=0)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary 64-bit vperm shuffle followed by a 256-bit unpack.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, X86Subtarget const &Subtarget, SelectionDAG &DAG)
Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats representing a blend.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts, bool AllowUndefs)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
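A minimal self-check of the bit identity behind this fold, written as plain C++ rather than DAG nodes: Y & (X ^ -X) equals Y & ~BLSMSK(X), where BLSMSK(X) = X ^ (X - 1), so the neg/xor/and chain collapses to one BLSMSK plus one ANDN.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Ys[] = {0, 1, 0x0F0F0F0F, 0xDEADBEEF, 0xFFFFFFFF};
  for (uint32_t X = 0; X < 4096; ++X)
    for (uint32_t Y : Ys) {
      uint32_t Lhs = Y & (X ^ (0u - X));  // AND(Y, XOR(X, NEG(X)))
      uint32_t Blsmsk = X ^ (X - 1u);     // bits up to and including the lowest set bit
      uint32_t Rhs = ~Blsmsk & Y;         // ANDN of BLSMSK(X) with Y
      assert(Lhs == Rhs);
    }
  return 0;
}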
static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true, bool AllowAVX512=true)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with chain that return their value into registers EDX:EAX.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, Register Reg)
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG)
static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef< int > Mask, int BroadcastableElement=0)
Test whether the specified input (0 or 1) is a broadcast/splat blended by the given mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
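A hypothetical source-level example (bitIsSet is illustrative only) of the pattern this lowering targets: an AND with a single, possibly variable-indexed bit compared against zero, which maps naturally onto BT plus a carry-flag consumer.
#include <cstdint>

// The (X & (1 << N)) != 0 shape is what LowerAndToBT turns into an
// X86ISD::BT node plus a setcc/branch on the carry flag.
static bool bitIsSet(uint32_t X, unsigned N) {
  return (X & (1u << N)) != 0;
}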
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
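A scalar reference (illustration only, assuming a PSADBW-against-zero style lowering; horizontalByteSum64 is a made-up name) for what a horizontal byte sum of one 64-bit element computes:
#include <cstdint>

// Sum of the eight bytes of V; PSADBW(V, 0) produces this value in the low
// 16 bits of the corresponding 64-bit lane.
static uint64_t horizontalByteSum64(uint64_t V) {
  uint64_t Sum = 0;
  for (unsigned I = 0; I != 8; ++I)
    Sum += (V >> (8 * I)) & 0xFF;
  return Sum;
}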
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG)
static void growShuffleMask(ArrayRef< int > SrcMask, SmallVectorImpl< int > &DstMask, unsigned SrcSizeInBits, unsigned DstSizeInBits)
static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to compute the in-lane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isX86CCSigned(X86::CondCode X86CC)
Return true if the condition is a signed comparison operation.
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt=0)
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT, int Scale, int Offset, unsigned ExtOpc, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as an any/signed/zero extension.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a VSETCC 256/512-bit vector into two new 128/256 ones and then concatenate the result back.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into two half-sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isBlendOrUndef(ArrayRef< int > Mask)
Return true if every element in Mask is an in-place blend/select mask or is undef.
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
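A standalone sketch (shuffleImm4 is hypothetical) of the packing this helper performs: each of the four destination lanes gets a 2-bit source index, with lane 0 in the least-significant bits of the PSHUFD/SHUFPS-style immediate.
#include <cassert>
#include <cstdint>

static uint8_t shuffleImm4(const int Mask[4]) {
  uint8_t Imm = 0;
  for (int I = 0; I < 4; ++I) {
    assert(Mask[I] >= 0 && Mask[I] < 4 && "expects a resolved 4-lane mask");
    Imm |= uint8_t(Mask[I]) << (2 * I);
  }
  return Imm;  // e.g. the identity mask {0,1,2,3} packs to 0xE4
}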
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit to mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if opcode is a X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static cl::opt< bool > WidenShift("x86-widen-shift", cl::init(true), cl::desc("Replace narrow shifts with wider shifts."), cl::Hidden)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
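A scalar reference for the shape being matched (a sketch under that reading, not the matcher itself; truncateSSat32To16 is a made-up name): clamp to the destination type's signed range with smax/smin, then truncate, shown here for i32 -> i16.
#include <algorithm>
#include <cstdint>

// (trunc (smin (smax X, -32768), 32767)) written as plain C++.
static int16_t truncateSSat32To16(int32_t X) {
  int32_t Clamped = std::min<int32_t>(std::max<int32_t>(X, INT16_MIN), INT16_MAX);
  return static_cast<int16_t>(Clamped);
}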
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit a truncating store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
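For illustration only, a minimal sketch of what such a no-op-mask check could look like, assuming the usual shuffle-mask convention that -1 marks an undef lane (this is not the file's actual implementation, and the helper name is made up):

#include "llvm/ADT/ArrayRef.h"

static bool isNoopMaskSketch(llvm::ArrayRef<int> Mask) {
  // A mask is a no-op if every defined lane selects its own position;
  // undef lanes (-1) are ignored.
  for (int I = 0, E = (int)Mask.size(); I != E; ++I)
    if (Mask[I] >= 0 && Mask[I] != I)
      return false;
  return true;
}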
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector of zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask, bool ForceHorizOp)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext); zext(add_nuw(x, C)) --> add(zext(x), C_zext). Promoting a...
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of an ISD::CondCode to the X86-specific condition code,...
static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned, const X86Subtarget &Subtarget)
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
ShrinkMode
Different mul shrinking modes.
static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDNodeFlags Flags=SDNodeFlags())
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static unsigned getSHUFPDImm(ArrayRef< int > Mask)
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG)
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a unary integer operation into 2 half-sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget, TargetLowering::DAGCombinerInfo &DCI)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, unsigned RootOpc, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses, bool AllowSubAddOrAddSubContract)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half-sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z)). This undoes the inverse fold performed in InstCom...
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128 bits from a vector > 128 bits.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, unsigned RootOpcode, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using a natively supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from a mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code?
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
auto IsFreeTruncation
static const unsigned FramePtr
The Input class is used to parse a yaml document into in-memory structs and vectors.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
static LLVM_ABI APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcast from an all-ones integer value.
Definition APFloat.cpp:6082
void clearSign()
Definition APFloat.h:1298
opStatus next(bool nextDown)
Definition APFloat.h:1254
void changeSign()
Definition APFloat.h:1297
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
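As a rough, illustrative sketch of the APFloat helpers listed above (getZero, changeSign, convert); the function name apfloatSketch and the chosen values are made up for the example:

#include "llvm/ADT/APFloat.h"
using namespace llvm;

void apfloatSketch() {
  APFloat F = APFloat::getZero(APFloat::IEEEdouble(), /*Negative=*/true); // -0.0
  F.changeSign();                                                         // now +0.0
  bool LosesInfo = false;
  // Narrow to single precision, rounding to nearest-even.
  F.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &LosesInfo);
}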
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1406
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition APInt.cpp:520
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:206
void setBit(unsigned BitPosition)
Set the bit at the position given by "bitPosition" to 1.
Definition APInt.h:1330
APInt abs() const
Get the absolute value.
Definition APInt.h:1795
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
void setSignBit()
Set the sign bit to 1.
Definition APInt.h:1340
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:209
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition APInt.h:216
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:329
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition APInt.h:1249
bool eq(const APInt &RHS) const
Equality comparison.
Definition APInt.h:1079
int32_t exactLogBase2() const
Definition APInt.h:1783
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1396
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition APInt.h:834
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:435
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition APInt.h:1628
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1598
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:219
unsigned countTrailingZeros() const
Definition APInt.h:1647
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1531
void flipAllBits()
Toggle every bit to its opposite value.
Definition APInt.h:1452
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1615
LLVM_ABI void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition APInt.cpp:397
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition APInt.h:1435
unsigned logBase2() const
Definition APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
void setAllBits()
Set every bit to 1.
Definition APInt.h:1319
bool getBoolValue() const
Convert APInt to a boolean value.
Definition APInt.h:471
bool isMask(unsigned numBits) const
Definition APInt.h:488
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:405
bool isNonNegative() const
Determine if this APInt value is non-negative (>= 0).
Definition APInt.h:334
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1150
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1367
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition APInt.h:873
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition APInt.h:341
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:200
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1388
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:432
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1656
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition APInt.h:399
LLVM_ABI APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition APInt.cpp:973
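A brief, assumed-usage snippet (not taken from this file) exercising a few of the APInt operations listed above; the variable names are illustrative:

#include "llvm/ADT/APInt.h"
using namespace llvm;

void apintSketch() {
  APInt Mask = APInt::getLowBitsSet(/*numBits=*/32, /*loBitsSet=*/8); // 0x000000FF
  Mask <<= 4;                                                         // 0x00000FF0
  unsigned TrailingZeros = Mask.countr_zero();                        // 4
  bool IsPow2 = Mask.isPowerOf2();                                    // false: more than one bit set
  APInt Wide = Mask.zext(64);                                         // zero-extend to a 64-bit value
  (void)TrailingZeros; (void)IsPow2; (void)Wide;
}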
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:183
iterator end() const
Definition ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:206
iterator begin() const
Definition ArrayRef.h:135
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
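A small, hypothetical example of the ArrayRef views listed above (slice, drop_back, equals); the data is made up:

#include "llvm/ADT/ArrayRef.h"
using namespace llvm;

void arrayRefSketch() {
  int Storage[] = {0, 1, 2, 3, 4, 5};
  ArrayRef<int> Whole(Storage);             // non-owning view of all six elements
  ArrayRef<int> Middle = Whole.slice(1, 4); // {1, 2, 3, 4}
  ArrayRef<int> Front = Whole.drop_back(2); // {0, 1, 2, 3}
  bool Same = Middle.equals(Front);         // false
  (void)Same;
}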
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
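To show how the BinOp enumeration above is typically consumed, here is a hedged IRBuilder sketch; the helper name emitAtomicAdd is made up, the builder, pointer, and value are assumed to be set up elsewhere, and the CreateAtomicRMW signature shown matches recent LLVM releases but may differ in older ones:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

// Emits: *Ptr = *Ptr + Val, atomically, with seq_cst ordering.
AtomicRMWInst *emitAtomicAdd(IRBuilder<> &B, Value *Ptr, Value *Val) {
  return B.CreateAtomicRMW(AtomicRMWInst::Add, Ptr, Val, MaybeAlign(),
                           AtomicOrdering::SequentiallyConsistent);
}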
This is an SDNode representing atomic operations.
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
size_type count() const
count - Returns the number of bits which are set.
Definition BitVector.h:181
bool any() const
any - Returns true if any bit is set.
Definition BitVector.h:189
bool none() const
none - Returns true if none of the bits are set.
Definition BitVector.h:207
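For reference, a tiny illustrative use of the BitVector predicates listed above (none, any, count); the function name is made up:

#include "llvm/ADT/BitVector.h"
using namespace llvm;

void bitVectorSketch() {
  BitVector BV(8);              // eight bits, all initially clear
  bool NoneSet = BV.none();     // true
  BV.set(3);
  BV.set(5);
  unsigned NumSet = BV.count(); // 2
  bool AnySet = BV.any();       // true
  (void)NoneSet; (void)NumSet; (void)AnySet;
}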
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI bool isConstant() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_NE
not equal
Definition InstrTypes.h:698
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:765
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static LLVM_ABI Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
LLVM_ABI Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:167
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:237
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:222
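A minimal, assumed-usage DenseMap sketch covering find, try_emplace, and insert from the listing above; the key and value types are arbitrary:

#include "llvm/ADT/DenseMap.h"
using namespace llvm;

void denseMapSketch() {
  DenseMap<unsigned, int> Counts;
  Counts.try_emplace(7, 1);                    // inserts {7, 1}
  auto [It, Inserted] = Counts.insert({7, 5}); // key exists: nothing inserted
  if (auto Found = Counts.find(7); Found != Counts.end())
    ++Found->second;                           // value becomes 2
  (void)It; (void)Inserted;
}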
Tagged union holding either a T or a Error.
Definition Error.h:485
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition Function.cpp:774
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition Function.h:903
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
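A short, hedged example of the Function attribute queries listed above (hasOptSize, hasMinSize, hasFnAttribute, getFnAttribute); the policy implemented by shouldOptimizeAggressively is invented purely for illustration:

#include "llvm/IR/Function.h"
using namespace llvm;

bool shouldOptimizeAggressively(const Function &F) {
  // Back off for size-optimized or explicitly unoptimized functions.
  if (F.hasOptSize() || F.hasMinSize())
    return false;
  if (F.hasFnAttribute(Attribute::OptimizeNone))
    return false;
  // String attributes are retrieved via getFnAttribute(...).getValueAsString().
  StringRef Features = F.getFnAttribute("target-features").getValueAsString();
  return !Features.empty();
}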
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
LLVM_ABI bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition Globals.cpp:437
ThreadLocalMode getThreadLocalMode() const
Module * getParent()
Get the module that this global value is contained inside of...
This class is used to form a handle around another node that is persistent and is updated across invo...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
Class to represent integer types.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
bool usesWindowsCFI() const
Definition MCAsmInfo.h:652
LLVM_ABI MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
LLVM_ABI MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool bitsGE(MVT VT) const
Return true if this has no less bits than VT.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
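An illustrative sketch of the MVT queries listed above; the specific types chosen are arbitrary, and the header path reflects recent LLVM (older releases use llvm/Support/MachineValueType.h):

#include "llvm/CodeGen/MachineValueType.h"
using namespace llvm;

void mvtSketch() {
  MVT VT = MVT::v4i32;
  bool Is128 = VT.is128BitVector();                // true: 4 x 32 bits
  unsigned NumElts = VT.getVectorNumElements();    // 4
  MVT EltVT = VT.getVectorElementType();           // MVT::i32
  MVT Halved = VT.getHalfNumVectorElementsVT();    // MVT::v2i32
  MVT AsFP = VT.changeVectorElementType(MVT::f32); // MVT::v4f32
  (void)Is128; (void)NumElts; (void)EltVT; (void)Halved; (void)AsFP;
}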
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
succ_reverse_iterator succ_rbegin()
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
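A hedged sketch of how lowering code reserves stack space through these MachineFrameInfo hooks; the sizes and offsets below are illustrative, not values this file actually uses:
  // Illustrative: one 8-byte spill slot plus one fixed slot modelling an
  // incoming stack argument at SP+16 on entry.
  static void reserveExampleSlots(MachineFunction &MF) {
    MachineFrameInfo &MFI = MF.getFrameInfo();
    int SpillFI = MFI.CreateStackObject(/*Size=*/8, Align(8),
                                        /*isSpillSlot=*/true);
    int ArgFI = MFI.CreateFixedObject(/*Size=*/8, /*SPOffset=*/16,
                                      /*IsImmutable=*/true);
    MFI.setFrameAddressIsTaken(true); // e.g. when lowering llvm.frameaddress
    (void)SpillFI;
    (void)ArgFI;
  }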
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
void moveAdditionalCallInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New call site info.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
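For instance, a target memory node needs a MachineMemOperand describing what it touches; a hedged sketch using the getMachineMemOperand overload listed above (the frame index FI is assumed to have been created elsewhere):
  // Illustrative: describe an 8-byte store to stack slot FI so scheduling and
  // alias analysis know what the custom node accesses.
  static MachineMemOperand *describeStackStore(MachineFunction &MF, int FI) {
    return MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
                                   MachineMemOperand::MOStore, LLT::scalar(64),
                                   Align(8));
  }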
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
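These operand helpers are chained after BuildMI; a hedged sketch (the opcode and register choices are purely illustrative):
  // Illustrative: materialize a 32-bit immediate into Dst, then branch
  // unconditionally to Target.
  static void emitConstThenJump(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I,
                                const DebugLoc &DL, const TargetInstrInfo *TII,
                                Register Dst, MachineBasicBlock *Target) {
    BuildMI(MBB, I, DL, TII->get(X86::MOV32ri), Dst).addImm(42);
    BuildMI(MBB, I, DL, TII->get(X86::JMP_1)).addMBB(Target);
  }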
Representation of each machine instruction.
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of a block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
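A hedged one-liner showing how custom inserters obtain scratch registers through MachineRegisterInfo (GR32 is an illustrative register-class choice):
  // Illustrative: a fresh 32-bit virtual GPR, typically consumed by an
  // expanded pseudo-instruction.
  static Register makeScratchGPR(MachineFunction &MF) {
    return MF.getRegInfo().createVirtualRegister(&X86::GR32RegClass);
  }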
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition Module.cpp:353
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
SDNode * getGluedUser() const
If this node has a glue value with a user, return the user (there is at most one).
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
SDNodeFlags getFlags() const
TypeSize getValueSizeInBits(unsigned ResNo) const
Returns MVT::getSizeInBits(getValueType(ResNo)).
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static LLVM_ABI bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there is any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
op_iterator op_begin() const
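A hedged sketch of the kind of use-list scan these SDNode accessors support, e.g. deciding whether a combine is safe because every user is a plain store:
  // Illustrative: reject the transform unless every user of N is ISD::STORE.
  static bool allUsersAreStores(SDNode *N) {
    for (const SDNode *U : N->users())
      if (U->getOpcode() != ISD::STORE)
        return false;
    return true;
  }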
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
Get the SDNode which holds the desired result.
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
Get the index which selects a specific result in the SDNode.
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
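Many combines in this file peel apart an (add X, C) with exactly these SDValue accessors; a hedged sketch (stripAddConstant is an illustrative name):
  // Illustrative: if V is (add X, constant), return X and report the constant
  // through Imm; otherwise return V unchanged with Imm == 0.
  static SDValue stripAddConstant(SDValue V, uint64_t &Imm) {
    if (V.getOpcode() == ISD::ADD && isa<ConstantSDNode>(V.getOperand(1))) {
      Imm = V.getConstantOperandVal(1);
      return V.getOperand(0);
    }
    Imm = 0;
    return V;
  }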
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
virtual bool isTargetStrictFPOpcode(unsigned Opcode) const
Returns true if a node with the given target-specific opcode has strict floating-point semantics.
Helper to insert SDNodeFlags automatically during transformations.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
LLVM_ABI SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
LLVM_ABI SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
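As an illustration, an unsigned minimum can be rebuilt from getSetCC together with getSelect (listed further below); a hedged sketch, not code from this file:
  // Illustrative: build umin(A, B) as select(setult(A, B), A, B), asking the
  // target for the correct setcc result type.
  static SDValue buildUMin(SelectionDAG &DAG, const SDLoc &DL, SDValue A,
                           SDValue B) {
    EVT VT = A.getValueType();
    EVT CCVT = DAG.getTargetLoweringInfo().getSetCCResultType(
        DAG.getDataLayout(), *DAG.getContext(), VT);
    SDValue Cond = DAG.getSetCC(DL, CCVT, A, B, ISD::SETULT);
    return DAG.getSelect(DL, VT, Cond, A, B);
  }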
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getExtractSubvector(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Return the VT typed sub-vector of Vec at Idx.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getInsertSubvector(const SDLoc &DL, SDValue Vec, SDValue SubVec, unsigned Idx)
Insert SubVec at the Idx element of Vec.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
LLVM_ABI bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
LLVM_ABI SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI std::optional< unsigned > getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has a uniform shift amount that is less than the element bit-width of the shi...
LLVM_ABI SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
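A hedged sketch of a chained load-then-store built with getLoad and getStore; the empty MachinePointerInfo is a simplification, and real call sites in this file carry precise pointer info:
  // Illustrative: load a VT value from Src and store it to Dst, threading the
  // chain through both operations; returns the store's output chain.
  static SDValue copyScalar(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                            SDValue Chain, SDValue Src, SDValue Dst, Align A) {
    SDValue Val = DAG.getLoad(VT, DL, Chain, Src, MachinePointerInfo(), A);
    return DAG.getStore(Val.getValue(1), DL, Val, Dst, MachinePointerInfo(), A);
  }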
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, SDNodeFlags Flags=SDNodeFlags())
LLVM_ABI std::optional< unsigned > getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has shift amounts that are all less than the element bit-width of the shift n...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getSplat(EVT VT, const SDLoc &DL, SDValue Op)
Returns a node representing a splat of one value into all lanes of the provided vector type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
LLVM_ABI SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
static LLVM_ABI bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
static int getSplatMaskIndex(ArrayRef< int > Mask)
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
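These mask utilities mirror what getCommutedVectorShuffle (listed above) does internally; a hedged sketch of commuting a shuffle by hand:
  // Illustrative: swap the two shuffle inputs and fix the mask up to match.
  static SDValue commuteShuffle(SelectionDAG &DAG, const SDLoc &DL,
                                ShuffleVectorSDNode *SVN) {
    SmallVector<int, 16> Mask(SVN->getMask().begin(), SVN->getMask().end());
    ShuffleVectorSDNode::commuteMask(Mask); // indices now refer to swapped ops
    return DAG.getVectorShuffle(SVN->getValueType(0), DL, SVN->getOperand(1),
                                SVN->getOperand(0), Mask);
  }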
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
size_type size() const
Definition SmallSet.h:170
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
pointer data()
Return a pointer to the vector's buffer, even if empty().
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
static constexpr size_t npos
Definition StringRef.h:57
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition StringRef.h:172
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
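StringSwitch is how textual names (register names, constraint strings) get mapped onto enums in this file; a hedged, generic sketch (the CCKind enum is invented for the example):
  // Illustrative: map a condition-code suffix onto a small enum.
  enum class CCKind { EQ, NE, LT, Unknown };
  static CCKind parseCCSuffix(StringRef Name) {
    return StringSwitch<CCKind>(Name)
        .Case("e", CCKind::EQ)
        .Case("ne", CCKind::NE)
        .Case("l", CCKind::LT)
        .Default(CCKind::Unknown);
  }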
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
virtual bool hasAndNot(SDValue X) const
Return true if the target has a bitwise and-not operation: X = ~A & B. This can be used to simplify se...
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to ...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.stacksave/llvm.stackrestore should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.stacksave/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
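The configuration hooks above are driven from the X86TargetLowering constructor. The fragment below sketches that style of setup; it only compiles inside such a constructor (the setters are protected), and the specific choices are illustrative rather than copied from this file:
  // Constructor-body fragment (illustrative only):
  addRegisterClass(MVT::v4i32, &X86::VR128RegClass);  // v4i32 lives in XMM
  setOperationAction(ISD::MUL, MVT::v4i32, Custom);   // custom-lower v4i32 mul
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);    // no i64->i32 truncstore
  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Custom);
  computeRegisterProperties(Subtarget.getRegisterInfo()); // after all classes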
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual unsigned combineRepeatedFPDivisors() const
Indicate whether this target prefers to combine FDIVs with the same divisor.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSBinFormatCOFF() const
Tests whether the OS uses the COFF binary format.
Definition Triple.h:774
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
use_iterator use_begin()
Definition Value.h:364
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
Register getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
bool hasBasePointer(const MachineFunction &MF) const
Register getPtrSizedFrameRegister(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
Register getPtrSizedStackRegister(const MachineFunction &MF) const
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
bool hasAnyFMA() const
bool hasSSE1() const
bool avoidMFence() const
Avoid use of mfence for fence seq_cst, and instead use lock or.
bool hasBitScanPassThrough() const
bool hasSSE42() const
const X86TargetLowering * getTargetLowering() const override
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
bool canUseCMOV() const
bool isTargetDarwin() const
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
const X86InstrInfo * getInstrInfo() const override
bool useAVX512Regs() const
bool hasSSE3() const
bool isCallingConvWin64(CallingConv::ID CC) const
bool hasAVX512() const
bool canExtendTo512DQ() const
bool hasSSE41() const
bool hasSSE2() const
bool hasSSSE3() const
bool hasInt256() const
const X86RegisterInfo * getRegisterInfo() const override
bool hasAVX() const
unsigned getPreferVectorWidth() const
const X86FrameLowering * getFrameLowering() const override
bool useBWIRegs() const
bool hasAVX2() const
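These subtarget predicates gate most lowering decisions in the file; a hedged sketch of a typical width-selection helper (the policy shown is illustrative, not the file's actual heuristic):
  // Illustrative: pick the widest integer-vector width worth using here.
  static unsigned pickVectorWidth(const X86Subtarget &Subtarget) {
    if (Subtarget.useAVX512Regs())
      return 512;
    if (Subtarget.hasAVX2())
      return 256;
    if (Subtarget.hasSSE2())
      return 128;
    return 0; // scalar only
  }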
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is a legal add immediate, that is the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const override
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is a legal icmp immediate, that is, the target has icmp instructi...
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y ---> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const override
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return the preferred fold type: Abs if this is a vector, AddAnd if it is an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the target's addressing mode can target thread-local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isTargetCanonicalSelect(SDNode *N) const override
Return true if the given select/vselect should be considered canonical and not be transformed.
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return true if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
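Taken together, the overrides above are how X86TargetLowering customizes the generic SelectionDAG lowering. As a rough, hypothetical sketch (not X86 code), a backend opts into these hooks roughly like this:
  #include "llvm/CodeGen/TargetLowering.h"
  using namespace llvm;

  // Hypothetical target, for illustration only; real backends derive from
  // TargetLowering exactly as X86TargetLowering does above.
  class MyTargetLowering : public TargetLowering {
  public:
    explicit MyTargetLowering(const TargetMachine &TM) : TargetLowering(TM) {}
    // Claim a fast count-leading-zeros instruction so ctlz is not expanded.
    bool isCtlzFast() const override { return true; }
    // Report that speculating cttz is not cheap on this (made-up) ISA.
    bool isCheapToSpeculateCttz(Type *Ty) const override { return false; }
  };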
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:201
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define INT64_MIN
Definition DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition APInt.cpp:3009
@ COND_NE
Not equal.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ X86_ThisCall
Similar to X86_StdCall.
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
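These CallingConv IDs are plain attributes on IR functions and call sites; a minimal sketch (assuming an existing Function *F and a CallInst *CI that calls it) looks like:
  #include "llvm/IR/CallingConv.h"
  #include "llvm/IR/Function.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  void markAsFastCall(Function *F, CallInst *CI) {
    F->setCallingConv(CallingConv::X86_FastCall);  // convention of the callee
    CI->setCallingConv(CallingConv::X86_FastCall); // call site must match
  }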
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:525
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition ISDOpcodes.h:140
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:167
@ GlobalAddress
Definition ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition ISDOpcodes.h:892
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition ISDOpcodes.h:151
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:706
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:117
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:809
@ SSUBO
Same as [SU]ADDO, but for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:663
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ STRICT_FMAXIMUM
Definition ISDOpcodes.h:463
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition ISDOpcodes.h:130
@ SMULO
Same as [SU]ADDO, but for multiplication.
Definition ISDOpcodes.h:351
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition ISDOpcodes.h:881
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:477
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:701
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively places vector elements based on mask e....
Definition ISDOpcodes.h:690
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition ISDOpcodes.h:903
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:927
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:713
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
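During lowering these ISD opcodes are created through SelectionDAG; an illustrative fragment (DAG, DL, VT, LHS and RHS assumed to be in scope, as inside LowerOperation or a combine) might read:
  SDValue Sum   = DAG.getNode(ISD::ADD, DL, VT, LHS, RHS);
  SDValue Wide  = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Sum);
  SDValue Zero  = DAG.getConstant(0, DL, MVT::i64);
  SDValue IsNeg = DAG.getSetCC(DL, MVT::i1, Wide, Zero, ISD::SETLT);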
bool isExtVecInRegOpcode(unsigned Opcode)
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
LLVM_ABI bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition code evaluates to true when its two operands are equal.
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is an EXTLOAD.
LLVM_ABI bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false, bool AllowTruncation=false)
Hook for matching ConstantSDNode predicate.
LLVM_ABI bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
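A small sketch of the ISD helper predicates and condition-code utilities above (N is an SDValue, VT its value type):
  bool IsPlainLoad = ISD::isNormalLoad(N.getNode());       // non-extending, unindexed
  ISD::CondCode CC     = ISD::SETLT;
  ISD::CondCode InvCC  = ISD::getSetCCInverse(CC, VT);      // !(X < Y)  ->  X >= Y
  ISD::CondCode SwapCC = ISD::getSetCCSwappedOperands(CC);  //  (X < Y)  ->  (Y > X)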
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinOpPred_match< LHS, RHS, is_bitwiselogic_op > m_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
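The IR-level PatternMatch helpers compose into declarative matchers; a minimal sketch that recognizes '(X & Y) == 0':
  #include "llvm/IR/InstrTypes.h"
  #include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Value.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Binds X and Y when V has the form (X & Y) == 0.
  bool matchMaskIsZero(Value *V, Value *&X, Value *&Y) {
    return match(V, m_SpecificICmp(CmpInst::ICMP_EQ,
                                   m_And(m_Value(X), m_Value(Y)), m_Zero()));
  }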
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Opcode_match m_Opc(unsigned Opcode)
BinaryOpc_match< LHS, RHS > m_Srl(const LHS &L, const RHS &R)
auto m_SpecificVT(EVT RefVT, const Pattern &P)
Match a specific ValueType.
TernaryOpc_match< LHS, RHS, IDX > m_InsertSubvector(const LHS &Base, const RHS &Sub, const IDX &Idx)
UnaryOpc_match< Opnd > m_Abs(const Opnd &Op)
Or< Preds... > m_AnyOf(const Preds &...preds)
And< Preds... > m_AllOf(const Preds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_SetCC(const T0_P &LHS, const T1_P &RHS, const T2_P &CC)
UnaryOpc_match< Opnd > m_AnyExt(const Opnd &Op)
auto m_Node(unsigned Opcode, const OpndPreds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_VSelect(const T0_P &Cond, const T1_P &T, const T2_P &F)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
CondCode_match m_SpecificCondCode(ISD::CondCode CC)
Match a conditional code SDNode with a specific ISD::CondCode.
auto m_SpecificVectorElementVT(EVT RefVT, const Pattern &P)
Match a vector ValueType.
CondCode_match m_CondCode()
Match any conditional code SDNode.
ConstantInt_match m_ConstInt()
Match any integer constants or splat of an integer constant.
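The SDPatternMatch counterparts (m_Opc, m_SetCC, sd_match, ...) do the same thing over SelectionDAG nodes. A hedged sketch, assuming a combine over an SDNode *N:
  #include "llvm/CodeGen/SDPatternMatch.h"
  using namespace llvm;
  using namespace llvm::SDPatternMatch;

  // Binds LHS/RHS when N is (setcc LHS, RHS, seteq).
  bool matchEqualitySetCC(SDNode *N, SDValue &LHS, SDValue &RHS) {
    return sd_match(N, m_SetCC(m_Value(LHS), m_Value(RHS),
                               m_SpecificCondCode(ISD::SETEQ)));
  }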
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
Invariant opcodes: All instruction sets have these as their low opcodes.
@ X86
Windows x64, Windows Itanium (IA-64)
Definition MCAsmInfo.h:50
@ PTR32_UPTR
Definition X86.h:217
@ PTR64
Definition X86.h:218
@ PTR32_SPTR
Definition X86.h:216
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FMAX
Floating point max and min.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and a FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of a MMX vector and zero out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of a MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
Define some predicates that are used for node matching.
@ AddrNumOperands
Definition X86BaseInfo.h:36
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
int getRoundingModeX86(unsigned RM)
Convert LLVM rounding mode to X86 rounding mode.
int getCCMPCondFlagsFromCondCode(CondCode CC)
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can fit into the displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
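A hedged sketch of how these X86 helper predicates are typically consulted during a combine (Op is an SDValue, Subtarget an X86Subtarget reference; illustrative only):
  APInt SplatVal;
  bool IsZeroSplat =
      X86::isConstantSplat(Op, SplatVal, /*AllowPartialUndefs=*/false) &&
      SplatVal.isZero();                                // every element is 0
  bool IsZero       = X86::isZeroNode(Op);              // constant 0 / +0.0
  bool FoldableLoad = X86::mayFoldLoad(Op, Subtarget);  // usable as a mem operand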
initializer< Ty > init(const Ty &Val)
constexpr double e
Definition MathExtras.h:47
@ User
could "use" a pointer
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:477
@ Length
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1727
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition Utils.cpp:1607
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2474
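The STLExtras range helpers referenced throughout this file take whole ranges rather than iterator pairs; a small sketch:
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Support/Debug.h"
  using namespace llvm;

  void dumpInBoundsMask(const SmallVectorImpl<int> &Mask) {
    // all_of / any_of / none_of wrap their std:: counterparts.
    if (!all_of(Mask, [](int M) { return M < 16; }))
      return;
    // enumerate pairs each element with its index.
    for (auto [Idx, M] : enumerate(Mask))
      dbgs() << Idx << " -> " << M << "\n";
  }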
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:361
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
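These Decode* helpers all turn an instruction encoding into a generic shuffle mask, where indices >= NumElts select from the second source. For example, given the signature above, the following should produce {0, 5, 6, 3} (illustrative, based on the documented BLEND semantics):
  SmallVector<int, 8> ShuffleMask;
  // Bit i of the immediate selects element i from the second source.
  DecodeBLENDMask(/*NumElts=*/4, /*Imm=*/0b0110, ShuffleMask);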
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle of packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
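A hedged sketch of these MachineInstr builder helpers, loading a frame slot into a register during custom insertion (MBB, MI, DL, TII, DestReg and FI assumed to be in scope; the opcode is chosen purely for illustration):
  // MOV32rm DestReg, [frame index FI]
  addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV32rm), DestReg), FI);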
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
auto unique(Range &&R, Predicate P)
Definition STLExtras.h:2078
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition Utils.cpp:1589
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:331
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1779
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:348
LLVM_ABI bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask, bool SrcIsMem)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, Register Reg)
Replace the address used in the instruction with the direct memory reference.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:186
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:754
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
LLVM_ABI bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
Definition bit.h:222
bool isAlpha(char C)
Checks if character C is a valid letter as classified by the "C" locale.
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
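Quick illustration of the MathExtras bit utilities above (values chosen by hand):
  #include "llvm/Support/MathExtras.h"
  using namespace llvm;

  static_assert(isPowerOf2_64(64), "64 is a power of two");
  static_assert(isInt<8>(127) && !isInt<8>(128), "i8 holds [-128, 127]");
  // Likewise, Log2_64(64) == 6, Log2_64_Ceil(65) == 7, and
  // Hi_32(0xAABBCCDD00112233) == 0xAABBCCDD.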
LLVM_ABI void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI SDValue peekThroughTruncates(SDValue V)
Return the non-truncated source operand of V if it exists.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1741
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
@ Default
-O2, -Os
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
@ Success
The lock was released successfully.
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
Definition ModRef.h:34
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:1996
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
To bit_cast(const From &from) noexcept
Definition bit.h:90
void replace(R &&Range, const T &OldValue, const T &NewValue)
Provide wrappers to std::replace which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1862
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
LLVM_ABI bool isNullConstantOrUndef(SDValue V)
Returns true if V is a constant integer zero or an UNDEF node.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
Definition STLExtras.h:1956
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from a machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
bool isPhysRegUsedAfter(Register Reg, MachineBasicBlock::iterator MBI)
Check if physical register Reg is used after MBI.
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
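The SDValue constant predicates scattered through this index (isConstOrConstSplat, isOneConstant, isAllOnesConstant, isNullConstantOrUndef, peekThroughOneUseBitcasts) are commonly combined as below. A minimal sketch only; the helper name isSplatOfOne is hypothetical.

#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

// Hypothetical predicate: does V (possibly hidden behind a one-use bitcast)
// splat the integer constant one?
static bool isSplatOfOne(SDValue V) {
  SDValue Src = peekThroughOneUseBitcasts(V);
  if (ConstantSDNode *C = isConstOrConstSplat(Src, /*AllowUndefs=*/true))
    return C->isOne();
  return false;
}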
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1837
constexpr unsigned BitWidth
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition STLExtras.h:1963
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
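Each Decode*Mask helper in this index appends element indices to a caller-provided SmallVectorImpl<int>, numbering the first source 0..NumElts-1 and the second source NumElts..2*NumElts-1. A minimal sketch, assuming an in-tree X86 backend build for the include path:

#include "llvm/ADT/SmallVector.h"
#include "MCTargetDesc/X86ShuffleDecode.h" // assumed in-tree include path

using namespace llvm;

// Decode a 4 x 32-bit unpcklps as a shuffle mask; the expected result is
// {0, 4, 1, 5} (interleave the low halves of both sources).
static SmallVector<int, 8> demoUnpcklMask() {
  SmallVector<int, 8> Mask;
  DecodeUNPCKLMask(/*NumElts=*/4, /*ScalarBits=*/32, Mask);
  return Mask;
}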
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1760
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, Register Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction – that is, a dereference of an address in a register, with no scale, index or displacement.
static uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1899
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
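A hedged example of commonAlignment(): the result is the largest alignment still guaranteed for a pointer of alignment A after advancing it by Offset bytes (the values below are arbitrary).

#include "llvm/Support/Alignment.h"
#include <cassert>

using namespace llvm;

static void demoCommonAlignment() {
  // A 16-byte aligned base offset by 8 bytes is only 8-byte aligned...
  assert(commonAlignment(Align(16), 8) == Align(8));
  // ...while offsetting by a multiple of the alignment preserves it.
  assert(commonAlignment(Align(16), 32) == Align(16));
}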
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2110
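The STLExtras range helpers referenced throughout this index (is_contained, count_if, find_if, all_equal, and friends) take whole ranges instead of iterator pairs. A minimal usage sketch with arbitrary values:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

using namespace llvm;

static void demoRangeHelpers() {
  SmallVector<int, 8> Mask = {0, 4, 1, 5};

  bool HasOne = is_contained(Mask, 1);                           // true
  auto NumLow = count_if(Mask, [](int M) { return M < 4; });     // 2
  auto *FirstHigh = find_if(Mask, [](int M) { return M >= 4; }); // points at 4
  bool Splat = all_equal({Mask[0], Mask[2]});                    // 0 != 1 -> false
  (void)HasOne; (void)NumLow; (void)FirstHigh; (void)Splat;
}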
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_CAST_MMX
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition STLExtras.h:1584
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:316
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, Register Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset], i.e., one with no scale or index, but with a displacement.
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
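A hedged sketch of the bit-manipulation helpers listed in this index (popcount, bit_floor, isUInt, Lo_32); the constants are arbitrary and only illustrate the expected results.

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

using namespace llvm;

static void demoBitHelpers() {
  assert(popcount(0xF0u) == 4);                         // four set bits
  assert(bit_floor(10u) == 8u);                         // largest power of two <= 10
  assert(isUInt<8>(255) && !isUInt<8>(256));            // fits / does not fit in 8 bits
  assert(Lo_32(UINT64_C(0x1234567800000042)) == 0x42u); // low 32 bits only
}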
@ SM_SentinelUndef
@ SM_SentinelZero
LLVM_ABI bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Attempt to narrow/widen the Mask shuffle mask to the NumDstElts target width.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
#define EQ(a, b)
Definition regexec.c:65
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:266
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static constexpr roundingMode rmTowardZero
Definition APFloat.h:308
static LLVM_ABI const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition APFloat.cpp:289
static LLVM_ABI const fltSemantics & IEEEquad() LLVM_READNONE
Definition APFloat.cpp:268
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition APFloat.cpp:324
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
Definition APFloat.cpp:267
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
static LLVM_ABI const fltSemantics & BFloat() LLVM_READNONE
Definition APFloat.cpp:265
opStatus
IEEE-754R 7: Default exception handling.
Definition APFloat.h:320
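The fltSemantics accessors above select the format an APFloat operates in. As a minimal sketch (illustrative only), converting a double-precision constant down to half precision reports whether the value changed:

#include "llvm/ADT/APFloat.h"

using namespace llvm;

static bool demoFPConversion() {
  APFloat Val(APFloat::IEEEdouble(), "3.14159265358979");
  bool LosesInfo = false;
  APFloat::opStatus Status = Val.convert(
      APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
  // A pi approximation is not exactly representable in IEEE half precision,
  // so the conversion is expected to be inexact and to lose information.
  return Status == APFloat::opInexact && LosesInfo;
}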
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type with the same bitwidth.
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition ValueTypes.h:217
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition ValueTypes.h:212
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
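A hedged sketch of the EVT queries listed above, using fixed-width vector types; the checks mirror what X86 lowering code typically asks of a value type.

#include "llvm/CodeGen/ValueTypes.h"
#include <cassert>

using namespace llvm;

static void demoEVTQueries() {
  // v4f32: a simple 128-bit vector of four floats.
  EVT VT = MVT::v4f32;
  assert(VT.isVector() && VT.is128BitVector() && VT.isFloatingPoint());
  assert(VT.getVectorNumElements() == 4);
  assert(VT.getScalarSizeInBits() == 32);
  assert(VT.getStoreSize().getFixedValue() == 16); // bytes written by a store

  // Same shape with integer elements: v4i32.
  EVT IntVT = VT.changeVectorElementTypeToInteger();
  assert(IntVT == MVT::v4i32);
}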
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
static LLVM_ABI KnownBits sadd_sat(const KnownBits &LHS, const KnownBits &RHS)
Compute knownbits resulting from llvm.sadd.sat(LHS, RHS)
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition KnownBits.h:186
static LLVM_ABI KnownBits mulhu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits from zero-extended multiply-hi.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:242
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned countMaxTrailingZeros() const
Returns the maximum number of trailing zero bits possible.
Definition KnownBits.h:274
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition KnownBits.h:289
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition KnownBits.h:86
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
bool isConstant() const
Returns true if we know the value of all bits.
Definition KnownBits.h:54
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition KnownBits.h:111
static LLVM_ABI KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition KnownBits.h:296
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition KnownBits.h:196
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:145
static LLVM_ABI KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition KnownBits.cpp:60
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition KnownBits.h:92
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
static LLVM_ABI std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
bool isAllOnes() const
Returns true if value is all one bits.
Definition KnownBits.h:83
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition KnownBits.h:60
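A hedged KnownBits sketch (arbitrary 8-bit values) showing how partial knowledge is created, queried, and widened with the members listed above:

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>

using namespace llvm;

static void demoKnownBits() {
  // A fully known 8-bit constant 0x0C.
  KnownBits C = KnownBits::makeConstant(APInt(8, 0x0C));
  assert(C.isConstant() && C.getConstant() == 0x0C);

  // An 8-bit value where only the top four bits are known to be zero.
  KnownBits Partial(8);
  Partial.Zero.setHighBits(4);
  assert(Partial.countMinLeadingZeros() == 4);
  assert(Partial.countMaxActiveBits() == 4); // value fits in the low 4 bits

  // zext fills the new high bits with known zeros: 8 new + 4 old = 12.
  KnownBits Wide = Partial.zext(16);
  assert(Wide.getBitWidth() == 16 && Wide.countMinLeadingZeros() == 12);
}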
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
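MachinePointerInfo records where a memory operand points; the factory functions above compose with getWithOffset() as in this hedged sketch (the helper name is hypothetical and the offset arbitrary):

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

using namespace llvm;

// Hypothetical helper: pointer info for the second 8-byte slot of the fixed
// stack object FI, as it might be attached to a load/store MachineMemOperand.
static MachinePointerInfo fixedStackSlotPlus8(MachineFunction &MF, int FI) {
  return MachinePointerInfo::getFixedStack(MF, FI, /*Offset=*/0)
      .getWithOffset(8);
}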
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
bool hasNoSignedZeros() const
void setNoSignedWrap(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
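The CallLoweringInfo setters above are designed to be chained when emitting a libcall. A hedged sketch, assuming the caller already has a SelectionDAG, a chain, and the argument/return IR types; the symbol name __example_libcall and the helper name are placeholders, not a real runtime call.

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Type.h"
#include <utility>

using namespace llvm;

// Hypothetical sketch of setting up a libcall through CallLoweringInfo.
static std::pair<SDValue, SDValue>
emitLibcallSketch(const TargetLowering &TLI, SelectionDAG &DAG, const SDLoc &dl,
                  SDValue Chain, SDValue Arg, Type *ArgTy, Type *RetTy) {
  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Args.push_back(Entry);

  SDValue Callee = DAG.getExternalSymbol(
      "__example_libcall", TLI.getPointerTy(DAG.getDataLayout()));

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
  // Returns {call result, output chain}.
  return TLI.LowerCallTo(CLI);
}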
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI bool recursivelyDeleteUnusedNodes(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
This structure is used to pass arguments to makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
X86AddressMode - This struct holds a generalized full x86 address mode.