1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
51#include "llvm/IR/IRBuilder.h"
53#include "llvm/IR/Intrinsics.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
60#include "llvm/Support/Debug.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
74 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
76 "Sets the preferable loop alignment for experiments (as log2 bytes) "
77 "for innermost loops only. If specified, this option overrides "
78 "alignment set by x86-experimental-pref-loop-alignment."),
80
82 "x86-br-merging-base-cost", cl::init(2),
84 "Sets the cost threshold for when multiple conditionals will be merged "
85 "into one branch versus be split in multiple branches. Merging "
86 "conditionals saves branches at the cost of additional instructions. "
87 "This value sets the instruction cost limit, below which conditionals "
88 "will be merged, and above which conditionals will be split. Set to -1 "
89 "to never merge branches."),
91
93 "x86-br-merging-ccmp-bias", cl::init(6),
94 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
95 "supports conditional compare instructions."),
97
98static cl::opt<bool>
99 WidenShift("x86-widen-shift", cl::init(true),
100 cl::desc("Replace narrow shifts with wider shifts."),
101 cl::Hidden);
102
104 "x86-br-merging-likely-bias", cl::init(0),
105 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if its known that a == b is "
108 "likely, then it is likely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to increase "
110 "the instruction cost threshold. Set to -1 to never merge likely "
111 "branches."),
112 cl::Hidden);
113
115 "x86-br-merging-unlikely-bias", cl::init(-1),
116 cl::desc(
117 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
118 "that all conditionals will be executed. For example for merging "
119 "the conditionals (a == b && c > d), if its known that a == b is "
120 "unlikely, then it is unlikely that if the conditionals are split "
121 "both sides will be executed, so it may be desirable to decrease "
122 "the instruction cost threshold. Set to -1 to never merge unlikely "
123 "branches."),
124 cl::Hidden);
125
127 "mul-constant-optimization", cl::init(true),
128 cl::desc("Replace 'mul x, Const' with more effective instructions like "
129 "SHIFT, LEA, etc."),
130 cl::Hidden);
131
132X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
133 const X86Subtarget &STI)
134 : TargetLowering(TM), Subtarget(STI) {
135 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
136 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
137
138 // Set up the TargetLowering object.
139
140 // X86 is weird. It always uses i8 for shift amounts and setcc results.
142 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
144
145 // X86 instruction cache is coherent with its data cache so we can use the
146 // default expansion to a no-op.
148
149 // For 64-bit, since we have so many registers, use the ILP scheduler.
150 // For 32-bit, use the register pressure specific scheduling.
151 // For Atom, always use ILP scheduling.
152 if (Subtarget.isAtom())
154 else if (Subtarget.is64Bit())
156 else
158 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
159 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
160
161 // Bypass expensive divides and use cheaper ones.
162 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
163 if (Subtarget.hasSlowDivide32())
164 addBypassSlowDiv(32, 8);
165 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
166 addBypassSlowDiv(64, 32);
167 }
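 // Illustrative sketch (not in the upstream file): addBypassSlowDiv(32, 8)
 // above lets the divide-bypass transform guard a 32-bit divide with a cheap
 // runtime check, roughly:
 //   if ((dividend | divisor) has no bits set above bit 7)
 //     use the short 8-bit DIV
 //   else
 //     use the full 32-bit DIV
 // so small-value divides avoid the slow wide divider on these subtargets.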
168
169 if (Subtarget.canUseCMPXCHG16B())
171 else if (Subtarget.canUseCMPXCHG8B())
173 else
175
176 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
177
179
180 // Set up the register classes.
181 addRegisterClass(MVT::i8, &X86::GR8RegClass);
182 addRegisterClass(MVT::i16, &X86::GR16RegClass);
183 addRegisterClass(MVT::i32, &X86::GR32RegClass);
184 if (Subtarget.is64Bit())
185 addRegisterClass(MVT::i64, &X86::GR64RegClass);
186
187 for (MVT VT : MVT::integer_valuetypes())
189
190 // We don't accept any truncstore of integer registers.
191 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
192 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
193 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
194 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
195 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
196 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
197
198 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
199
200 // SETOEQ and SETUNE require checking two conditions.
201 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
204 }
205
206 // Integer absolute.
207 if (Subtarget.canUseCMOV()) {
208 setOperationAction(ISD::ABS , MVT::i16 , Custom);
209 setOperationAction(ISD::ABS , MVT::i32 , Custom);
210 if (Subtarget.is64Bit())
211 setOperationAction(ISD::ABS , MVT::i64 , Custom);
212 }
213
214 // Absolute difference.
215 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
216 setOperationAction(Op , MVT::i8 , Custom);
217 setOperationAction(Op , MVT::i16 , Custom);
218 setOperationAction(Op , MVT::i32 , Custom);
219 if (Subtarget.is64Bit())
220 setOperationAction(Op , MVT::i64 , Custom);
221 }
222
223 // Signed saturation subtraction.
227 if (Subtarget.is64Bit())
229
230 // Funnel shifts.
231 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
232 // For slow shld targets we only lower for code size.
233 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
234
235 setOperationAction(ShiftOp , MVT::i8 , Custom);
236 setOperationAction(ShiftOp , MVT::i16 , Custom);
237 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
238 if (Subtarget.is64Bit())
239 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
240 }
241
242 if (!Subtarget.useSoftFloat()) {
243 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
244 // operation.
249 // We have an algorithm for SSE2, and we turn this into a 64-bit
250 // FILD or VCVTUSI2SS/SD for other targets.
253 // We have an algorithm for SSE2->double, and we turn this into a
254 // 64-bit FILD followed by conditional FADD for other targets.
257
258 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
259 // this operation.
262 // SSE has no i16 to fp conversion, only i32. We promote in the handler
263 // to allow f80 to use i16 and f64 to use i16 with sse1 only
266 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
269 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
270 // are Legal, f80 is custom lowered.
273
274 // Promote i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
275 // this operation.
277 // FIXME: This doesn't generate invalid exception when it should. PR44019.
283 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
284 // are Legal, f80 is custom lowered.
287
288 // Handle FP_TO_UINT by promoting the destination to a larger signed
289 // conversion.
291 // FIXME: This doesn't generate invalid exception when it should. PR44019.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300
301 setOperationAction(ISD::LRINT, MVT::f32, Custom);
302 setOperationAction(ISD::LRINT, MVT::f64, Custom);
303 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
304 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
305
306 if (!Subtarget.is64Bit()) {
307 setOperationAction(ISD::LRINT, MVT::i64, Custom);
308 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
309 }
310 }
311
312 if (Subtarget.hasSSE2()) {
313 // Custom lowering for saturating float to int conversions.
314 // We handle promotion to larger result types manually.
315 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
318 }
321 if (Subtarget.is64Bit()) {
324 }
325 }
326 if (Subtarget.hasAVX10_2()) {
331 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
332 MVT::v4i64}) {
335 }
336 if (Subtarget.is64Bit()) {
339 }
340 }
341
342 // Handle address space casts between mixed sized pointers.
343 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
344 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
345
346 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
347 if (!Subtarget.hasSSE2()) {
348 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
349 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
352 if (Subtarget.is64Bit()) {
353 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
354 // Without SSE, i64->f64 goes through memory.
355 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
356 }
357 } else if (!Subtarget.is64Bit())
358 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
359
360 // Scalar integer divide and remainder are lowered to use operations that
361 // produce two results, to match the available instructions. This exposes
362 // the two-result form to trivial CSE, which is able to combine x/y and x%y
363 // into a single instruction.
364 //
365 // Scalar integer multiply-high is also lowered to use two-result
366 // operations, to match the available instructions. However, plain multiply
367 // (low) operations are left as Legal, as there are single-result
368 // instructions for this in x86. Using the two-result multiply instructions
369 // when both high and low results are needed must be arranged by dagcombine.
370 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
377 }
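 // Illustrative sketch (not in the upstream file): because x/y and x%y both
 // lower to the shared two-result node, IR such as
 //   %q = sdiv i32 %x, %y
 //   %r = srem i32 %x, %y
 // CSEs into a single divide and selects to one IDIV, with the quotient in
 // EAX and the remainder in EDX.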
378
379 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
380 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
381 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
382 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
383 setOperationAction(ISD::BR_CC, VT, Expand);
385 }
386 if (Subtarget.is64Bit())
391
392 setOperationAction(ISD::FREM , MVT::f32 , Expand);
393 setOperationAction(ISD::FREM , MVT::f64 , Expand);
394 setOperationAction(ISD::FREM , MVT::f80 , Expand);
395 setOperationAction(ISD::FREM , MVT::f128 , Expand);
396
397 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
399 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
400 setOperationAction(ISD::GET_FPENV_MEM , MVT::Other, Custom);
401 setOperationAction(ISD::SET_FPENV_MEM , MVT::Other, Custom);
402 setOperationAction(ISD::RESET_FPENV , MVT::Other, Custom);
403 }
404
405 // Promote the i8 variants and force them on up to i32 which has a shorter
406 // encoding.
407 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
409 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
410 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
411 // promote that too.
412 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
414
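 // Illustrative sketch (not in the upstream file): with the promotion above,
 //   %r = call i16 @llvm.cttz.i16(i16 %x, i1 false)
 // is widened and computed with a 32-bit TZCNT / REP BSF (the legalizer fixes
 // up the zero-input case), avoiding the 16-bit form and its false dependency.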
415 if (!Subtarget.hasBMI()) {
416 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
418 if (Subtarget.is64Bit()) {
419 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
421 }
422 }
423
424 if (Subtarget.hasLZCNT()) {
425 // When promoting the i8 variants, force them to i32 for a shorter
426 // encoding.
427 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
429 } else {
430 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
431 if (VT == MVT::i64 && !Subtarget.is64Bit())
432 continue;
435 }
436 }
437
438 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
439 ISD::STRICT_FP_TO_FP16}) {
440 // Special handling for half-precision floating point conversions.
441 // If we don't have F16C support, then lower half float conversions
442 // into library calls.
444 Op, MVT::f32,
445 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
446 // There's never any support for operations beyond MVT::f32.
447 setOperationAction(Op, MVT::f64, Expand);
448 setOperationAction(Op, MVT::f80, Expand);
449 setOperationAction(Op, MVT::f128, Expand);
450 }
451
452 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
453 setOperationAction(ISD::STRICT_FP_TO_BF16, VT, Expand);
454 setOperationAction(ISD::STRICT_BF16_TO_FP, VT, Expand);
455 }
456
457 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
458 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
459 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
460 setTruncStoreAction(VT, MVT::f16, Expand);
461 setTruncStoreAction(VT, MVT::bf16, Expand);
462
463 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
464 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
465 }
466
470 if (Subtarget.is64Bit())
472 if (Subtarget.hasPOPCNT()) {
473 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
474 // popcntw is longer to encode than popcntl and also has a false dependency
475 // on the dest that popcntl hasn't had since Cannon Lake.
476 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
477 } else {
482 }
483
484 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
485
486 if (!Subtarget.hasMOVBE())
488
489 // X86 wants to expand cmov itself.
490 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
495 }
496 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
497 if (VT == MVT::i64 && !Subtarget.is64Bit())
498 continue;
501 }
502
503 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
506
508 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
509 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
513
514 // Darwin ABI issue.
515 for (auto VT : { MVT::i32, MVT::i64 }) {
516 if (VT == MVT::i64 && !Subtarget.is64Bit())
517 continue;
524 }
525
526 // 64-bit shl, sra, srl (iff 32-bit x86)
527 for (auto VT : { MVT::i32, MVT::i64 }) {
528 if (VT == MVT::i64 && !Subtarget.is64Bit())
529 continue;
533 }
534
535 if (Subtarget.hasSSEPrefetch())
536 setOperationAction(ISD::PREFETCH , MVT::Other, Custom);
537
538 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
539
540 // Expand certain atomics
541 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
542 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
543 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
544 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
545 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
546 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
547 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
548 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
549 }
550
551 if (!Subtarget.is64Bit())
552 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
553
554 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
555 // All CPUs supporting AVX will atomically load/store aligned 128-bit
556 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
557 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
558 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
559 }
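 // Illustrative sketch (not in the upstream file): on these CPUs an aligned
 // 16-byte atomic access such as
 //   %v = load atomic i128, ptr %p acquire, align 16
 // becomes a single [V]MOVAPS/[V]MOVDQA instead of going through CMPXCHG16B.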
560
561 if (Subtarget.canUseCMPXCHG16B())
562 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
563
564 // FIXME - use subtarget debug flags
565 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
566 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
567 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
568 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
569 }
570
573
574 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
575 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
576
577 setOperationAction(ISD::TRAP, MVT::Other, Legal);
578 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
579 if (Subtarget.isTargetPS())
580 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
581 else
582 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
583
584 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
585 setOperationAction(ISD::VASTART , MVT::Other, Custom);
586 setOperationAction(ISD::VAEND , MVT::Other, Expand);
587 bool Is64Bit = Subtarget.is64Bit();
588 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
589 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
590
591 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
592 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
593
594 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
595
596 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
597 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
598 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
599
601
602 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
603 setOperationAction(ISD::FABS, VT, Action);
604 setOperationAction(ISD::FNEG, VT, Action);
606 setOperationAction(ISD::FREM, VT, Action);
607 setOperationAction(ISD::FMA, VT, Action);
608 setOperationAction(ISD::FMINNUM, VT, Action);
609 setOperationAction(ISD::FMAXNUM, VT, Action);
610 setOperationAction(ISD::FMINIMUM, VT, Action);
611 setOperationAction(ISD::FMAXIMUM, VT, Action);
612 setOperationAction(ISD::FMINIMUMNUM, VT, Action);
613 setOperationAction(ISD::FMAXIMUMNUM, VT, Action);
614 setOperationAction(ISD::FSIN, VT, Action);
615 setOperationAction(ISD::FCOS, VT, Action);
616 setOperationAction(ISD::FSINCOS, VT, Action);
617 setOperationAction(ISD::FTAN, VT, Action);
618 setOperationAction(ISD::FSQRT, VT, Action);
619 setOperationAction(ISD::FPOW, VT, Action);
620 setOperationAction(ISD::FPOWI, VT, Action);
621 setOperationAction(ISD::FLOG, VT, Action);
622 setOperationAction(ISD::FLOG2, VT, Action);
623 setOperationAction(ISD::FLOG10, VT, Action);
624 setOperationAction(ISD::FEXP, VT, Action);
625 setOperationAction(ISD::FEXP2, VT, Action);
626 setOperationAction(ISD::FEXP10, VT, Action);
627 setOperationAction(ISD::FCEIL, VT, Action);
628 setOperationAction(ISD::FFLOOR, VT, Action);
629 setOperationAction(ISD::FNEARBYINT, VT, Action);
630 setOperationAction(ISD::FRINT, VT, Action);
631 setOperationAction(ISD::BR_CC, VT, Action);
632 setOperationAction(ISD::SETCC, VT, Action);
635 setOperationAction(ISD::FROUND, VT, Action);
636 setOperationAction(ISD::FROUNDEVEN, VT, Action);
637 setOperationAction(ISD::FTRUNC, VT, Action);
638 setOperationAction(ISD::FLDEXP, VT, Action);
639 };
640
641 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
642 // f16, f32 and f64 use SSE.
643 // Set up the FP register classes.
644 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
645 : &X86::FR16RegClass);
646 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
647 : &X86::FR32RegClass);
648 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
649 : &X86::FR64RegClass);
650
651 // Disable f32->f64 extload as we can only generate this in one instruction
652 // under optsize. So it's easier to pattern match (fpext (load)) for that
653 // case instead of needing to emit 2 instructions for extload in the
654 // non-optsize case.
655 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
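 // Illustrative sketch (not in the upstream file): with the extending load
 // disabled, the pattern
 //   (f64 (fpext (load f32 addr)))
 // stays visible to isel, which folds it into a single memory-operand
 // CVTSS2SD under optsize and otherwise emits the load and the conversion
 // separately.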
656
657 for (auto VT : { MVT::f32, MVT::f64 }) {
658 // Use ANDPD to simulate FABS.
659 setOperationAction(ISD::FABS, VT, Custom);
660
661 // Use XORP to simulate FNEG.
662 setOperationAction(ISD::FNEG, VT, Custom);
663
664 // Use ANDPD and ORPD to simulate FCOPYSIGN.
666
667 // These might be better off as horizontal vector ops.
670
671 // We don't support sin/cos/fmod
672 setOperationAction(ISD::FSIN , VT, Expand);
673 setOperationAction(ISD::FCOS , VT, Expand);
674 setOperationAction(ISD::FSINCOS, VT, Expand);
675 }
676
677 // Half type will be promoted by default.
678 setF16Action(MVT::f16, Promote);
683 setOperationAction(ISD::FABS, MVT::f16, Custom);
684 setOperationAction(ISD::FNEG, MVT::f16, Custom);
687 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
688 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
689
720 setOperationAction(ISD::LRINT, MVT::f16, Expand);
721 setOperationAction(ISD::LLRINT, MVT::f16, Expand);
722
723 // Lower this to MOVMSK plus an AND.
726
727 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
728 (UseX87 || Is64Bit)) {
729 // Use SSE for f32, x87 for f64.
730 // Set up the FP register classes.
731 addRegisterClass(MVT::f32, &X86::FR32RegClass);
732 if (UseX87)
733 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
734
735 // Use ANDPS to simulate FABS.
736 setOperationAction(ISD::FABS , MVT::f32, Custom);
737
738 // Use XORP to simulate FNEG.
739 setOperationAction(ISD::FNEG , MVT::f32, Custom);
740
741 if (UseX87)
743
744 // Use ANDPS and ORPS to simulate FCOPYSIGN.
745 if (UseX87)
748
749 // We don't support sin/cos/fmod
750 setOperationAction(ISD::FSIN , MVT::f32, Expand);
751 setOperationAction(ISD::FCOS , MVT::f32, Expand);
752 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
753
754 if (UseX87) {
755 // Always expand sin/cos functions even though x87 has an instruction.
756 setOperationAction(ISD::FSIN, MVT::f64, Expand);
757 setOperationAction(ISD::FCOS, MVT::f64, Expand);
758 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
759 }
760 } else if (UseX87) {
761 // f32 and f64 in x87.
762 // Set up the FP register classes.
763 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
764 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
765
766 for (auto VT : { MVT::f32, MVT::f64 }) {
769
770 // Always expand sin/cos functions even though x87 has an instruction.
771 setOperationAction(ISD::FSIN , VT, Expand);
772 setOperationAction(ISD::FCOS , VT, Expand);
773 setOperationAction(ISD::FSINCOS, VT, Expand);
774 }
775 }
776
777 // Expand FP32 immediates into loads from the stack, save special cases.
778 if (isTypeLegal(MVT::f32)) {
779 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
780 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
781 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
782 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
783 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
784 } else // SSE immediates.
785 addLegalFPImmediate(APFloat(+0.0f)); // xorps
786 }
787 // Expand FP64 immediates into loads from the stack, save special cases.
788 if (isTypeLegal(MVT::f64)) {
789 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
790 addLegalFPImmediate(APFloat(+0.0)); // FLD0
791 addLegalFPImmediate(APFloat(+1.0)); // FLD1
792 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
793 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
794 } else // SSE immediates.
795 addLegalFPImmediate(APFloat(+0.0)); // xorpd
796 }
797 // Support fp16 0 immediate.
798 if (isTypeLegal(MVT::f16))
799 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
800
801 // Handle constrained floating-point operations of scalar.
814
815 // We don't support FMA.
818
819 // f80 always uses X87.
820 if (UseX87) {
821 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
824 {
825 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
826 addLegalFPImmediate(TmpFlt); // FLD0
827 TmpFlt.changeSign();
828 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
829
830 bool ignored;
831 APFloat TmpFlt2(+1.0);
833 &ignored);
834 addLegalFPImmediate(TmpFlt2); // FLD1
835 TmpFlt2.changeSign();
836 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
837 }
838
839 // Always expand sin/cos functions even though x87 has an instruction.
840 // clang-format off
841 setOperationAction(ISD::FSIN , MVT::f80, Expand);
842 setOperationAction(ISD::FCOS , MVT::f80, Expand);
843 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
844 setOperationAction(ISD::FTAN , MVT::f80, Expand);
845 setOperationAction(ISD::FASIN , MVT::f80, Expand);
846 setOperationAction(ISD::FACOS , MVT::f80, Expand);
847 setOperationAction(ISD::FATAN , MVT::f80, Expand);
848 setOperationAction(ISD::FATAN2 , MVT::f80, Expand);
849 setOperationAction(ISD::FSINH , MVT::f80, Expand);
850 setOperationAction(ISD::FCOSH , MVT::f80, Expand);
851 setOperationAction(ISD::FTANH , MVT::f80, Expand);
852 // clang-format on
853
854 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
855 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
856 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
857 setOperationAction(ISD::FRINT, MVT::f80, Expand);
858 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
859 setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand);
861 setOperationAction(ISD::LROUND, MVT::f80, LibCall);
862 setOperationAction(ISD::LLROUND, MVT::f80, LibCall);
863 setOperationAction(ISD::LRINT, MVT::f80, Custom);
864 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
865
866 // Handle constrained floating-point operations of scalar.
873 if (isTypeLegal(MVT::f16)) {
874 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
876 } else {
878 }
879 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
880 // as Custom.
882 }
883
884 // f128 uses xmm registers, but most operations require libcalls.
885 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
886 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
887 : &X86::VR128RegClass);
888
889 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
890
901
902 setOperationAction(ISD::FABS, MVT::f128, Custom);
903 setOperationAction(ISD::FNEG, MVT::f128, Custom);
905
906 // clang-format off
907 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
909 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
911 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
912 setOperationAction(ISD::FTAN, MVT::f128, LibCall);
914 // clang-format on
915 // No STRICT_FSINCOS
916 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
918
919 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
921 // We need to custom handle any FP_ROUND with an f128 input, but
922 // LegalizeDAG uses the result type to know when to run a custom handler.
923 // So we have to list all legal floating point result types here.
924 if (isTypeLegal(MVT::f32)) {
927 }
928 if (isTypeLegal(MVT::f64)) {
931 }
932 if (isTypeLegal(MVT::f80)) {
936 }
937
939
940 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
941 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
942 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
943 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
944 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
945 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
946 }
947
948 // Always use a library call for pow.
949 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
950 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
951 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
952 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
953
954 setOperationAction(ISD::FLOG, MVT::f80, Expand);
955 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
956 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
957 setOperationAction(ISD::FEXP, MVT::f80, Expand);
958 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
959 setOperationAction(ISD::FEXP10, MVT::f80, Expand);
960 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
961 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
962
963 // Some FP actions are always expanded for vector types.
964 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
965 MVT::v4f32, MVT::v8f32, MVT::v16f32,
966 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
967 // clang-format off
968 setOperationAction(ISD::FSIN, VT, Expand);
969 setOperationAction(ISD::FSINCOS, VT, Expand);
970 setOperationAction(ISD::FCOS, VT, Expand);
971 setOperationAction(ISD::FTAN, VT, Expand);
974 setOperationAction(ISD::FPOW, VT, Expand);
975 setOperationAction(ISD::FLOG, VT, Expand);
976 setOperationAction(ISD::FLOG2, VT, Expand);
977 setOperationAction(ISD::FLOG10, VT, Expand);
978 setOperationAction(ISD::FEXP, VT, Expand);
979 setOperationAction(ISD::FEXP2, VT, Expand);
980 setOperationAction(ISD::FEXP10, VT, Expand);
981 // clang-format on
982 }
983
984 // First set operation action for all vector types to either promote
985 // (for widening) or expand (for scalarization). Then we will selectively
986 // turn on ones that can be effectively codegen'd.
997 setOperationAction(ISD::FFLOOR, VT, Expand);
998 setOperationAction(ISD::FCEIL, VT, Expand);
999 setOperationAction(ISD::FTRUNC, VT, Expand);
1000 setOperationAction(ISD::FRINT, VT, Expand);
1001 setOperationAction(ISD::FNEARBYINT, VT, Expand);
1002 setOperationAction(ISD::FROUNDEVEN, VT, Expand);
1026 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1027 setTruncStoreAction(InnerVT, VT, Expand);
1028
1029 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1030 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1031
1032 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1033 // types, we have to deal with them whether we ask for Expansion or not.
1034 // Setting Expand causes its own optimisation problems though, so leave
1035 // them legal.
1036 if (VT.getVectorElementType() == MVT::i1)
1037 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1038
1039 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1040 // split/scalarized right now.
1041 if (VT.getVectorElementType() == MVT::f16 ||
1042 VT.getVectorElementType() == MVT::bf16)
1043 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1044 }
1045 }
1046
1047 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1048 // with -msoft-float, disable use of MMX as well.
1049 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1050 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1051 // No operations on x86mmx supported, everything uses intrinsics.
1052 }
1053
1054 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1055 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1056 : &X86::VR128RegClass);
1057
1058 setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
1059 setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
1060 setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
1061 setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
1062
1063 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1064 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1072
1073 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1074 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1076
1082 }
1083
1084 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1085 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1086 : &X86::VR128RegClass);
1087
1088 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1089 // registers cannot be used even for integer operations.
1090 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1091 : &X86::VR128RegClass);
1092 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1093 : &X86::VR128RegClass);
1094 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1095 : &X86::VR128RegClass);
1096 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1097 : &X86::VR128RegClass);
1098 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1099 : &X86::VR128RegClass);
1100
1101 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1102 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1103 setOperationAction(ISD::FMINIMUM, VT, Custom);
1104 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1105 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1106 }
1107
1108 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1109 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1114 }
1115
1116 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1117 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1118 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1119
1120 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1121 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1122 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1123 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1124 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1125 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1126 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1127 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1128 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1129 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1132
1133 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1134 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1135 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1136
1137 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1139 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1141
1142 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1143 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1144
1145 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1146 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1147 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1148 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1149 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1150 }
1151
1162
1167
1168 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1174
1175 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1176 // setcc all the way to isel and prefer SETGT in some isel patterns.
1179 }
1180
1181 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1182 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1187
1188 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1194 }
1195
1196 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1200
1201 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1202 continue;
1203
1206 }
1207 setF16Action(MVT::v8f16, Expand);
1208 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1209 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1210 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1211 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1212 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1213 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1215
1216 // Custom lower v2i64 and v2f64 selects.
1223
1230
1231 // Custom legalize these to avoid over promotion or custom promotion.
1232 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1237 }
1238
1243
1246
1249
1250 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1255
1256 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1260
1261 // We want to legalize this to an f64 load rather than an i64 load on
1262 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1263 // store.
1264 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1265 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1266 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1267 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1268 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1269 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1270
1271 // Add 32-bit vector stores to help vectorization opportunities.
1272 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1273 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1274
1275 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1276 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1277 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1278 if (!Subtarget.hasAVX512())
1279 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1280
1284
1286
1303
1304 // In the customized shift lowering, the legal v4i32/v2i64 cases
1305 // in AVX2 will be recognized.
1306 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1310 if (VT == MVT::v2i64) continue;
1315 }
1316
1322 }
1323
1324 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1329
1330 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1332 }
1333 }
1334
1335 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1336 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1337 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1338 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1339
1340 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1343 }
1344
1345 // These might be better off as horizontal vector ops.
1350 }
1351
1352 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1353 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1354 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1356 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1358 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1360 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1362 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1364 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1366
1367 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1368 }
1369
1370 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1371 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1372 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1373 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1374 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1375 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1376 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1377 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1378
1382
1383 // FIXME: Do we need to handle scalar-to-vector here?
1384 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1385 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1386
1387 // We directly match byte blends in the backend as they match the VSELECT
1388 // condition form.
1390
1391 // SSE41 brings specific instructions for doing vector sign extend even in
1392 // cases where we don't have SRA.
1393 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1396 }
1397
1398 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1399 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1400 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1401 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1402 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1403 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1404 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1405 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1406 }
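 // Illustrative sketch (not in the upstream file): making these combinations
 // Legal lets a sign-extending load such as
 //   (v4i32 (sextload v4i8 addr))
 // select directly to PMOVSXBD with a memory operand instead of a scalar
 // load plus shuffle/extend sequence.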
1407
1408 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1409 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1410 // do the pre and post work in the vector domain.
1413 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1414 // so that DAG combine doesn't try to turn it into uint_to_fp.
1417 }
1418 }
1419
1420 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1422 }
1423
1424 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1425 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1426 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1429 }
1430
1431 // XOP can efficiently perform BITREVERSE with VPPERM.
1432 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1434 }
1435
1436 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1437 bool HasInt256 = Subtarget.hasInt256();
1438
1439 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1440 : &X86::VR256RegClass);
1441 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1442 : &X86::VR256RegClass);
1443 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1444 : &X86::VR256RegClass);
1445 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1446 : &X86::VR256RegClass);
1447 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1448 : &X86::VR256RegClass);
1449 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1450 : &X86::VR256RegClass);
1451 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1452 : &X86::VR256RegClass);
1453
1454 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1455 setOperationAction(ISD::FFLOOR, VT, Legal);
1457 setOperationAction(ISD::FCEIL, VT, Legal);
1459 setOperationAction(ISD::FTRUNC, VT, Legal);
1461 setOperationAction(ISD::FRINT, VT, Legal);
1463 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1465 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1467
1468 setOperationAction(ISD::FROUND, VT, Custom);
1469
1470 setOperationAction(ISD::FNEG, VT, Custom);
1471 setOperationAction(ISD::FABS, VT, Custom);
1473
1474 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1475 setOperationAction(ISD::FMINIMUM, VT, Custom);
1476 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1477 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1479 }
1480
1481 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1482 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1483
1484 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1485 // even though v8i16 is a legal type.
1486 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1487 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1488 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1489 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1493
1496 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1498 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1500
1512
1513 if (!Subtarget.hasAVX512())
1514 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1515
1516 // In the customized shift lowering, the legal v8i32/v4i64 cases
1517 // in AVX2 will be recognized.
1518 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1524 if (VT == MVT::v4i64) continue;
1529 }
1530
1531 // These types need custom splitting if their input is a 128-bit vector.
1536
1540 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1541 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1544
1545 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1549 }
1550
1555
1556 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1561
1562 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1563 // setcc all the way to isel and prefer SETGT in some isel patterns.
1566 }
1567
1568 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1569 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1574
1575 if (Subtarget.hasAnyFMA()) {
1576 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1577 MVT::v2f64, MVT::v4f64 }) {
1580 }
1581 }
1582
1583 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1584 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1585 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1586 }
1587
1588 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1589 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1590 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1591 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1592
1593 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1594 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1595 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1596 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1597 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1598 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1599 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1600 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1601
1602 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1603 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1604
1605 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1606 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1607 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1608 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1609 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1610
1611 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1612 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1613 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1614 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1615 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1616 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1617 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1618 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1623
1624 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1625 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1626 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1628 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1629 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1630 }
1631
1632 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1635 }
1636
1637 if (HasInt256) {
1638 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1639 // when we have a 256-bit-wide blend with immediate.
1642
1643 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1644 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1645 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1646 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1647 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1648 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1649 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1650 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1651 }
1652 }
1653
1654 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1655 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1656 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1657 setOperationAction(ISD::MSTORE, VT, Legal);
1658 }
1659
1660 // Extract subvector is special because the value type
1661 // (result) is 128-bit but the source is 256-bit wide.
1662 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1663 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1665 }
1666
1667 // Custom lower several nodes for 256-bit types.
1668 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1669 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1678 setOperationAction(ISD::STORE, VT, Custom);
1679 }
1680 setF16Action(MVT::v16f16, Expand);
1681 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1682 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1684 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1685 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1686 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1687 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1688
1689 if (HasInt256) {
1691
1692 // Custom legalize 2x32 to get a little better code.
1693 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1694 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1695
1696 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1697 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1698 setOperationAction(ISD::MGATHER, VT, Custom);
1699 }
1700 }
1701
1702 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1703 Subtarget.hasF16C()) {
1704 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1707 }
1708 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1709 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1711 }
1712 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1713 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1714 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1715 }
1716 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1717 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1718 }
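 // Illustrative sketch (not in the upstream file): with the arithmetic ops
 // promoted, an fadd on v8f16 is performed as
 //   vcvtph2ps  ; widen to v8f32
 //   vaddps     ; do the math in single precision
 //   vcvtps2ph  ; round back to v8f16
 // since F16C only provides the conversions, not native half arithmetic.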
1719
1720 // This block controls legalization of the mask vector sizes that are
1721 // available with AVX512. 512-bit vectors are in a separate block controlled
1722 // by useAVX512Regs.
1723 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1724 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1725 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1726 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1727 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1728 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1729
1733
1734 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1735 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1736 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1737 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1738 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1739 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1740 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1741 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1749
1750 // There is no byte sized k-register load or store without AVX512DQ.
1751 if (!Subtarget.hasDQI()) {
1752 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1753 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1754 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1755 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1756
1757 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1758 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1759 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1760 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1761 }
1762
1763 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1764 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1768 }
1769
1770 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1772
1773 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1777
1784 }
1785
1786 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1788 }
1789 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1790 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1791 setOperationAction(ISD::LRINT, VT, Legal);
1792 setOperationAction(ISD::LLRINT, VT, Legal);
1793 }
1794 }
1795
1796 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1797 // elements. 512-bits can be disabled based on prefer-vector-width and
1798 // required-vector-width function attributes.
1799 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1800 bool HasBWI = Subtarget.hasBWI();
1801
1802 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1803 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1804 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1805 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1806 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1807 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1808 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1809
1810 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1811 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1812 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1813 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1814 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1815 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1816 if (HasBWI)
1817 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1818 }
1819
1820 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1821 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1822 setOperationAction(ISD::FMINIMUM, VT, Custom);
1823 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1824 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1825 setOperationAction(ISD::FNEG, VT, Custom);
1826 setOperationAction(ISD::FABS, VT, Custom);
1831 }
1832 setOperationAction(ISD::LRINT, MVT::v16f32,
1833 Subtarget.hasDQI() ? Legal : Custom);
1834 setOperationAction(ISD::LRINT, MVT::v8f64,
1835 Subtarget.hasDQI() ? Legal : Custom);
1836 if (Subtarget.hasDQI())
1837 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1838
1839 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1844 }
1845
1846 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1851 }
1852
1857 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1859
1871
1872 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1873 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1874 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1875 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1876 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1877 if (HasBWI)
1878 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1879
1880 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1881 // to 512-bit rather than use the AVX2 instructions so that we can use
1882 // k-masks.
1883 if (!Subtarget.hasVLX()) {
1884 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1885 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1886 setOperationAction(ISD::MLOAD, VT, Custom);
1887 setOperationAction(ISD::MSTORE, VT, Custom);
1888 }
1889 }
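 // Illustrative sketch (not in the upstream file): e.g. a masked load of
 // v8f32 is widened to a 512-bit load under a k-register mask whose upper
 // eight bits are clear, rather than using the AVX2 VMASKMOVPS form.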
1890
1892 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1893 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1903
1904 if (HasBWI) {
1905 // Extends from v64i1 masks to 512-bit vectors.
1909 }
1910
1911 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1912 setOperationAction(ISD::FFLOOR, VT, Legal);
1914 setOperationAction(ISD::FCEIL, VT, Legal);
1916 setOperationAction(ISD::FTRUNC, VT, Legal);
1918 setOperationAction(ISD::FRINT, VT, Legal);
1920 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1922 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1924
1925 setOperationAction(ISD::FROUND, VT, Custom);
1926 }
1927
1928 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1931 }
1932
1933 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1934 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1935 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1936 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1937
1938 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1939 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1940 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1941 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1942
1943 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1944 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1945 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1946 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1947 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1948 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1949 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1950 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1951
1952 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1953 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1954
1955 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1965
1966 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1967 // setcc all the way to isel and prefer SETGT in some isel patterns.
1970 }
1971
1972 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1973 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1978
1979 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1986 }
1987
1988 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1989 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1990 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1992 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1993 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1994 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1995 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2000 }
2001
2002 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2003 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2004 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2005 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2006 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2007 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2008
2009 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2013 setOperationAction(Opc, MVT::v8i64, Custom);
2014
2015 if (Subtarget.hasDQI())
2016 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2017
2018 if (Subtarget.hasCDI()) {
2020      // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
2020 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2022 }
2023 } // Subtarget.hasCDI()
2024
2025 if (Subtarget.hasVPOPCNTDQ()) {
2026 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2028 }
2029
2030 // Extract subvector is special because the value type
2031 // (result) is 256-bit but the source is 512-bit wide.
2032 // 128-bit was made Legal under AVX1.
2033 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2034 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2036
2037 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2038 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2048 }
2049 setF16Action(MVT::v32f16, Expand);
2052 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2054 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2055 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2056 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2057
2058 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2059 setOperationAction(ISD::MLOAD, VT, Legal);
2060 setOperationAction(ISD::MSTORE, VT, Legal);
2061 setOperationAction(ISD::MGATHER, VT, Custom);
2062 setOperationAction(ISD::MSCATTER, VT, Custom);
2063 }
2064 if (HasBWI) {
2065 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2066 setOperationAction(ISD::MLOAD, VT, Legal);
2067 setOperationAction(ISD::MSTORE, VT, Legal);
2068 }
2069 } else {
2070 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2071 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2072 }
2073
2074 if (Subtarget.hasVBMI2()) {
2075 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2078 }
2079
2080 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2081 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2082 }
2083
2084 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2085 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2087 }// useAVX512Regs
2088
2089 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2090 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2091 MVT::v4i64}) {
2094 }
2095 }
2096
2097 // This block controls legalization for operations that don't have
2098 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2099 // narrower widths.
2100 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2101 // These operations are handled on non-VLX by artificially widening in
2102 // isel patterns.
2103
2107
2108 if (Subtarget.hasDQI()) {
2109 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2110 // v2f32 UINT_TO_FP is already custom under SSE2.
2113 "Unexpected operation action!");
2114 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2119 }
2120
2121 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2127 }
2128
2129 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2132 }
2133
2134 // Custom legalize 2x32 to get a little better code.
2135 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
2136 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2137
2138 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2139 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2140 setOperationAction(ISD::MSCATTER, VT, Custom);
2141
2142 if (Subtarget.hasDQI()) {
2146 setOperationAction(Opc, MVT::v2i64, Custom);
2147 setOperationAction(Opc, MVT::v4i64, Custom);
2148 }
2149 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2150 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2151 }
2152
2153 if (Subtarget.hasCDI()) {
2154 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2156 }
2157 } // Subtarget.hasCDI()
2158
2159 if (Subtarget.hasVPOPCNTDQ()) {
2160 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2162 }
2163
2164 // We can try to convert vectors to different sizes to leverage legal
2165 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2166 // then specialize to Legal below.
2167 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2168 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2169 MVT::v16i16, MVT::v8i8})
2171
2172 // Legal vpcompress depends on various AVX512 extensions.
2173 // Legal in AVX512F
2174 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2176
2177 // Legal in AVX512F + AVX512VL
2178 if (Subtarget.hasVLX())
2179 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2180 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2182
2183 // Legal in AVX512F + AVX512VBMI2
2184 if (Subtarget.hasVBMI2())
2185 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2187
2188 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2189 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2190 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2192 }
2193
2194   // This block controls legalization of v32i1/v64i1, which are available with
2195   // AVX512BW.
2196 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2197 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2198 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2199
2200 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2211 }
2212
2213 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2215
2216 // Extends from v32i1 masks to 256-bit vectors.
2220
2221 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2222 MVT::v16f16, MVT::v8f16}) {
2223 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2224 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2225 }
2226
2227 // These operations are handled on non-VLX by artificially widening in
2228 // isel patterns.
2229 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2230
2231 if (Subtarget.hasBITALG()) {
2232 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2234 }
2235 }
2236
2237 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2238 auto setGroup = [&] (MVT VT) {
2247 setOperationAction(ISD::FSQRT, VT, Legal);
2249
2250 setOperationAction(ISD::FFLOOR, VT, Legal);
2252 setOperationAction(ISD::FCEIL, VT, Legal);
2254 setOperationAction(ISD::FTRUNC, VT, Legal);
2256 setOperationAction(ISD::FRINT, VT, Legal);
2258 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2260 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
2262
2263 setOperationAction(ISD::FROUND, VT, Custom);
2264
2265 setOperationAction(ISD::LOAD, VT, Legal);
2266 setOperationAction(ISD::STORE, VT, Legal);
2267
2273
2274 setOperationAction(ISD::FNEG, VT, Custom);
2275 setOperationAction(ISD::FABS, VT, Custom);
2279
2283 };
2284
2285 // AVX512_FP16 scalar operations
2286 setGroup(MVT::f16);
2290 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2292 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2296 setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
2297 setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
2298 setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
2299 setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
2300 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2302 setOperationAction(ISD::LRINT, MVT::f16, Legal);
2303 setOperationAction(ISD::LLRINT, MVT::f16, Legal);
2304
2307
2308 if (Subtarget.useAVX512Regs()) {
2309 setGroup(MVT::v32f16);
2315 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2317 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2319 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
2322
2327 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2329 MVT::v32i16);
2330 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2332 MVT::v32i16);
2333 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2335 MVT::v32i16);
2336 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2338 MVT::v32i16);
2339
2343
2344 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2345 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2346
2347 setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
2348 setOperationAction(ISD::FMAXIMUM, MVT::v32f16, Custom);
2349 setOperationAction(ISD::FMINIMUMNUM, MVT::v32f16, Custom);
2350 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32f16, Custom);
2351 setOperationAction(ISD::LRINT, MVT::v32f16, Legal);
2352 setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
2353 }
2354
2359
2360 if (Subtarget.hasVLX()) {
2361 setGroup(MVT::v8f16);
2362 setGroup(MVT::v16f16);
2363
2374
2377 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
2379 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
2381
2382 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2385
2389
2390 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2391 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2392 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2393 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2394
2395 // Need to custom widen these to prevent scalarization.
2396 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2397 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2398
2399 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
2400 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Custom);
2401 setOperationAction(ISD::FMINIMUMNUM, MVT::v8f16, Custom);
2402 setOperationAction(ISD::FMAXIMUMNUM, MVT::v8f16, Custom);
2403
2404 setOperationAction(ISD::FMINIMUM, MVT::v16f16, Custom);
2405 setOperationAction(ISD::FMAXIMUM, MVT::v16f16, Custom);
2406 setOperationAction(ISD::FMINIMUMNUM, MVT::v16f16, Custom);
2407 setOperationAction(ISD::FMAXIMUMNUM, MVT::v16f16, Custom);
2408 setOperationAction(ISD::LRINT, MVT::v8f16, Legal);
2409 setOperationAction(ISD::LRINT, MVT::v16f16, Legal);
2410 }
2411 }
2412
2413 if (!Subtarget.useSoftFloat() &&
2414 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2415 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2416 : &X86::VR128RegClass);
2417 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2418 : &X86::VR256RegClass);
2419     // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2420     // provide a method to promote BUILD_VECTOR and INSERT_VECTOR_ELT, so set
2421     // their operation action to Custom and do the customization later.
2424 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2425 setF16Action(VT, Expand);
2426 if (!Subtarget.hasBF16())
2432 }
2433 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2434 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2435 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2436 }
2437 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2438 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2440 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2441 }
2442
2443 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2444 Subtarget.useAVX512Regs()) {
2445 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2446 setF16Action(MVT::v32bf16, Expand);
2447 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2448 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2449 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2451 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2455 }
2456
2457 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2458 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2459 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2460 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2461 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2462 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2463 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2464 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2465 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2466 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2467 setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
2468 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
2469 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2474 setOperationAction(ISD::FSQRT, VT, Legal);
2477 setOperationAction(ISD::FMINIMUM, VT, Custom);
2478 setOperationAction(ISD::FMAXIMUM, VT, Custom);
2479 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
2480 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
2481 }
2482 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2485 }
2486 }
2487
2488 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2489 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2490 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2491 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2492 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2493 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2494
2495 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2496 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2497 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2498 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2499 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2500
2501 if (Subtarget.hasBWI()) {
2502 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2503 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2504 }
2505
2506 if (Subtarget.hasFP16()) {
2507 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2516 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2525 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2530 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2531 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2533 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2535 }
2536 }
2537
2538 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2539 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2540 }
2541
2542 // We want to custom lower some of our intrinsics.
2546 if (!Subtarget.is64Bit()) {
2548 }
2549
2550 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2551 // handle type legalization for these operations here.
2552 //
2553 // FIXME: We really should do custom legalization for addition and
2554 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2555 // than generic legalization for 64-bit multiplication-with-overflow, though.
2556 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2557 if (VT == MVT::i64 && !Subtarget.is64Bit())
2558 continue;
2559 // Add/Sub/Mul with overflow operations are custom lowered.
2566
2567 // Support carry in as value rather than glue.
2573 }
2574
2575 // Combine sin / cos into _sincos_stret if it is available.
2576 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2577 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2578 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2579 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2580 }
2581
2582 if (Subtarget.isTargetWin64()) {
2583 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2584 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2585 setOperationAction(ISD::SREM, MVT::i128, Custom);
2586 setOperationAction(ISD::UREM, MVT::i128, Custom);
2595 }
2596
2597   // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2598 // is. We should promote the value to 64-bits to solve this.
2599 // This is what the CRT headers do - `fmodf` is an inline header
2600 // function casting to f64 and calling `fmod`.
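  // Promote for the f32 ops below roughly means: extend the operand to f64,
  // call the f64 libcall (e.g. sin instead of sinf), and truncate the result
  // back to f32.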
2601 if (Subtarget.is32Bit() &&
2602 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2603 // clang-format off
2604 for (ISD::NodeType Op :
2605 {ISD::FACOS, ISD::STRICT_FACOS,
2606 ISD::FASIN, ISD::STRICT_FASIN,
2607 ISD::FATAN, ISD::STRICT_FATAN,
2608 ISD::FATAN2, ISD::STRICT_FATAN2,
2609 ISD::FCEIL, ISD::STRICT_FCEIL,
2610 ISD::FCOS, ISD::STRICT_FCOS,
2611 ISD::FCOSH, ISD::STRICT_FCOSH,
2612 ISD::FEXP, ISD::STRICT_FEXP,
2613 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2615 ISD::FLOG, ISD::STRICT_FLOG,
2616 ISD::FLOG10, ISD::STRICT_FLOG10,
2617 ISD::FPOW, ISD::STRICT_FPOW,
2618 ISD::FSIN, ISD::STRICT_FSIN,
2619 ISD::FSINH, ISD::STRICT_FSINH,
2620 ISD::FTAN, ISD::STRICT_FTAN,
2621 ISD::FTANH, ISD::STRICT_FTANH,
2622          // TODO: Add ISD::STRICT_FMODF too once implemented.
2623 ISD::FMODF})
2624 if (isOperationExpand(Op, MVT::f32))
2625 setOperationAction(Op, MVT::f32, Promote);
2626 // clang-format on
2627
2628 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2629 // it, but it's just a wrapper around ldexp.
2630 if (Subtarget.isOSWindows()) {
2631 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2632 if (isOperationExpand(Op, MVT::f32))
2633 setOperationAction(Op, MVT::f32, Promote);
2634 }
2635
2636 // We have target-specific dag combine patterns for the following nodes:
2644 ISD::BITCAST,
2647 ISD::SHL,
2648 ISD::SRA,
2649 ISD::SRL,
2650 ISD::OR,
2651 ISD::AND,
2657 ISD::ADD,
2658 ISD::FADD,
2659 ISD::FSUB,
2660 ISD::FNEG,
2661 ISD::FMA,
2663 ISD::FMINNUM,
2664 ISD::FMAXNUM,
2665 ISD::SUB,
2666 ISD::LOAD,
2667 ISD::LRINT,
2668 ISD::LLRINT,
2669 ISD::MLOAD,
2670 ISD::STORE,
2671 ISD::MSTORE,
2687 ISD::SETCC,
2688 ISD::MUL,
2689 ISD::XOR,
2690 ISD::MSCATTER,
2691 ISD::MGATHER,
2692 ISD::FP16_TO_FP,
2693 ISD::FP_EXTEND,
2700
2701 computeRegisterProperties(Subtarget.getRegisterInfo());
2702
2703 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2705 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2707 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2709
2710 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2711   // that needs to be benchmarked and balanced with the potential use of vector
2712 // load/store types (PR33329, PR33914).
2715
2716 // Default loop alignment, which can be overridden by -align-loops.
2718
2719 // An out-of-order CPU can speculatively execute past a predictable branch,
2720 // but a conditional move could be stalled by an expensive earlier operation.
2721 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2722 EnableExtLdPromotion = true;
2724
2726
2727 // Default to having -disable-strictnode-mutation on
2728 IsStrictFPEnabled = true;
2729}
2730
2731// This has so far only been implemented for 64-bit MachO.
2733 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2734}
2735
2737 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2738 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2739}
2740
2742 const SDLoc &DL) const {
2743 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2744 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2745 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2746 return SDValue(Node, 0);
2747}
2748
2751 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2752 !Subtarget.hasBWI())
2753 return TypeSplitVector;
2754
2755 // Since v8f16 is legal, widen anything over v4f16.
2756 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2757 VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
2758 VT.getVectorElementType() == MVT::f16)
2759 return TypeSplitVector;
2760
2761 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2762 VT.getVectorElementType() != MVT::i1)
2763 return TypeWidenVector;
2764
2766}
2767
2768FastISel *
2770 const TargetLibraryInfo *libInfo) const {
2771 return X86::createFastISel(funcInfo, libInfo);
2772}
2773
2774//===----------------------------------------------------------------------===//
2775// Other Lowering Hooks
2776//===----------------------------------------------------------------------===//
2777
2779 bool AssumeSingleUse) {
2780 if (!AssumeSingleUse && !Op.hasOneUse())
2781 return false;
2782 if (!ISD::isNormalLoad(Op.getNode()))
2783 return false;
2784
2785   // If this is an unaligned vector load, make sure the target supports folding it.
2786 auto *Ld = cast<LoadSDNode>(Op.getNode());
2787 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2788 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2789 return false;
2790
2791 // TODO: If this is a non-temporal load and the target has an instruction
2792 // for it, it should not be folded. See "useNonTemporalLoad()".
2793
2794 return true;
2795}
2796
2798 const X86Subtarget &Subtarget,
2799 bool AssumeSingleUse) {
2800 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2801 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2802 return false;
2803
2804   // We cannot replace a wide volatile load with a broadcast-from-memory,
2805 // because that would narrow the load, which isn't legal for volatiles.
2806 auto *Ld = cast<LoadSDNode>(Op.getNode());
2807 return !Ld->isVolatile() ||
2808 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2809}
2810
2812 if (!Op.hasOneUse())
2813 return false;
2814 // Peek through (oneuse) bitcast users
2815 SDNode *User = *Op->user_begin();
2816 while (User->getOpcode() == ISD::BITCAST) {
2817 if (!User->hasOneUse())
2818 return false;
2819 User = *User->user_begin();
2820 }
2821 return ISD::isNormalStore(User);
2822}
2823
2825 if (Op.hasOneUse()) {
2826 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2827 return (ISD::ZERO_EXTEND == Opcode);
2828 }
2829 return false;
2830}
2831
2832static bool isLogicOp(unsigned Opcode) {
2833 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2834 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2835}
2836
2837static bool isTargetShuffle(unsigned Opcode) {
2838 switch(Opcode) {
2839 default: return false;
2840 case X86ISD::BLENDI:
2841 case X86ISD::PSHUFB:
2842 case X86ISD::PSHUFD:
2843 case X86ISD::PSHUFHW:
2844 case X86ISD::PSHUFLW:
2845 case X86ISD::SHUFP:
2846 case X86ISD::INSERTPS:
2847 case X86ISD::EXTRQI:
2848 case X86ISD::INSERTQI:
2849 case X86ISD::VALIGN:
2850 case X86ISD::PALIGNR:
2851 case X86ISD::VSHLDQ:
2852 case X86ISD::VSRLDQ:
2853 case X86ISD::MOVLHPS:
2854 case X86ISD::MOVHLPS:
2855 case X86ISD::MOVSHDUP:
2856 case X86ISD::MOVSLDUP:
2857 case X86ISD::MOVDDUP:
2858 case X86ISD::MOVSS:
2859 case X86ISD::MOVSD:
2860 case X86ISD::MOVSH:
2861 case X86ISD::UNPCKL:
2862 case X86ISD::UNPCKH:
2863 case X86ISD::VBROADCAST:
2864 case X86ISD::VPERMILPI:
2865 case X86ISD::VPERMILPV:
2866 case X86ISD::VPERM2X128:
2867 case X86ISD::SHUF128:
2868 case X86ISD::VPERMIL2:
2869 case X86ISD::VPERMI:
2870 case X86ISD::VPPERM:
2871 case X86ISD::VPERMV:
2872 case X86ISD::VPERMV3:
2873 case X86ISD::VZEXT_MOVL:
2874 return true;
2875 }
2876}
2877
2878static bool isTargetShuffleVariableMask(unsigned Opcode) {
2879 switch (Opcode) {
2880 default: return false;
2881 // Target Shuffles.
2882 case X86ISD::PSHUFB:
2883 case X86ISD::VPERMILPV:
2884 case X86ISD::VPERMIL2:
2885 case X86ISD::VPPERM:
2886 case X86ISD::VPERMV:
2887 case X86ISD::VPERMV3:
2888 return true;
2889 // 'Faux' Target Shuffles.
2890 case ISD::OR:
2891 case ISD::AND:
2892 case X86ISD::ANDNP:
2893 return true;
2894 }
2895}
2896
2899 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2901 int ReturnAddrIndex = FuncInfo->getRAIndex();
2902
2903 if (ReturnAddrIndex == 0) {
2904 // Set up a frame object for the return address.
2905 unsigned SlotSize = RegInfo->getSlotSize();
2906 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2907 -(int64_t)SlotSize,
2908 false);
2909 FuncInfo->setRAIndex(ReturnAddrIndex);
2910 }
2911
2912 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2913}
2914
2916 bool HasSymbolicDisplacement) {
2917 // Offset should fit into 32 bit immediate field.
2918 if (!isInt<32>(Offset))
2919 return false;
2920
2921 // If we don't have a symbolic displacement - we don't have any extra
2922 // restrictions.
2923 if (!HasSymbolicDisplacement)
2924 return true;
2925
2926 // We can fold large offsets in the large code model because we always use
2927 // 64-bit offsets.
2928 if (CM == CodeModel::Large)
2929 return true;
2930
2931   // For the kernel code model we know that all objects reside in the negative
2932   // half of the 32-bit address space. We must not accept negative offsets, since
2933   // they may be just out of range, but we may accept pretty large positive ones.
2934 if (CM == CodeModel::Kernel)
2935 return Offset >= 0;
2936
2937   // For other non-large code models we assume that the last small object is 16MB
2938   // before the end of the 31-bit boundary. We may also accept pretty large negative
2939 // constants knowing that all objects are in the positive half of address
2940 // space.
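  // e.g. the kernel code model accepts GV+0x1000 but rejects GV-8, while the
  // other small code models accept any offset below 16MB, including large
  // negative ones.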
2941 return Offset < 16 * 1024 * 1024;
2942}
2943
2944 /// Return true if the condition is a signed comparison operation.
2945static bool isX86CCSigned(X86::CondCode X86CC) {
2946 switch (X86CC) {
2947 default:
2948 llvm_unreachable("Invalid integer condition!");
2949 case X86::COND_E:
2950 case X86::COND_NE:
2951 case X86::COND_B:
2952 case X86::COND_A:
2953 case X86::COND_BE:
2954 case X86::COND_AE:
2955 return false;
2956 case X86::COND_G:
2957 case X86::COND_GE:
2958 case X86::COND_L:
2959 case X86::COND_LE:
2960 return true;
2961 }
2962}
2963
2965 switch (SetCCOpcode) {
2966 // clang-format off
2967 default: llvm_unreachable("Invalid integer condition!");
2968 case ISD::SETEQ: return X86::COND_E;
2969 case ISD::SETGT: return X86::COND_G;
2970 case ISD::SETGE: return X86::COND_GE;
2971 case ISD::SETLT: return X86::COND_L;
2972 case ISD::SETLE: return X86::COND_LE;
2973 case ISD::SETNE: return X86::COND_NE;
2974 case ISD::SETULT: return X86::COND_B;
2975 case ISD::SETUGT: return X86::COND_A;
2976 case ISD::SETULE: return X86::COND_BE;
2977 case ISD::SETUGE: return X86::COND_AE;
2978 // clang-format on
2979 }
2980}
2981
2982/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
2983/// condition code, returning the condition code and the LHS/RHS of the
2984/// comparison to make.
2986 bool isFP, SDValue &LHS, SDValue &RHS,
2987 SelectionDAG &DAG) {
2988 if (!isFP) {
2990 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2991 // X > -1 -> X == 0, jump !sign.
2992 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2993 return X86::COND_NS;
2994 }
2995 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2996 // X < 0 -> X == 0, jump on sign.
2997 return X86::COND_S;
2998 }
2999 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3000 // X >= 0 -> X == 0, jump on !sign.
3001 return X86::COND_NS;
3002 }
3003 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3004 // X < 1 -> X <= 0
3005 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3006 return X86::COND_LE;
3007 }
3008 }
3009
3010 return TranslateIntegerX86CC(SetCCOpcode);
3011 }
3012
3013 // First determine if it is required or is profitable to flip the operands.
3014
3015 // If LHS is a foldable load, but RHS is not, flip the condition.
3016 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3017 !ISD::isNON_EXTLoad(RHS.getNode())) {
3018 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3019 std::swap(LHS, RHS);
3020 }
3021
3022 switch (SetCCOpcode) {
3023 default: break;
3024 case ISD::SETOLT:
3025 case ISD::SETOLE:
3026 case ISD::SETUGT:
3027 case ISD::SETUGE:
3028 std::swap(LHS, RHS);
3029 break;
3030 }
3031
3032 // On a floating point condition, the flags are set as follows:
3033 // ZF PF CF op
3034 // 0 | 0 | 0 | X > Y
3035 // 0 | 0 | 1 | X < Y
3036 // 1 | 0 | 0 | X == Y
3037 // 1 | 1 | 1 | unordered
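  // Unordered compares set ZF = PF = CF = 1, so COND_A/COND_AE (used for the
  // ordered greater-than cases) are false on NaNs, while COND_B/COND_BE (used
  // for the unordered less-than cases) are true on NaNs, as required.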
3038 switch (SetCCOpcode) {
3039 // clang-format off
3040 default: llvm_unreachable("Condcode should be pre-legalized away");
3041 case ISD::SETUEQ:
3042 case ISD::SETEQ: return X86::COND_E;
3043 case ISD::SETOLT: // flipped
3044 case ISD::SETOGT:
3045 case ISD::SETGT: return X86::COND_A;
3046 case ISD::SETOLE: // flipped
3047 case ISD::SETOGE:
3048 case ISD::SETGE: return X86::COND_AE;
3049 case ISD::SETUGT: // flipped
3050 case ISD::SETULT:
3051 case ISD::SETLT: return X86::COND_B;
3052 case ISD::SETUGE: // flipped
3053 case ISD::SETULE:
3054 case ISD::SETLE: return X86::COND_BE;
3055 case ISD::SETONE:
3056 case ISD::SETNE: return X86::COND_NE;
3057 case ISD::SETUO: return X86::COND_P;
3058 case ISD::SETO: return X86::COND_NP;
3059 case ISD::SETOEQ:
3060 case ISD::SETUNE: return X86::COND_INVALID;
3061 // clang-format on
3062 }
3063}
3064
3065/// Is there a floating point cmov for the specific X86 condition code?
3066 /// The current x86 ISA includes the following FP cmov instructions:
3067 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3068static bool hasFPCMov(unsigned X86CC) {
3069 switch (X86CC) {
3070 default:
3071 return false;
3072 case X86::COND_B:
3073 case X86::COND_BE:
3074 case X86::COND_E:
3075 case X86::COND_P:
3076 case X86::COND_A:
3077 case X86::COND_AE:
3078 case X86::COND_NE:
3079 case X86::COND_NP:
3080 return true;
3081 }
3082}
3083
3084static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3085 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3086 VT.is512BitVector();
3087}
3088
3090 const CallInst &I,
3091 MachineFunction &MF,
3092 unsigned Intrinsic) const {
3093 Info.flags = MachineMemOperand::MONone;
3094 Info.offset = 0;
3095
3097 if (!IntrData) {
3098 switch (Intrinsic) {
3099 case Intrinsic::x86_aesenc128kl:
3100 case Intrinsic::x86_aesdec128kl:
3101 Info.opc = ISD::INTRINSIC_W_CHAIN;
3102 Info.ptrVal = I.getArgOperand(1);
3103 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3104 Info.align = Align(1);
3105 Info.flags |= MachineMemOperand::MOLoad;
3106 return true;
3107 case Intrinsic::x86_aesenc256kl:
3108 case Intrinsic::x86_aesdec256kl:
3109 Info.opc = ISD::INTRINSIC_W_CHAIN;
3110 Info.ptrVal = I.getArgOperand(1);
3111 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3112 Info.align = Align(1);
3113 Info.flags |= MachineMemOperand::MOLoad;
3114 return true;
3115 case Intrinsic::x86_aesencwide128kl:
3116 case Intrinsic::x86_aesdecwide128kl:
3117 Info.opc = ISD::INTRINSIC_W_CHAIN;
3118 Info.ptrVal = I.getArgOperand(0);
3119 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3120 Info.align = Align(1);
3121 Info.flags |= MachineMemOperand::MOLoad;
3122 return true;
3123 case Intrinsic::x86_aesencwide256kl:
3124 case Intrinsic::x86_aesdecwide256kl:
3125 Info.opc = ISD::INTRINSIC_W_CHAIN;
3126 Info.ptrVal = I.getArgOperand(0);
3127 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3128 Info.align = Align(1);
3129 Info.flags |= MachineMemOperand::MOLoad;
3130 return true;
3131 case Intrinsic::x86_cmpccxadd32:
3132 case Intrinsic::x86_cmpccxadd64:
3133 case Intrinsic::x86_atomic_bts:
3134 case Intrinsic::x86_atomic_btc:
3135 case Intrinsic::x86_atomic_btr: {
3136 Info.opc = ISD::INTRINSIC_W_CHAIN;
3137 Info.ptrVal = I.getArgOperand(0);
3138 unsigned Size = I.getType()->getScalarSizeInBits();
3139 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3140 Info.align = Align(Size);
3143 return true;
3144 }
3145 case Intrinsic::x86_atomic_bts_rm:
3146 case Intrinsic::x86_atomic_btc_rm:
3147 case Intrinsic::x86_atomic_btr_rm: {
3148 Info.opc = ISD::INTRINSIC_W_CHAIN;
3149 Info.ptrVal = I.getArgOperand(0);
3150 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3151 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3152 Info.align = Align(Size);
3155 return true;
3156 }
3157 case Intrinsic::x86_aadd32:
3158 case Intrinsic::x86_aadd64:
3159 case Intrinsic::x86_aand32:
3160 case Intrinsic::x86_aand64:
3161 case Intrinsic::x86_aor32:
3162 case Intrinsic::x86_aor64:
3163 case Intrinsic::x86_axor32:
3164 case Intrinsic::x86_axor64:
3165 case Intrinsic::x86_atomic_add_cc:
3166 case Intrinsic::x86_atomic_sub_cc:
3167 case Intrinsic::x86_atomic_or_cc:
3168 case Intrinsic::x86_atomic_and_cc:
3169 case Intrinsic::x86_atomic_xor_cc: {
3170 Info.opc = ISD::INTRINSIC_W_CHAIN;
3171 Info.ptrVal = I.getArgOperand(0);
3172 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3173 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3174 Info.align = Align(Size);
3177 return true;
3178 }
3179 }
3180 return false;
3181 }
3182
3183 switch (IntrData->Type) {
3186 case TRUNCATE_TO_MEM_VI32: {
3187 Info.opc = ISD::INTRINSIC_VOID;
3188 Info.ptrVal = I.getArgOperand(0);
3189 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3191 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3192 ScalarVT = MVT::i8;
3193 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3194 ScalarVT = MVT::i16;
3195 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3196 ScalarVT = MVT::i32;
3197
3198 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3199 Info.align = Align(1);
3200 Info.flags |= MachineMemOperand::MOStore;
3201 break;
3202 }
3203 case GATHER:
3204 case GATHER_AVX2: {
3205 Info.opc = ISD::INTRINSIC_W_CHAIN;
3206 Info.ptrVal = nullptr;
3207 MVT DataVT = MVT::getVT(I.getType());
3208 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3209 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3210 IndexVT.getVectorNumElements());
3211 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3212 Info.align = Align(1);
3213 Info.flags |= MachineMemOperand::MOLoad;
3214 break;
3215 }
3216 case SCATTER: {
3217 Info.opc = ISD::INTRINSIC_VOID;
3218 Info.ptrVal = nullptr;
3219 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3220 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3221 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3222 IndexVT.getVectorNumElements());
3223 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3224 Info.align = Align(1);
3225 Info.flags |= MachineMemOperand::MOStore;
3226 break;
3227 }
3228 default:
3229 return false;
3230 }
3231
3232 return true;
3233}
3234
3235/// Returns true if the target can instruction select the
3236/// specified FP immediate natively. If false, the legalizer will
3237/// materialize the FP immediate as a load from a constant pool.
3239 bool ForCodeSize) const {
3240 for (const APFloat &FPImm : LegalFPImmediates)
3241 if (Imm.bitwiseIsEqual(FPImm))
3242 return true;
3243 return false;
3244}
3245
3247 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
3248 std::optional<unsigned> ByteOffset) const {
3249 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3250
3251 auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
3252 while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
3253 N = *N->user_begin();
3254 return N;
3255 };
3256
3257   // "ELF Handling for Thread-Local Storage" specifies that the R_X86_64_GOTTPOFF
3258   // relocation must target a movq or addq instruction: don't let the load shrink.
3259 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3260 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3261 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3262 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3263
3264   // If this is (1) an AVX vector load with (2) multiple uses and (3) all of
3265   // those uses are extracted directly into a store (so the extract + store can
3266   // be store-folded), or (4) any use feeds a legal full-width instruction, then
3267   // it's probably not worth splitting the load.
3268 EVT VT = Load->getValueType(0);
3269 if ((VT.is256BitVector() || VT.is512BitVector()) &&
3270 !SDValue(Load, 0).hasOneUse()) {
3271 bool FullWidthUse = false;
3272 bool AllExtractStores = true;
3273 for (SDUse &Use : Load->uses()) {
3274 // Skip uses of the chain value. Result 0 of the node is the load value.
3275 if (Use.getResNo() != 0)
3276 continue;
3277
3278 const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
3279
3280 // If this use is an extract + store, it's probably not worth splitting.
3281 if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3282 all_of(User->uses(), [&](const SDUse &U) {
3283 const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3284 return Inner->getOpcode() == ISD::STORE;
3285 }))
3286 continue;
3287
3288 AllExtractStores = false;
3289
3290       // If any use is a full-width legal/target bin op, then assume it's legal
3291       // and the load won't be split.
3292 if (isBinOp(User->getOpcode()) &&
3293 (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
3294 User->getOpcode() > ISD::BUILTIN_OP_END))
3295 FullWidthUse = true;
3296 }
3297
3298 if (AllExtractStores)
3299 return false;
3300
3301     // If we have a user that uses the full vector width, then this load is
3302 // only worth splitting if the offset isn't 0 (to avoid an
3303 // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
3304 if (FullWidthUse)
3305 return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
3306 }
3307
3308 return true;
3309}
3310
3311/// Returns true if it is beneficial to convert a load of a constant
3312/// to just the constant itself.
3314 Type *Ty) const {
3315 assert(Ty->isIntegerTy());
3316
3317 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3318 if (BitSize == 0 || BitSize > 64)
3319 return false;
3320 return true;
3321}
3322
3324 // If we are using XMM registers in the ABI and the condition of the select is
3325 // a floating-point compare and we have blendv or conditional move, then it is
3326 // cheaper to select instead of doing a cross-register move and creating a
3327 // load that depends on the compare result.
3328 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3329 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3330}
3331
3333 // TODO: It might be a win to ease or lift this restriction, but the generic
3334 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3335 if (VT.isVector() && Subtarget.hasAVX512())
3336 return false;
3337
3338 return true;
3339}
3340
3342 SDValue C) const {
3343 // TODO: We handle scalars using custom code, but generic combining could make
3344 // that unnecessary.
3345 APInt MulC;
3346 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3347 return false;
3348
3349   // Find the type this will be legalized to. Otherwise we might prematurely
3350 // convert this to shl+add/sub and then still have to type legalize those ops.
3351 // Another choice would be to defer the decision for illegal types until
3352 // after type legalization. But constant splat vectors of i64 can't make it
3353 // through type legalization on 32-bit targets so we would need to special
3354 // case vXi64.
3355 while (getTypeAction(Context, VT) != TypeLegal)
3356 VT = getTypeToTransformTo(Context, VT);
3357
3358 // If vector multiply is legal, assume that's faster than shl + add/sub.
3359 // Multiply is a complex op with higher latency and lower throughput in
3360 // most implementations, sub-vXi32 vector multiplies are always fast,
3361 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
3362 // is always going to be slow.
3363 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3364 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3365 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3366 return false;
3367
3368 // shl+add, shl+sub, shl+add+neg
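  // e.g. X*9 -> (X << 3) + X, X*7 -> (X << 3) - X, X*-9 -> -((X << 3) + X).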
3369 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3370 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3371}
3372
3374 unsigned Index) const {
3376 return false;
3377
3378 // Mask vectors support all subregister combinations and operations that
3379   // extract half of the vector.
3380 if (ResVT.getVectorElementType() == MVT::i1)
3381 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3382 (Index == ResVT.getVectorNumElements()));
3383
3384 return (Index % ResVT.getVectorNumElements()) == 0;
3385}
3386
3388 unsigned Opc = VecOp.getOpcode();
3389
3390 // Assume target opcodes can't be scalarized.
3391 // TODO - do we have any exceptions?
3392 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3393 return false;
3394
3395 // If the vector op is not supported, try to convert to scalar.
3396 EVT VecVT = VecOp.getValueType();
3398 return true;
3399
3400 // If the vector op is supported, but the scalar op is not, the transform may
3401 // not be worthwhile.
3402 EVT ScalarVT = VecVT.getScalarType();
3403 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3404}
3405
3407 bool) const {
3408 // TODO: Allow vectors?
3409 if (VT.isVector())
3410 return false;
3411 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3412}
3413
3415 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3416 // i32/i64 or can rely on BSF passthrough value.
3417 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3418 Subtarget.hasBitScanPassThrough() ||
3419 (!Ty->isVectorTy() &&
3420 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3421}
3422
3424 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3425 // passthrough value.
3426 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3427 Subtarget.hasBitScanPassThrough();
3428}
3429
3431 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3432 // expensive than a straight movsd. On the other hand, it's important to
3433   // shrink long double FP constants since fldt is very slow.
3434 return !Subtarget.hasSSE2() || VT == MVT::f80;
3435}
3436
3438 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3439 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3440}
3441
3443 const SelectionDAG &DAG,
3444 const MachineMemOperand &MMO) const {
3445 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3446 BitcastVT.getVectorElementType() == MVT::i1)
3447 return false;
3448
3449 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3450 return false;
3451
3452 // If both types are legal vectors, it's always ok to convert them.
3453 if (LoadVT.isVector() && BitcastVT.isVector() &&
3454 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3455 return true;
3456
3457 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3458}
3459
3461 const MachineFunction &MF) const {
3462   // Do not merge to float value size (128 bits) if no implicit
3463 // float attribute is set.
3464 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3465
3466 if (NoFloat) {
3467 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3468 return (MemVT.getSizeInBits() <= MaxIntSize);
3469 }
3470 // Make sure we don't merge greater than our preferred vector
3471 // width.
3472 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3473 return false;
3474
3475 return true;
3476}
3477
3479 return Subtarget.hasFastLZCNT();
3480}
3481
3483 const Instruction &AndI) const {
3484 return true;
3485}
3486
3488 EVT VT = Y.getValueType();
3489
3490 if (VT.isVector())
3491 return false;
3492
3493 if (!Subtarget.hasBMI())
3494 return false;
3495
3496 // There are only 32-bit and 64-bit forms for 'andn'.
3497 if (VT != MVT::i32 && VT != MVT::i64)
3498 return false;
3499
3500 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3501}
3502
3504 EVT VT = Y.getValueType();
3505
3506 if (!VT.isVector())
3507 return hasAndNotCompare(Y);
3508
3509 // Vector.
3510
3511 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3512 return false;
3513
3514 if (VT == MVT::v4i32)
3515 return true;
3516
3517 return Subtarget.hasSSE2();
3518}
3519
3521 return X.getValueType().isScalarInteger(); // 'bt'
3522}
3523
3527 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3528 SelectionDAG &DAG) const {
3529 // Does baseline recommend not to perform the fold by default?
3531 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3532 return false;
3533 // For scalars this transform is always beneficial.
3534 if (X.getValueType().isScalarInteger())
3535 return true;
3536   // If all the shift amounts are identical, then the transform is beneficial even
3537 // with rudimentary SSE2 shifts.
3538 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3539 return true;
3540   // If we have AVX2 with its powerful shift operations, then it's also good.
3541 if (Subtarget.hasAVX2())
3542 return true;
3543   // Pre-AVX2 vector codegen for this pattern is best for the variant with 'shl'.
3544 return NewShiftOpcode == ISD::SHL;
3545}
3546
3548 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3549 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3550 if (!VT.isInteger())
3551 return ShiftOpc;
3552
3553 bool PreferRotate = false;
3554 if (VT.isVector()) {
3555     // For vectors, if we have rotate instruction support, then it's definitely
3556     // best. Otherwise it's not clear what is best, so just don't make changes.
3557 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3558 VT.getScalarType() == MVT::i64);
3559 } else {
3560     // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3561 // rotate unless we have a zext mask+shr.
3562 PreferRotate = Subtarget.hasBMI2();
3563 if (!PreferRotate) {
3564 unsigned MaskBits =
3565 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3566 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3567 }
3568 }
3569
3570 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3571 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3572
3573 if (PreferRotate && MayTransformRotate)
3574 return ISD::ROTL;
3575
3576     // For vectors we don't really get much benefit from swapping constants around.
3577 // Maybe we could check if the DAG has the flipped node already in the
3578 // future.
3579 if (VT.isVector())
3580 return ShiftOpc;
3581
3582     // See if it is beneficial to swap the shift type.
3583 if (ShiftOpc == ISD::SHL) {
3584 // If the current setup has imm64 mask, then inverse will have
3585 // at least imm32 mask (or be zext i32 -> i64).
3586 if (VT == MVT::i64)
3587 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3588 : ShiftOpc;
3589
3590       // We can only benefit if the mask requires at least 7 bits. We
3591 // don't want to replace shl of 1,2,3 as they can be implemented
3592 // with lea/add.
3593 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3594 }
3595
3596 if (VT == MVT::i64)
3597 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3598 // extremely efficient.
3599 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3600
3601 // Keep small shifts as shl so we can generate add/lea.
3602 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3603 }
3604
3605   // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3606 // (PreferRotate will be set in the latter case).
3607 if (PreferRotate || !MayTransformRotate || VT.isVector())
3608 return ShiftOpc;
3609
3610 // Non-vector type and we have a zext mask with SRL.
3611 return ISD::SRL;
3612}
3613
3616 const Value *Lhs,
3617 const Value *Rhs) const {
3618 using namespace llvm::PatternMatch;
3619 int BaseCost = BrMergingBaseCostThresh.getValue();
3620 // With CCMP, branches can be merged in a more efficient way.
3621 if (BaseCost >= 0 && Subtarget.hasCCMP())
3622 BaseCost += BrMergingCcmpBias;
3623 // a == b && a == c is a fast pattern on x86.
3624 if (BaseCost >= 0 && Opc == Instruction::And &&
3627 BaseCost += 1;
3628 return {BaseCost, BrMergingLikelyBias.getValue(),
3629 BrMergingUnlikelyBias.getValue()};
3630}
3631
3633 return N->getOpcode() != ISD::FP_EXTEND;
3634}
3635
3637 const SDNode *N, CombineLevel Level) const {
3638 assert(((N->getOpcode() == ISD::SHL &&
3639 N->getOperand(0).getOpcode() == ISD::SRL) ||
3640 (N->getOpcode() == ISD::SRL &&
3641 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3642 "Expected shift-shift mask");
3643 // TODO: Should we always create i64 masks? Or only folded immediates?
3644 EVT VT = N->getValueType(0);
3645 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3646 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3647 // Only fold if the shift values are equal - so it folds to AND.
3648 // TODO - we should fold if either is a non-uniform vector but we don't do
3649 // the fold for non-splats yet.
3650 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3651 }
3653}
3654
3656 EVT VT = Y.getValueType();
3657
3658 // For vectors, we don't have a preference, but we probably want a mask.
3659 if (VT.isVector())
3660 return false;
3661
3662 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3663 if (VT == MVT::i64 && !Subtarget.is64Bit())
3664 return false;
3665
3666 return true;
3667}
3668
3671 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3673 !Subtarget.isOSWindows())
3676 ExpansionFactor);
3677}
3678
3680 // Any legal vector type can be splatted more efficiently than
3681 // loading/spilling from memory.
3682 return isTypeLegal(VT);
3683}
3684
3686 MVT VT = MVT::getIntegerVT(NumBits);
3687 if (isTypeLegal(VT))
3688 return VT;
3689
3690 // PMOVMSKB can handle this.
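  // (e.g. a 16-byte memcmp equality test then becomes roughly
  //  pcmpeqb + pmovmskb + cmp $0xffff rather than a byte-by-byte loop).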
3691 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3692 return MVT::v16i8;
3693
3694 // VPMOVMSKB can handle this.
3695 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3696 return MVT::v32i8;
3697
3698 // TODO: Allow 64-bit type for 32-bit target.
3699 // TODO: 512-bit types should be allowed, but make sure that those
3700 // cases are handled in combineVectorSizedSetCCEquality().
3701
3703}
3704
3705/// Val is the undef sentinel value or equal to the specified value.
3706static bool isUndefOrEqual(int Val, int CmpVal) {
3707 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3708}
3709
3710/// Return true if every element in Mask is the undef sentinel value or equal to
3711/// the specified value.
3712static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3713 return llvm::all_of(Mask, [CmpVal](int M) {
3714 return (M == SM_SentinelUndef) || (M == CmpVal);
3715 });
3716}
3717
3718/// Return true if every element in Mask, beginning from position Pos and ending
3719/// in Pos+Size is the undef sentinel value or equal to the specified value.
3720static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3721 unsigned Size) {
3722 return llvm::all_of(Mask.slice(Pos, Size),
3723 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3724}
3725
3726/// Val is either the undef or zero sentinel value.
3727static bool isUndefOrZero(int Val) {
3728 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3729}
3730
3731/// Return true if every element in Mask, beginning from position Pos and ending
3732/// in Pos+Size is the undef sentinel value.
3733static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3734 return llvm::all_of(Mask.slice(Pos, Size),
3735 [](int M) { return M == SM_SentinelUndef; });
3736}
3737
3738/// Return true if the mask creates a vector whose lower half is undefined.
3740 unsigned NumElts = Mask.size();
3741 return isUndefInRange(Mask, 0, NumElts / 2);
3742}
3743
3744/// Return true if the mask creates a vector whose upper half is undefined.
3746 unsigned NumElts = Mask.size();
3747 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3748}
3749
3750/// Return true if Val falls within the specified range (L, H].
3751static bool isInRange(int Val, int Low, int Hi) {
3752 return (Val >= Low && Val < Hi);
3753}
3754
3755/// Return true if the value of any element in Mask falls within the specified
3756 /// range [Low, Hi).
3757static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3758 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3759}
3760
3761/// Return true if the value of any element in Mask is the zero sentinel value.
3762static bool isAnyZero(ArrayRef<int> Mask) {
3763 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3764}
3765
3766/// Return true if Val is undef or if its value falls within the
3767 /// specified range [Low, Hi).
3768static bool isUndefOrInRange(int Val, int Low, int Hi) {
3769 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3770}
3771
3772/// Return true if every element in Mask is undef or if its value
3773 /// falls within the specified range [Low, Hi).
3774static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3775 return llvm::all_of(
3776 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3777}
3778
3779/// Return true if Val is undef, zero or if its value falls within the
3780 /// specified range [Low, Hi).
3781static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3782 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3783}
3784
3785/// Return true if every element in Mask is undef, zero or if its value
3786 /// falls within the specified range [Low, Hi).
3787static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3788 return llvm::all_of(
3789 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3790}
3791
3792 /// Return true if every element in Mask is an in-place blend/select mask or is
3793/// undef.
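/// e.g. with Mask.size() == 4, {0, 5, -1, 3} is a blend/select mask (each lane
/// selects the same position from either input), but {1, 5, -1, 3} is not.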
3795 unsigned NumElts = Mask.size();
3796 for (auto [I, M] : enumerate(Mask))
3797 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3798 return false;
3799 return true;
3800}
3801
3802/// Return true if every element in Mask, beginning
3803/// from position Pos and ending in Pos + Size, falls within the specified
3804/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
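/// e.g. isSequentialOrUndefInRange({4, -1, 6, 7}, 0, 4, 4) returns true.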
3805static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3806 unsigned Size, int Low, int Step = 1) {
3807 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3808 if (!isUndefOrEqual(Mask[i], Low))
3809 return false;
3810 return true;
3811}
3812
3813/// Return true if every element in Mask, beginning
3814/// from position Pos and ending in Pos+Size, falls within the specified
3815 /// sequential range [Low, Low + Size), or is undef or is zero.
3817 unsigned Size, int Low,
3818 int Step = 1) {
3819 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3820 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3821 return false;
3822 return true;
3823}
3824
3825/// Return true if every element in Mask, beginning
3826/// from position Pos and ending in Pos+Size is undef or is zero.
3827static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3828 unsigned Size) {
3829 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3830}
3831
3832/// Return true if every element of a single input is referenced by the shuffle
3833/// mask. i.e. it just permutes them all.
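/// e.g. {3, 0, 2, 1} references every element of a v4 input, while {0, 0, 2, 3}
/// does not (element 1 is never referenced).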
3835 unsigned NumElts = Mask.size();
3836 APInt DemandedElts = APInt::getZero(NumElts);
3837 for (int M : Mask)
3838 if (isInRange(M, 0, NumElts))
3839 DemandedElts.setBit(M);
3840 return DemandedElts.isAllOnes();
3841}
3842
3843/// Helper function to test whether a shuffle mask could be
3844/// simplified by widening the elements being shuffled.
3845///
3846/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3847/// leaves it in an unspecified state.
3848///
3849/// NOTE: This must handle normal vector shuffle masks and *target* vector
3850/// shuffle masks. The latter have the special property of a '-2' representing
3851 /// a zeroed lane of a vector.
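/// e.g. the v4 mask {0, 1, 6, 7} widens to the v2 mask {0, 3}, while
/// {1, 2, 5, 6} cannot be widened because its pairs are misaligned.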
3853 SmallVectorImpl<int> &WidenedMask) {
3854 WidenedMask.assign(Mask.size() / 2, 0);
3855 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3856 int M0 = Mask[i];
3857 int M1 = Mask[i + 1];
3858
3859 // If both elements are undef, its trivial.
3860 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3861 WidenedMask[i / 2] = SM_SentinelUndef;
3862 continue;
3863 }
3864
3865 // Check for an undef mask and a mask value properly aligned to fit with
3866 // a pair of values. If we find such a case, use the non-undef mask's value.
3867 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3868 WidenedMask[i / 2] = M1 / 2;
3869 continue;
3870 }
3871 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3872 WidenedMask[i / 2] = M0 / 2;
3873 continue;
3874 }
3875
3876 // When zeroing, we need to spread the zeroing across both lanes to widen.
3877 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3878 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3880 WidenedMask[i / 2] = SM_SentinelZero;
3881 continue;
3882 }
3883 return false;
3884 }
3885
3886 // Finally check if the two mask values are adjacent and aligned with
3887 // a pair.
3888 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3889 WidenedMask[i / 2] = M0 / 2;
3890 continue;
3891 }
3892
3893 // Otherwise we can't safely widen the elements used in this shuffle.
3894 return false;
3895 }
3896 assert(WidenedMask.size() == Mask.size() / 2 &&
3897 "Incorrect size of mask after widening the elements!");
3898
3899 return true;
3900}
3901
3903 const APInt &Zeroable,
3904 bool V2IsZero,
3905 SmallVectorImpl<int> &WidenedMask) {
3906 // Create an alternative mask with info about zeroable elements.
3907 // Here we do not set undef elements as zeroable.
3908 SmallVector<int, 64> ZeroableMask(Mask);
3909 if (V2IsZero) {
3910 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3911 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3912 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3913 ZeroableMask[i] = SM_SentinelZero;
3914 }
3915 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3916}
3917
3918static bool canWidenShuffleElements(ArrayRef<int> Mask) {
3919 SmallVector<int, 32> WidenedMask;
3920 return canWidenShuffleElements(Mask, WidenedMask);
3921}
3922
3923// Attempt to narrow/widen shuffle mask until it matches the target number of
3924// elements.
3925static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3926 SmallVectorImpl<int> &ScaledMask) {
3927 unsigned NumSrcElts = Mask.size();
3928 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3929 "Illegal shuffle scale factor");
3930
3931 // Narrowing is guaranteed to work.
3932 if (NumDstElts >= NumSrcElts) {
3933 int Scale = NumDstElts / NumSrcElts;
3934 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3935 return true;
3936 }
3937
3938 // We have to repeat the widening until we reach the target size, but we can
3939 // split out the first widening as it sets up ScaledMask for us.
3940 if (canWidenShuffleElements(Mask, ScaledMask)) {
3941 while (ScaledMask.size() > NumDstElts) {
3942 SmallVector<int, 16> WidenedMask;
3943 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3944 return false;
3945 ScaledMask = std::move(WidenedMask);
3946 }
3947 return true;
3948 }
3949
3950 return false;
3951}
3952
3953static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3954 SmallVector<int, 32> ScaledMask;
3955 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3956}
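// Illustrative example (editorial sketch, not part of the upstream source):
// scaleShuffleElements({0, 3}, /*NumDstElts=*/4, Scaled) narrows each index
// and yields {0, 1, 6, 7}, while scaleShuffleElements({0, 1, 6, 7}, 2, Scaled)
// repeatedly widens and yields {0, 3}. A mask such as {1, 0, 2, 3} cannot be
// widened to 2 elements, so that call returns false.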
3957
3958// Helper to grow the shuffle mask for a larger value type.
3959// NOTE: Unlike scaleShuffleElements, which rescales a mask within the same
3960// total vector size, this grows the mask to a larger value type.
3960static void growShuffleMask(ArrayRef<int> SrcMask,
3961 SmallVectorImpl<int> &DstMask,
3962 unsigned SrcSizeInBits, unsigned DstSizeInBits) {
3963 assert(DstMask.empty() && "Expected an empty shuffle mask");
3964 assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
3965 unsigned Scale = DstSizeInBits / SrcSizeInBits;
3966 unsigned NumSrcElts = SrcMask.size();
3967 DstMask.assign(SrcMask.begin(), SrcMask.end());
3968 for (int &M : DstMask) {
3969 if (M < 0)
3970 continue;
3971 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
3972 }
3973 DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
3974}
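// Illustrative example (editorial sketch, not part of the upstream source):
// Growing the two-input v4 mask {0, 3, 4, 7} from 128 bits to 256 bits
// (Scale = 2) remaps the second-input indices 4 and 7 to 8 and 11 and then
// appends four undef elements, producing {0, 3, 8, 11, -1, -1, -1, -1}.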
3975
3976/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3977bool X86::isZeroNode(SDValue Elt) {
3978 return isNullConstant(Elt) || isNullFPConstant(Elt);
3979}
3980
3981// Build a vector of constants.
3982// Use an UNDEF node if MaskElt == -1.
3983// Split 64-bit constants in the 32-bit mode.
3984static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
3985 const SDLoc &dl, bool IsMask = false) {
3986
3987 SmallVector<SDValue, 32> Ops;
3988 bool Split = false;
3989
3990 MVT ConstVecVT = VT;
3991 unsigned NumElts = VT.getVectorNumElements();
3992 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3993 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3994 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3995 Split = true;
3996 }
3997
3998 MVT EltVT = ConstVecVT.getVectorElementType();
3999 for (unsigned i = 0; i < NumElts; ++i) {
4000 bool IsUndef = Values[i] < 0 && IsMask;
4001 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4002 DAG.getConstant(Values[i], dl, EltVT);
4003 Ops.push_back(OpNode);
4004 if (Split)
4005 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4006 DAG.getConstant(0, dl, EltVT));
4007 }
4008 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4009 if (Split)
4010 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4011 return ConstsNode;
4012}
4013
4014static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
4015 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4016 assert(Bits.size() == Undefs.getBitWidth() &&
4017 "Unequal constant and undef arrays");
4018 SmallVector<SDValue, 32> Ops;
4019 bool Split = false;
4020
4021 MVT ConstVecVT = VT;
4022 unsigned NumElts = VT.getVectorNumElements();
4023 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4024 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4025 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4026 Split = true;
4027 }
4028
4029 MVT EltVT = ConstVecVT.getVectorElementType();
4030 MVT EltIntVT = EltVT.changeTypeToInteger();
4031 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4032 if (Undefs[i]) {
4033 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4034 continue;
4035 }
4036 const APInt &V = Bits[i];
4037 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4038 if (Split) {
4039 Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
4040 Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
4041 } else {
4042 Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
4043 }
4044 }
4045
4046 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4047 return DAG.getBitcast(VT, ConstsNode);
4048}
4049
4050static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
4051 SelectionDAG &DAG, const SDLoc &dl) {
4052 APInt Undefs = APInt::getZero(Bits.size());
4053 return getConstVector(Bits, Undefs, VT, DAG, dl);
4054}
4055
4056/// Returns a vector of specified type with all zero elements.
4057static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4058 SelectionDAG &DAG, const SDLoc &dl) {
4059 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4060 VT.getVectorElementType() == MVT::i1) &&
4061 "Unexpected vector type");
4062
4063 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4064 // type. This ensures they get CSE'd. But if the integer type is not
4065 // available, use a floating-point +0.0 instead.
4066 SDValue Vec;
4067 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4068 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4069 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4070 } else if (VT.isFloatingPoint() &&
4071 TLI.isTypeLegal(VT.getVectorElementType())) {
4072 Vec = DAG.getConstantFP(+0.0, dl, VT);
4073 } else if (VT.getVectorElementType() == MVT::i1) {
4074 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4075 "Unexpected vector type");
4076 Vec = DAG.getConstant(0, dl, VT);
4077 } else {
4078 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4079 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4080 }
4081 return DAG.getBitcast(VT, Vec);
4082}
4083
4084// Helper to determine if the ops are all extracted subvectors that come from a
4085// single source. If we allow commute they don't have to be in order (Lo/Hi).
4086static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4087 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4088 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4089 LHS.getValueType() != RHS.getValueType() ||
4090 LHS.getOperand(0) != RHS.getOperand(0))
4091 return SDValue();
4092
4093 SDValue Src = LHS.getOperand(0);
4094 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4095 return SDValue();
4096
4097 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4098 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4099 RHS.getConstantOperandAPInt(1) == NumElts) ||
4100 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4101 LHS.getConstantOperandAPInt(1) == NumElts))
4102 return Src;
4103
4104 return SDValue();
4105}
4106
4107static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4108 const SDLoc &dl, unsigned vectorWidth) {
4109 EVT VT = Vec.getValueType();
4110 EVT ElVT = VT.getVectorElementType();
4111 unsigned ResultNumElts =
4112 (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
4113 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
4114
4115 assert(ResultVT.getSizeInBits() == vectorWidth &&
4116 "Illegal subvector extraction");
4117
4118 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4119 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4120 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4121
4122 // This is the index of the first element of the vectorWidth-bit chunk
4123 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4124 IdxVal &= ~(ElemsPerChunk - 1);
4125
4126 // If the input is a buildvector just emit a smaller one.
4127 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4128 return DAG.getBuildVector(ResultVT, dl,
4129 Vec->ops().slice(IdxVal, ElemsPerChunk));
4130
4131 // Check if we're extracting the upper undef of a widening pattern.
4132 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4133 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4134 isNullConstant(Vec.getOperand(2)))
4135 return DAG.getUNDEF(ResultVT);
4136
4137 return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
4138}
4139
4140/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4141/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4142/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4143/// instructions or a simple subregister reference. Idx is an index in the
4144/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4145/// lowering EXTRACT_VECTOR_ELT operations easier.
4146static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4147 SelectionDAG &DAG, const SDLoc &dl) {
4148 assert((Vec.getValueType().is256BitVector() ||
4149 Vec.getValueType().is512BitVector()) &&
4150 "Unexpected vector size!");
4151 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4152}
4153
4154/// Generate a DAG to grab 256-bits from a 512-bit vector.
4155static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4156 SelectionDAG &DAG, const SDLoc &dl) {
4157 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4158 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4159}
4160
4161static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4162 SelectionDAG &DAG, const SDLoc &dl,
4163 unsigned vectorWidth) {
4164 assert((vectorWidth == 128 || vectorWidth == 256) &&
4165 "Unsupported vector width");
4166 // Inserting an UNDEF subvector just returns Result.
4167 if (Vec.isUndef())
4168 return Result;
4169
4170 // Insert the relevant vectorWidth bits.
4171 EVT VT = Vec.getValueType();
4172 unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
4173 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4174
4175 // This is the index of the first element of the vectorWidth-bit chunk
4176 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4177 IdxVal &= ~(ElemsPerChunk - 1);
4178 return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
4179}
4180
4181/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4182/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4183/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4184/// simple superregister reference. Idx is an index in the 128 bits
4185/// we want. It need not be aligned to a 128-bit boundary. That makes
4186/// lowering INSERT_VECTOR_ELT operations easier.
4187static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4188 SelectionDAG &DAG, const SDLoc &dl) {
4189 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4190 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4191}
4192
4193/// Widen a vector to a larger size with the same scalar type, with the new
4194/// elements either zero or undef.
4195static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4196 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4197 const SDLoc &dl) {
4198 EVT VecVT = Vec.getValueType();
4199 assert(VecVT.getSizeInBits() <= VT.getSizeInBits() &&
4200 VecVT.getScalarType() == VT.getScalarType() &&
4201 "Unsupported vector widening type");
4202 // If the upper 128-bits of a build vector are already undef/zero, then try to
4203 // widen from the lower 128-bits.
4204 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4205 unsigned NumSrcElts = VecVT.getVectorNumElements();
4206 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4207 if (all_of(Hi, [&](SDValue V) {
4208 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4209 }))
4210 Vec = extract128BitVector(Vec, 0, DAG, dl);
4211 }
4212 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4213 : DAG.getUNDEF(VT);
4214 return DAG.getInsertSubvector(dl, Res, Vec, 0);
4215}
4216
4217/// Widen a vector to a larger size with the same scalar type, with the new
4218/// elements either zero or undef.
4219static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4220 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4221 const SDLoc &dl, unsigned WideSizeInBits) {
4222 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4223 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4224 "Unsupported vector widening type");
4225 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4226 MVT SVT = Vec.getSimpleValueType().getScalarType();
4227 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4228 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4229}
4230
4231/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4232/// and bitcast with integer types.
4233static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4234 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4235 unsigned NumElts = VT.getVectorNumElements();
4236 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4237 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4238 return VT;
4239}
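// Illustrative example (editorial sketch, not part of the upstream source):
// widenMaskVectorType(MVT::v4i1, Subtarget) returns v8i1 on targets with DQI
// (where the 8-bit mask instructions exist) and v16i1 otherwise; v8i1 is
// likewise widened to v16i1 without DQI, while v16i1 and wider mask types are
// returned unchanged.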
4240
4241/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4242/// bitcast with integer types.
4243static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4244 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4245 const SDLoc &dl) {
4246 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4247 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4248}
4249
4250// Helper function to collect subvector ops that are concatenated together,
4251// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
4252// The subvectors in Ops are guaranteed to be the same type.
4253static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
4254 SelectionDAG &DAG) {
4255 assert(Ops.empty() && "Expected an empty ops vector");
4256
4257 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4258 Ops.append(N->op_begin(), N->op_end());
4259 return true;
4260 }
4261
4262 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4263 SDValue Src = N->getOperand(0);
4264 SDValue Sub = N->getOperand(1);
4265 const APInt &Idx = N->getConstantOperandAPInt(2);
4266 EVT VT = Src.getValueType();
4267 EVT SubVT = Sub.getValueType();
4268
4269 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4270 // insert_subvector(undef, x, lo)
4271 if (Idx == 0 && Src.isUndef()) {
4272 Ops.push_back(Sub);
4273 Ops.push_back(DAG.getUNDEF(SubVT));
4274 return true;
4275 }
4276 if (Idx == (VT.getVectorNumElements() / 2)) {
4277 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4278 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4279 Src.getOperand(1).getValueType() == SubVT &&
4280 isNullConstant(Src.getOperand(2))) {
4281 // Attempt to recurse into inner (matching) concats.
4282 SDValue Lo = Src.getOperand(1);
4283 SDValue Hi = Sub;
4284 SmallVector<SDValue, 2> LoOps, HiOps;
4285 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4286 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4287 LoOps.size() == HiOps.size()) {
4288 Ops.append(LoOps);
4289 Ops.append(HiOps);
4290 return true;
4291 }
4292 Ops.push_back(Lo);
4293 Ops.push_back(Hi);
4294 return true;
4295 }
4296 // insert_subvector(x, extract_subvector(x, lo), hi)
4297 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4298 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4299 Ops.append(2, Sub);
4300 return true;
4301 }
4302 // insert_subvector(undef, x, hi)
4303 if (Src.isUndef()) {
4304 Ops.push_back(DAG.getUNDEF(SubVT));
4305 Ops.push_back(Sub);
4306 return true;
4307 }
4308 }
4309 }
4310 }
4311
4312 if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4313 EVT VT = N->getValueType(0);
4314 SDValue Src = N->getOperand(0);
4315 uint64_t Idx = N->getConstantOperandVal(1);
4316
4317 // Collect all the subvectors from the source vector and slice off the
4318 // extraction.
4319 SmallVector<SDValue, 4> SrcOps;
4320 if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
4321 VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
4322 (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
4323 (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
4324 unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
4325 unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
4326 Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
4327 return true;
4328 }
4329 }
4330
4331 assert(Ops.empty() && "Expected an empty ops vector");
4332 return false;
4333}
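// Illustrative example (editorial sketch, not part of the upstream source):
// For N = concat_vectors(A, B) this fills Ops = {A, B}. When the result is
// twice the subvector width, N = insert_subvector(insert_subvector(undef, X,
// 0), Y, NumElts/2) yields Ops = {X, Y} and N = insert_subvector(undef, X, 0)
// yields Ops = {X, undef}. Anything it cannot decompose leaves Ops empty and
// returns false.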
4334
4335// Helper to check if \p V can be split into subvectors and the upper
4336// subvectors are all undef; if so, return the lower subvector.
4337static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL,
4338 SelectionDAG &DAG) {
4339 SmallVector<SDValue> SubOps;
4340 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4341 return SDValue();
4342
4343 unsigned NumSubOps = SubOps.size();
4344 unsigned HalfNumSubOps = NumSubOps / 2;
4345 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4346
4347 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4348 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4349 return SDValue();
4350
4351 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4352 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4353 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4354}
4355
4356// Helper to check if we can access all the constituent subvectors without any
4357// extract ops.
4358static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG) {
4359 SmallVector<SDValue> Ops;
4360 return collectConcatOps(V.getNode(), Ops, DAG);
4361}
4362
4363static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4364 const SDLoc &dl) {
4365 EVT VT = Op.getValueType();
4366 unsigned NumElems = VT.getVectorNumElements();
4367 unsigned SizeInBits = VT.getSizeInBits();
4368 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4369 "Can't split odd sized vector");
4370
4371 SmallVector<SDValue, 4> SubOps;
4372 if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
4373 assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
4374 unsigned HalfOps = SubOps.size() / 2;
4375 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
4376 SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
4377 SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
4378 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
4379 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
4380 return std::make_pair(Lo, Hi);
4381 }
4382
4383 // If this is a splat value (with no-undefs) then use the lower subvector,
4384 // which should be a free extraction.
4385 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4386 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4387 return std::make_pair(Lo, Lo);
4388
4389 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4390 return std::make_pair(Lo, Hi);
4391}
4392
4393/// Break an operation into 2 half sized ops and then concatenate the results.
4394static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
4395 unsigned NumOps = Op.getNumOperands();
4396 EVT VT = Op.getValueType();
4397
4398 // Extract the LHS Lo/Hi vectors
4399 SmallVector<SDValue> LoOps(NumOps, SDValue());
4400 SmallVector<SDValue> HiOps(NumOps, SDValue());
4401 for (unsigned I = 0; I != NumOps; ++I) {
4402 SDValue SrcOp = Op.getOperand(I);
4403 if (!SrcOp.getValueType().isVector()) {
4404 LoOps[I] = HiOps[I] = SrcOp;
4405 continue;
4406 }
4407 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4408 }
4409
4410 EVT LoVT, HiVT;
4411 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4412 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4413 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4414 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4415}
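// Illustrative example (editorial sketch, not part of the upstream source):
// Splitting a v16i32 ADD this way produces two v8i32 ADDs on the extracted
// Lo/Hi halves of each operand, re-joined with a CONCAT_VECTORS; non-vector
// operands (e.g. an immediate) are passed unchanged to both halves.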
4416
4417/// Break an unary integer operation into 2 half sized ops and then
4418/// concatenate the result back.
4419static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
4420 const SDLoc &dl) {
4421 // Make sure we only try to split 256/512-bit types to avoid creating
4422 // narrow vectors.
4423 [[maybe_unused]] EVT VT = Op.getValueType();
4424 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4425 Op.getOperand(0).getValueType().is512BitVector()) &&
4426 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4427 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4428 VT.getVectorNumElements() &&
4429 "Unexpected VTs!");
4430 return splitVectorOp(Op, DAG, dl);
4431}
4432
4433/// Break a binary integer operation into 2 half sized ops and then
4434/// concatenate the result back.
4435static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
4436 const SDLoc &dl) {
4437 // Assert that all the types match.
4438 [[maybe_unused]] EVT VT = Op.getValueType();
4439 assert(Op.getOperand(0).getValueType() == VT &&
4440 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4441 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4442 return splitVectorOp(Op, DAG, dl);
4443}
4444
4445// Helper for splitting operands of an operation to legal target size and
4446// apply a function on each part.
4447// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4448// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4449// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4450// The argument Builder is a function that will be applied on each split part:
4451// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
4452template <typename F>
4453SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4454 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4455 F Builder, bool CheckBWI = true,
4456 bool AllowAVX512 = true) {
4457 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4458 unsigned NumSubs = 1;
4459 if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) ||
4460 (!CheckBWI && Subtarget.useAVX512Regs()))) {
4461 if (VT.getSizeInBits() > 512) {
4462 NumSubs = VT.getSizeInBits() / 512;
4463 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4464 }
4465 } else if (Subtarget.hasAVX2()) {
4466 if (VT.getSizeInBits() > 256) {
4467 NumSubs = VT.getSizeInBits() / 256;
4468 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4469 }
4470 } else {
4471 if (VT.getSizeInBits() > 128) {
4472 NumSubs = VT.getSizeInBits() / 128;
4473 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4474 }
4475 }
4476
4477 if (NumSubs == 1)
4478 return Builder(DAG, DL, Ops);
4479
4480 SmallVector<SDValue, 4> Subs;
4481 for (unsigned i = 0; i != NumSubs; ++i) {
4482 SmallVector<SDValue, 2> SubOps;
4483 for (SDValue Op : Ops) {
4484 EVT OpVT = Op.getValueType();
4485 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4486 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4487 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4488 }
4489 Subs.push_back(Builder(DAG, DL, SubOps));
4490 }
4491 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4492}
4493
4494// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4495// targets.
4496static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4497 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4498 const X86Subtarget &Subtarget) {
4499 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4500 MVT SVT = VT.getScalarType();
4501
4502 // If we have a 32/64 splatted constant, splat it to DstTy to
4503 // encourage a foldable broadcast'd operand.
4504 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4505 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4506 // AVX512 broadcasts 32/64-bit operands.
4507 // TODO: Support float once getAVX512Node is used by fp-ops.
4508 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4510 return SDValue();
4511 // If we're not widening, don't bother if we're not bitcasting.
4512 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4513 return SDValue();
4514 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4515 APInt SplatValue, SplatUndef;
4516 unsigned SplatBitSize;
4517 bool HasAnyUndefs;
4518 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4519 HasAnyUndefs, OpEltSizeInBits) &&
4520 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4521 return DAG.getConstant(SplatValue, DL, DstVT);
4522 }
4523 return SDValue();
4524 };
4525
4526 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4527
4528 MVT DstVT = VT;
4529 if (Widen)
4530 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4531
4532 // Canonicalize src operands.
4533 SmallVector<SDValue> SrcOps(Ops);
4534 for (SDValue &Op : SrcOps) {
4535 MVT OpVT = Op.getSimpleValueType();
4536 // Just pass through scalar operands.
4537 if (!OpVT.isVector())
4538 continue;
4539 assert(OpVT == VT && "Vector type mismatch");
4540
4541 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4542 Op = BroadcastOp;
4543 continue;
4544 }
4545
4546 // Just widen the subvector by inserting into an undef wide vector.
4547 if (Widen)
4548 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4549 }
4550
4551 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4552
4553 // Perform the 512-bit op then extract the bottom subvector.
4554 if (Widen)
4555 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4556 return Res;
4557}
4558
4559/// Insert i1-subvector to i1-vector.
4560static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4561 const X86Subtarget &Subtarget) {
4562
4563 SDLoc dl(Op);
4564 SDValue Vec = Op.getOperand(0);
4565 SDValue SubVec = Op.getOperand(1);
4566 SDValue Idx = Op.getOperand(2);
4567 unsigned IdxVal = Op.getConstantOperandVal(2);
4568
4569 // Inserting undef is a nop. We can just return the original vector.
4570 if (SubVec.isUndef())
4571 return Vec;
4572
4573 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4574 return Op;
4575
4576 MVT OpVT = Op.getSimpleValueType();
4577 unsigned NumElems = OpVT.getVectorNumElements();
4578 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4579
4580 // Extend to natively supported kshift.
4581 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4582
4583 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4584 // if necessary.
4585 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4586 // May need to promote to a legal type.
4587 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4588 DAG.getConstant(0, dl, WideOpVT),
4589 SubVec, Idx);
4590 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4591 }
4592
4593 MVT SubVecVT = SubVec.getSimpleValueType();
4594 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4595 assert(IdxVal + SubVecNumElems <= NumElems &&
4596 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4597 "Unexpected index value in INSERT_SUBVECTOR");
4598
4599 SDValue Undef = DAG.getUNDEF(WideOpVT);
4600
4601 if (IdxVal == 0) {
4602 // Zero lower bits of the Vec
4603 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4604 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4605 ZeroIdx);
4606 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4607 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4608 // Merge them together, SubVec should be zero extended.
4609 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4610 DAG.getConstant(0, dl, WideOpVT),
4611 SubVec, ZeroIdx);
4612 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4613 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4614 }
4615
4616 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4617 Undef, SubVec, ZeroIdx);
4618
4619 if (Vec.isUndef()) {
4620 assert(IdxVal != 0 && "Unexpected index");
4621 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4622 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4623 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4624 }
4625
4626 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4627 assert(IdxVal != 0 && "Unexpected index");
4628 // If upper elements of Vec are known undef, then just shift into place.
4629 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4630 [](SDValue V) { return V.isUndef(); })) {
4631 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4632 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4633 } else {
4634 NumElems = WideOpVT.getVectorNumElements();
4635 unsigned ShiftLeft = NumElems - SubVecNumElems;
4636 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4637 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4638 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4639 if (ShiftRight != 0)
4640 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4641 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4642 }
4643 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4644 }
4645
4646 // Simple case when we put subvector in the upper part
4647 if (IdxVal + SubVecNumElems == NumElems) {
4648 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4649 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4650 if (SubVecNumElems * 2 == NumElems) {
4651 // Special case, use legal zero extending insert_subvector. This allows
4652 // isel to optimize when bits are known zero.
4653 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4654 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4655 DAG.getConstant(0, dl, WideOpVT),
4656 Vec, ZeroIdx);
4657 } else {
4658 // Otherwise use explicit shifts to zero the bits.
4659 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4660 Undef, Vec, ZeroIdx);
4661 NumElems = WideOpVT.getVectorNumElements();
4662 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4663 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4664 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4665 }
4666 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4667 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4668 }
4669
4670 // Inserting into the middle is more complicated.
4671
4672 NumElems = WideOpVT.getVectorNumElements();
4673
4674 // Widen the vector if needed.
4675 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4676
4677 unsigned ShiftLeft = NumElems - SubVecNumElems;
4678 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4679
4680 // Do an optimization for the most frequently used types.
4681 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4682 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4683 Mask0.flipAllBits();
4684 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4685 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4686 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4687 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4688 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4689 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4690 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4691 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4692
4693 // Reduce to original width if needed.
4694 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4695 }
4696
4697 // Clear the upper bits of the subvector and move it to its insert position.
4698 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4699 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4700 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4701 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4702
4703 // Isolate the bits below the insertion point.
4704 unsigned LowShift = NumElems - IdxVal;
4705 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4706 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4707 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4708 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4709
4710 // Isolate the bits after the last inserted bit.
4711 unsigned HighShift = IdxVal + SubVecNumElems;
4712 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4713 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4714 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4715 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4716
4717 // Now OR all 3 pieces together.
4718 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4719 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4720
4721 // Reduce to original width if needed.
4722 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4723}
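// Worked example (editorial sketch, not part of the upstream source):
// Inserting a v2i1 subvector into a v8i1 vector at index 2 (assuming v8i1
// stays legal, i.e. DQI) takes the "middle" path above: Vec is ANDed with the
// constant mask 0xF3 to clear bits [2,3], the widened SubVec is shifted left
// by 6 and right by 4 so its two bits land at positions [2,3], and the two
// pieces are ORed back together.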
4724
4725static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4726 const SDLoc &dl) {
4727 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4728 EVT SubVT = V1.getValueType();
4729 EVT SubSVT = SubVT.getScalarType();
4730 unsigned SubNumElts = SubVT.getVectorNumElements();
4731 unsigned SubVectorWidth = SubVT.getSizeInBits();
4732 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4733 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4734 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4735}
4736
4737/// Returns a vector of specified type with all bits set.
4738/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4739/// Then bitcast to their original type, ensuring they get CSE'd.
4740static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4741 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4742 "Expected a 128/256/512-bit vector type");
4743 unsigned NumElts = VT.getSizeInBits() / 32;
4744 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4745 return DAG.getBitcast(VT, Vec);
4746}
4747
4748static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4749 SDValue In, SelectionDAG &DAG) {
4750 EVT InVT = In.getValueType();
4751 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4752
4753 // Canonicalize Opcode to general extension version.
4754 switch (Opcode) {
4755 case ISD::ANY_EXTEND:
4756 case ISD::ANY_EXTEND_VECTOR_INREG:
4757 Opcode = ISD::ANY_EXTEND;
4758 break;
4759 case ISD::SIGN_EXTEND:
4760 case ISD::SIGN_EXTEND_VECTOR_INREG:
4761 Opcode = ISD::SIGN_EXTEND;
4762 break;
4763 case ISD::ZERO_EXTEND:
4764 case ISD::ZERO_EXTEND_VECTOR_INREG:
4765 Opcode = ISD::ZERO_EXTEND;
4766 break;
4767 default:
4768 llvm_unreachable("Unknown extension opcode");
4769 }
4770
4771 // For 256-bit vectors, we only need the lower (128-bit) input half.
4772 // For 512-bit vectors, we only need the lower input half or quarter.
4773 if (InVT.getSizeInBits() > 128) {
4774 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4775 "Expected VTs to be the same size!");
4776 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4777 In = extractSubVector(In, 0, DAG, DL,
4778 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4779 InVT = In.getValueType();
4780 }
4781
4782 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4783 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4784
4785 return DAG.getNode(Opcode, DL, VT, In);
4786}
4787
4788// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4789static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4790 SDValue Mask, SelectionDAG &DAG) {
4791 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4792 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4793 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4794}
4795
4796static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4797 bool Lo, bool Unary) {
4798 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4799 "Illegal vector type to unpack");
4800 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4801 int NumElts = VT.getVectorNumElements();
4802 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4803 for (int i = 0; i < NumElts; ++i) {
4804 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4805 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4806 Pos += (Unary ? 0 : NumElts * (i % 2));
4807 Pos += (Lo ? 0 : NumEltsInLane / 2);
4808 Mask.push_back(Pos);
4809 }
4810}
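// Worked examples (editorial sketch, not part of the upstream source) for
// MVT::v8i16: Lo/binary gives {0, 8, 1, 9, 2, 10, 3, 11} (punpcklwd),
// Hi/binary gives {4, 12, 5, 13, 6, 14, 7, 15} (punpckhwd), and Lo/unary
// gives {0, 0, 1, 1, 2, 2, 3, 3}.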
4811
4812/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4813/// imposed by AVX and specific to the unary pattern. Example:
4814/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4815/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4816static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4817 bool Lo) {
4818 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4819 int NumElts = VT.getVectorNumElements();
4820 for (int i = 0; i < NumElts; ++i) {
4821 int Pos = i / 2;
4822 Pos += (Lo ? 0 : NumElts / 2);
4823 Mask.push_back(Pos);
4824 }
4825}
4826
4827// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4828static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4829 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4830 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
4831 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4832 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4833 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4834 int M = Mask[I];
4835 if (M < 0)
4836 continue;
4837 SDValue V = (M < NumElts) ? V1 : V2;
4838 if (V.isUndef())
4839 continue;
4840 Ops[I] = V.getOperand(M % NumElts);
4841 }
4842 return DAG.getBuildVector(VT, dl, Ops);
4843 }
4844
4845 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4846}
4847
4848/// Returns a vector_shuffle node for an unpackl operation.
4849static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4850 SDValue V1, SDValue V2) {
4851 SmallVector<int, 8> Mask;
4852 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4853 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4854}
4855
4856/// Returns a vector_shuffle node for an unpackh operation.
4857static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4858 SDValue V1, SDValue V2) {
4859 SmallVector<int, 8> Mask;
4860 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4861 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4862}
4863
4864/// Returns a node that packs the LHS + RHS nodes together at half width.
4865/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4866/// TODO: Add subvector splitting if/when we have a need for it.
4867static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4868 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4869 bool PackHiHalf = false) {
4870 MVT OpVT = LHS.getSimpleValueType();
4871 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4872 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4873 assert(OpVT == RHS.getSimpleValueType() &&
4874 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4875 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4876 "Unexpected PACK operand types");
4877 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4878 "Unexpected PACK result type");
4879
4880 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4881 if (EltSizeInBits == 32) {
4882 SmallVector<int> PackMask;
4883 int Offset = PackHiHalf ? 1 : 0;
4884 int NumElts = VT.getVectorNumElements();
4885 for (int I = 0; I != NumElts; I += 4) {
4886 PackMask.push_back(I + Offset);
4887 PackMask.push_back(I + Offset + 2);
4888 PackMask.push_back(I + Offset + NumElts);
4889 PackMask.push_back(I + Offset + NumElts + 2);
4890 }
4891 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4892 DAG.getBitcast(VT, RHS), PackMask);
4893 }
4894
4895 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4896 if (!PackHiHalf) {
4897 if (UsePackUS &&
4898 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4899 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4900 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4901
4902 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4903 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4904 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4905 }
4906
4907 // Fallback to sign/zero extending the requested half and pack.
4908 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4909 if (UsePackUS) {
4910 if (PackHiHalf) {
4911 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4912 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4913 } else {
4914 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4915 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4916 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4917 };
4918 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4919 };
4920
4921 if (!PackHiHalf) {
4922 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4923 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4924 }
4925 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4926 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4927 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4928}
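// Illustrative note (editorial sketch, not part of the upstream source):
// Packing two v4i32 inputs into v8i16 with PackHiHalf == false first tries a
// direct PACKUS/PACKSS when the known or significant bits already fit in 16
// bits; otherwise it masks each input with 0xFFFF (or, without SSE4.1,
// sign-extends the requested half via shifts) before packing.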
4929
4930/// Return a vector_shuffle of the specified vector and a zero or undef vector.
4931/// This produces a shuffle where the low element of V2 is swizzled into the
4932/// zero/undef vector, landing at element Idx.
4933/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4934static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4935 bool IsZero,
4936 const X86Subtarget &Subtarget,
4937 SelectionDAG &DAG) {
4938 MVT VT = V2.getSimpleValueType();
4939 SDValue V1 = IsZero
4940 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4941 int NumElems = VT.getVectorNumElements();
4942 SmallVector<int, 16> MaskVec(NumElems);
4943 for (int i = 0; i != NumElems; ++i)
4944 // If this is the insertion idx, put the low elt of V2 here.
4945 MaskVec[i] = (i == Idx) ? NumElems : i;
4946 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4947}
4948
4949static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4950 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4951 Ptr.getOpcode() == X86ISD::WrapperRIP)
4952 Ptr = Ptr.getOperand(0);
4953 return dyn_cast<ConstantPoolSDNode>(Ptr);
4954}
4955
4956// TODO: Add support for non-zero offsets.
4957static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4958 ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4959 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4960 return nullptr;
4961 return CNode->getConstVal();
4962}
4963
4964static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4965 if (!Load || !ISD::isNormalLoad(Load))
4966 return nullptr;
4967 return getTargetConstantFromBasePtr(Load->getBasePtr());
4968}
4969
4970static const Constant *getTargetConstantFromNode(SDValue Op) {
4971 Op = peekThroughBitcasts(Op);
4972 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4973}
4974
4975const Constant *
4976X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4977 assert(LD && "Unexpected null LoadSDNode");
4978 return getTargetConstantFromNode(LD);
4979}
4980
4982 // Do not fold (vselect not(C), X, 0s) to (vselect C, Os, X)
4983 SDValue Cond = N->getOperand(0);
4984 SDValue RHS = N->getOperand(2);
4985 EVT CondVT = Cond.getValueType();
4986 return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
4987 CondVT.getVectorElementType() == MVT::i1 &&
4988 ISD::isBuildVectorAllZeros(RHS.getNode());
4989}
4990
4991// Extract raw constant bits from constant pools.
4992static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4993 APInt &UndefElts,
4994 SmallVectorImpl<APInt> &EltBits,
4995 bool AllowWholeUndefs = true,
4996 bool AllowPartialUndefs = false) {
4997 assert(EltBits.empty() && "Expected an empty EltBits vector");
4998
4999 Op = peekThroughBitcasts(Op);
5000
5001 EVT VT = Op.getValueType();
5002 unsigned SizeInBits = VT.getSizeInBits();
5003 unsigned NumElts = SizeInBits / EltSizeInBits;
5004
5005 // Can't split constant.
5006 if ((SizeInBits % EltSizeInBits) != 0)
5007 return false;
5008
5009 // Bitcast a source array of element bits to the target size.
5010 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5011 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5012 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5013 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5014 "Constant bit sizes don't match");
5015
5016 // Don't split if we don't allow undef bits.
5017 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5018 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5019 return false;
5020
5021 // If we're already the right size, don't bother bitcasting.
5022 if (NumSrcElts == NumElts) {
5023 UndefElts = UndefSrcElts;
5024 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5025 return true;
5026 }
5027
5028 // Extract all the undef/constant element data and pack into single bitsets.
5029 APInt UndefBits(SizeInBits, 0);
5030 APInt MaskBits(SizeInBits, 0);
5031
5032 for (unsigned i = 0; i != NumSrcElts; ++i) {
5033 unsigned BitOffset = i * SrcEltSizeInBits;
5034 if (UndefSrcElts[i])
5035 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5036 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5037 }
5038
5039 // Split the undef/constant single bitset data into the target elements.
5040 UndefElts = APInt(NumElts, 0);
5041 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5042
5043 for (unsigned i = 0; i != NumElts; ++i) {
5044 unsigned BitOffset = i * EltSizeInBits;
5045 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5046
5047 // Only treat an element as UNDEF if all bits are UNDEF.
5048 if (UndefEltBits.isAllOnes()) {
5049 if (!AllowWholeUndefs)
5050 return false;
5051 UndefElts.setBit(i);
5052 continue;
5053 }
5054
5055 // If only some bits are UNDEF then treat them as zero (or bail if not
5056 // supported).
5057 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5058 return false;
5059
5060 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
5061 }
5062 return true;
5063 };
5064
5065 // Collect constant bits and insert into mask/undef bit masks.
5066 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5067 unsigned UndefBitIndex) {
5068 if (!Cst)
5069 return false;
5070 if (isa<UndefValue>(Cst)) {
5071 Undefs.setBit(UndefBitIndex);
5072 return true;
5073 }
5074 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5075 Mask = CInt->getValue();
5076 return true;
5077 }
5078 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5079 Mask = CFP->getValueAPF().bitcastToAPInt();
5080 return true;
5081 }
5082 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
5083 Type *Ty = CDS->getType();
5084 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
5085 Type *EltTy = CDS->getElementType();
5086 bool IsInteger = EltTy->isIntegerTy();
5087 bool IsFP =
5088 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
5089 if (!IsInteger && !IsFP)
5090 return false;
5091 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
5092 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
5093 if (IsInteger)
5094 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
5095 else
5096 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5097 I * EltBits);
5098 return true;
5099 }
5100 return false;
5101 };
5102
5103 // Handle UNDEFs.
5104 if (Op.isUndef()) {
5105 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5106 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5107 return CastBitData(UndefSrcElts, SrcEltBits);
5108 }
5109
5110 // Extract scalar constant bits.
5111 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5112 APInt UndefSrcElts = APInt::getZero(1);
5113 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5114 return CastBitData(UndefSrcElts, SrcEltBits);
5115 }
5116 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5117 APInt UndefSrcElts = APInt::getZero(1);
5118 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5119 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5120 return CastBitData(UndefSrcElts, SrcEltBits);
5121 }
5122
5123 // Extract constant bits from build vector.
5124 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5125 BitVector Undefs;
5126 SmallVector<APInt> SrcEltBits;
5127 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5128 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5129 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5130 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5131 if (Undefs[I])
5132 UndefSrcElts.setBit(I);
5133 return CastBitData(UndefSrcElts, SrcEltBits);
5134 }
5135 }
5136
5137 // Extract constant bits from constant pool vector.
5138 if (auto *Cst = getTargetConstantFromNode(Op)) {
5139 Type *CstTy = Cst->getType();
5140 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5141 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5142 return false;
5143
5144 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5145 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5146 if ((SizeInBits % SrcEltSizeInBits) != 0)
5147 return false;
5148
5149 APInt UndefSrcElts(NumSrcElts, 0);
5150 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5151 for (unsigned i = 0; i != NumSrcElts; ++i)
5152 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5153 UndefSrcElts, i))
5154 return false;
5155
5156 return CastBitData(UndefSrcElts, SrcEltBits);
5157 }
5158
5159 // Extract constant bits from a broadcasted constant pool scalar.
5160 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5161 EltSizeInBits <= VT.getScalarSizeInBits()) {
5162 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5163 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5164 return false;
5165
5166 SDValue Ptr = MemIntr->getBasePtr();
5167 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5168 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5169 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5170
5171 APInt UndefSrcElts(NumSrcElts, 0);
5172 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5173 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5174 if (UndefSrcElts[0])
5175 UndefSrcElts.setBits(0, NumSrcElts);
5176 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5177 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5178 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5179 return CastBitData(UndefSrcElts, SrcEltBits);
5180 }
5181 }
5182 }
5183
5184 // Extract constant bits from a subvector broadcast.
5185 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5186 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5187 SDValue Ptr = MemIntr->getBasePtr();
5188 // The source constant may be larger than the subvector broadcast,
5189 // ensure we extract the correct subvector constants.
5190 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5191 Type *CstTy = Cst->getType();
5192 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5193 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5194 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5195 (SizeInBits % SubVecSizeInBits) != 0)
5196 return false;
5197 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5198 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5199 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5200 APInt UndefSubElts(NumSubElts, 0);
5201 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5202 APInt(CstEltSizeInBits, 0));
5203 for (unsigned i = 0; i != NumSubElts; ++i) {
5204 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5205 UndefSubElts, i))
5206 return false;
5207 for (unsigned j = 1; j != NumSubVecs; ++j)
5208 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5209 }
5210 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5211 UndefSubElts);
5212 return CastBitData(UndefSubElts, SubEltBits);
5213 }
5214 }
5215
5216 // Extract a rematerialized scalar constant insertion.
5217 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5218 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5219 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5220 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5221 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5222
5223 APInt UndefSrcElts(NumSrcElts, 0);
5224 SmallVector<APInt, 64> SrcEltBits;
5225 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5226 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5227 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5228 return CastBitData(UndefSrcElts, SrcEltBits);
5229 }
5230
5231 // Insert constant bits from a base and sub vector sources.
5232 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5233 // If this bitcasts to larger elements we might lose track of undefs - to
5234 // be safe, don't allow any.
5235 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5236 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5237
5238 APInt UndefSrcElts, UndefSubElts;
5239 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5240 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5241 UndefSubElts, EltSubBits,
5242 AllowWholeUndefs && AllowUndefs,
5243 AllowPartialUndefs && AllowUndefs) &&
5244 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5245 UndefSrcElts, EltSrcBits,
5246 AllowWholeUndefs && AllowUndefs,
5247 AllowPartialUndefs && AllowUndefs)) {
5248 unsigned BaseIdx = Op.getConstantOperandVal(2);
5249 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5250 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5251 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5252 return CastBitData(UndefSrcElts, EltSrcBits);
5253 }
5254 }
5255
5256 // Extract constant bits from a subvector's source.
5257 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5258 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
5259 EltBits, AllowWholeUndefs,
5260 AllowPartialUndefs)) {
5261 EVT SrcVT = Op.getOperand(0).getValueType();
5262 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
5263 unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
5264 unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
5265 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5266 assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
5267 (VT.getSizeInBits() % EltSizeInBits) == 0 &&
5268 (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
5269
5270 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5271 if ((BaseIdx + NumSubElts) != NumSrcElts)
5272 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5273 if (BaseIdx != 0)
5274 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5275 return true;
5276 }
5277
5278 // Extract constant bits from shuffle node sources.
5279 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5280 // TODO - support shuffle through bitcasts.
5281 if (EltSizeInBits != VT.getScalarSizeInBits())
5282 return false;
5283
5284 ArrayRef<int> Mask = SVN->getMask();
5285 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5286 llvm::any_of(Mask, [](int M) { return M < 0; }))
5287 return false;
5288
5289 APInt UndefElts0, UndefElts1;
5290 SmallVector<APInt, 32> EltBits0, EltBits1;
5291 if (isAnyInRange(Mask, 0, NumElts) &&
5292 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5293 UndefElts0, EltBits0, AllowWholeUndefs,
5294 AllowPartialUndefs))
5295 return false;
5296 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5297 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5298 UndefElts1, EltBits1, AllowWholeUndefs,
5299 AllowPartialUndefs))
5300 return false;
5301
5302 UndefElts = APInt::getZero(NumElts);
5303 for (int i = 0; i != (int)NumElts; ++i) {
5304 int M = Mask[i];
5305 if (M < 0) {
5306 UndefElts.setBit(i);
5307 EltBits.push_back(APInt::getZero(EltSizeInBits));
5308 } else if (M < (int)NumElts) {
5309 if (UndefElts0[M])
5310 UndefElts.setBit(i);
5311 EltBits.push_back(EltBits0[M]);
5312 } else {
5313 if (UndefElts1[M - NumElts])
5314 UndefElts.setBit(i);
5315 EltBits.push_back(EltBits1[M - NumElts]);
5316 }
5317 }
5318 return true;
5319 }
5320
5321 return false;
5322}
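// Illustrative example (editorial sketch, not part of the upstream source):
// Querying a v2i64 build vector {0x0000000100000002, undef} for 32-bit
// elements yields EltBits = {0x2, 0x1, 0x0, 0x0} with UndefElts = 0b1100:
// the defined 64-bit lane is split little-endian style and the undef lane
// expands to two whole-undef 32-bit elements (accepted only because
// AllowWholeUndefs defaults to true).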
5323
5324namespace llvm {
5325namespace X86 {
5326bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5327 APInt UndefElts;
5328 SmallVector<APInt, 16> EltBits;
5329 if (getTargetConstantBitsFromNode(
5330 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5331 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5332 int SplatIndex = -1;
5333 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5334 if (UndefElts[i])
5335 continue;
5336 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5337 SplatIndex = -1;
5338 break;
5339 }
5340 SplatIndex = i;
5341 }
5342 if (0 <= SplatIndex) {
5343 SplatVal = EltBits[SplatIndex];
5344 return true;
5345 }
5346 }
5347
5348 return false;
5349}
5350
5351int getRoundingModeX86(unsigned RM) {
5352 switch (static_cast<::llvm::RoundingMode>(RM)) {
5353 // clang-format off
5354 case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest; break;
5355 case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward; break;
5356 case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward; break;
5357 case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero; break;
5358 default:
5359 return X86::rmInvalid; // Invalid rounding mode
5360 }
5361}
5362
5363} // namespace X86
5364} // namespace llvm
5365
5366static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5367 unsigned MaskEltSizeInBits,
5368 SmallVectorImpl<uint64_t> &RawMask,
5369 APInt &UndefElts) {
5370 // Extract the raw target constant bits.
5371 SmallVector<APInt, 64> EltBits;
5372 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5373 EltBits, /* AllowWholeUndefs */ true,
5374 /* AllowPartialUndefs */ false))
5375 return false;
5376
5377 // Insert the extracted elements into the mask.
5378 for (const APInt &Elt : EltBits)
5379 RawMask.push_back(Elt.getZExtValue());
5380
5381 return true;
5382}
5383
5384static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts,
5385 bool AllowUndefs) {
5386 APInt UndefElts;
5387 SmallVector<APInt, 64> EltBits;
5388 if (!getTargetConstantBitsFromNode(V, EltSizeInBits, UndefElts, EltBits,
5389 /*AllowWholeUndefs*/ AllowUndefs,
5390 /*AllowPartialUndefs*/ false))
5391 return false;
5392
5393 bool IsPow2OrUndef = true;
5394 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5395 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5396 return IsPow2OrUndef;
5397}
5398
5399// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5400 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5401 // TODO: don't always ignore oneuse constraints.
5402 V = peekThroughBitcasts(V);
5403 EVT VT = V.getValueType();
5404
5405 // Match not(xor X, -1) -> X.
5406 if (V.getOpcode() == ISD::XOR &&
5407 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5408 isAllOnesConstant(V.getOperand(1))))
5409 return V.getOperand(0);
5410
5411 // Match not(extract_subvector(not(X))) -> extract_subvector(X).
5412 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5413 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5414 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5415 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5416 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5417 V.getOperand(1));
5418 }
5419 }
5420
5421 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
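// For signed compares, !(C > X) == (X >= C) == (X > C - 1), which only holds
// if no element of C is the minimum signed value (C - 1 would wrap).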
5422 if (V.getOpcode() == X86ISD::PCMPGT &&
5423 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5424 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5425 V.getOperand(0).hasOneUse()) {
5426 APInt UndefElts;
5427 SmallVector<APInt> EltBits;
5428 if (getTargetConstantBitsFromNode(V.getOperand(0),
5429 V.getScalarValueSizeInBits(), UndefElts,
5430 EltBits) &&
5431 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5432 // Don't fold min_signed_value -> (min_signed_value - 1)
5433 bool MinSigned = false;
5434 for (APInt &Elt : EltBits) {
5435 MinSigned |= Elt.isMinSignedValue();
5436 Elt -= 1;
5437 }
5438 if (!MinSigned) {
5439 SDLoc DL(V);
5440 MVT VT = V.getSimpleValueType();
5441 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5442 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5443 }
5444 }
5445 }
5446
5447 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5448 SmallVector<SDValue, 2> CatOps;
5449 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5450 for (SDValue &CatOp : CatOps) {
5451 SDValue NotCat = IsNOT(CatOp, DAG);
5452 if (!NotCat)
5453 return SDValue();
5454 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5455 }
5456 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5457 }
5458
5459 // Match not(or(not(X),not(Y))) -> and(X, Y).
5460 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5461 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5462 // TODO: Handle cases with single NOT operand -> ANDNP
5463 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5464 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5465 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5466 DAG.getBitcast(VT, Op1));
5467 }
5468
5469 return SDValue();
5470}
5471
5472/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5473/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5474/// Note: This ignores saturation, so inputs must be checked first.
5475 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5476 bool Unary, unsigned NumStages = 1) {
5477 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5478 unsigned NumElts = VT.getVectorNumElements();
5479 unsigned NumLanes = VT.getSizeInBits() / 128;
5480 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5481 unsigned Offset = Unary ? 0 : NumElts;
5482 unsigned Repetitions = 1u << (NumStages - 1);
5483 unsigned Increment = 1u << NumStages;
5484 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
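// Each pack stage keeps every other element, so after NumStages stages only
// every (1 << NumStages)'th element of each source survives. Emit those
// surviving indices from both sources, one 128-bit lane at a time, repeating
// the pattern once per folded stage.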
5485
5486 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5487 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5488 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5489 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5490 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5491 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5492 }
5493 }
5494}
5495
5496// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5497static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5498 APInt &DemandedLHS, APInt &DemandedRHS) {
5499 int NumLanes = VT.getSizeInBits() / 128;
5500 int NumElts = DemandedElts.getBitWidth();
5501 int NumInnerElts = NumElts / 2;
5502 int NumEltsPerLane = NumElts / NumLanes;
5503 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5504
5505 DemandedLHS = APInt::getZero(NumInnerElts);
5506 DemandedRHS = APInt::getZero(NumInnerElts);
5507
5508 // Map DemandedElts to the packed operands.
5509 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5510 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5511 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5512 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5513 if (DemandedElts[OuterIdx])
5514 DemandedLHS.setBit(InnerIdx);
5515 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5516 DemandedRHS.setBit(InnerIdx);
5517 }
5518 }
5519}
5520
5521// Split the demanded elts of a HADD/HSUB node between its operands.
5522static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5523 APInt &DemandedLHS, APInt &DemandedRHS) {
5525 DemandedLHS, DemandedRHS);
5526 DemandedLHS |= DemandedLHS << 1;
5527 DemandedRHS |= DemandedRHS << 1;
5528}
5529
5530/// Calculates the shuffle mask corresponding to the target-specific opcode.
5531/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5532/// operands in \p Ops, and returns true.
5533/// Sets \p IsUnary to true if only one source is used. Note that this will set
5534/// IsUnary for shuffles which use a single input multiple times, and in those
5535/// cases it will adjust the mask to only have indices within that single input.
5536/// It is an error to call this with non-empty Mask/Ops vectors.
5537static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5538 SmallVectorImpl<SDValue> &Ops,
5539 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5540 if (!isTargetShuffle(N.getOpcode()))
5541 return false;
5542
5543 MVT VT = N.getSimpleValueType();
5544 unsigned NumElems = VT.getVectorNumElements();
5545 unsigned MaskEltSize = VT.getScalarSizeInBits();
5546 SmallVector<uint64_t, 32> RawMask;
5547 APInt RawUndefs;
5548 uint64_t ImmN;
5549
5550 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5551 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5552
5553 IsUnary = false;
5554 bool IsFakeUnary = false;
5555 switch (N.getOpcode()) {
5556 case X86ISD::BLENDI:
5557 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5558 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5559 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5560 DecodeBLENDMask(NumElems, ImmN, Mask);
5561 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5562 break;
5563 case X86ISD::SHUFP:
5564 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5565 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5566 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5567 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5568 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5569 break;
5570 case X86ISD::INSERTPS:
5571 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5572 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5573 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5574 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5575 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5576 break;
5577 case X86ISD::EXTRQI:
5578 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5579 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5580 isa<ConstantSDNode>(N.getOperand(2))) {
5581 int BitLen = N.getConstantOperandVal(1);
5582 int BitIdx = N.getConstantOperandVal(2);
5583 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5584 IsUnary = true;
5585 }
5586 break;
5587 case X86ISD::INSERTQI:
5588 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5589 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5590 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5591 isa<ConstantSDNode>(N.getOperand(3))) {
5592 int BitLen = N.getConstantOperandVal(2);
5593 int BitIdx = N.getConstantOperandVal(3);
5594 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5595 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5596 }
5597 break;
5598 case X86ISD::UNPCKH:
5599 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5600 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5601 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5602 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5603 break;
5604 case X86ISD::UNPCKL:
5605 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5606 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5607 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5608 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5609 break;
5610 case X86ISD::MOVHLPS:
5611 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5612 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5613 DecodeMOVHLPSMask(NumElems, Mask);
5614 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5615 break;
5616 case X86ISD::MOVLHPS:
5617 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5618 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5619 DecodeMOVLHPSMask(NumElems, Mask);
5620 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5621 break;
5622 case X86ISD::VALIGN:
5623 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5624 "Only 32-bit and 64-bit elements are supported!");
5625 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5626 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5627 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5628 DecodeVALIGNMask(NumElems, ImmN, Mask);
5629 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
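// The decoded mask indexes the second operand first, so push the operands
// in reverse order.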
5630 Ops.push_back(N.getOperand(1));
5631 Ops.push_back(N.getOperand(0));
5632 break;
5633 case X86ISD::PALIGNR:
5634 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5635 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5636 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5637 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5638 DecodePALIGNRMask(NumElems, ImmN, Mask);
5639 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5640 Ops.push_back(N.getOperand(1));
5641 Ops.push_back(N.getOperand(0));
5642 break;
5643 case X86ISD::VSHLDQ:
5644 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5645 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5646 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5647 DecodePSLLDQMask(NumElems, ImmN, Mask);
5648 IsUnary = true;
5649 break;
5650 case X86ISD::VSRLDQ:
5651 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5652 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5653 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5654 DecodePSRLDQMask(NumElems, ImmN, Mask);
5655 IsUnary = true;
5656 break;
5657 case X86ISD::PSHUFD:
5658 case X86ISD::VPERMILPI:
5659 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5660 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5661 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5662 IsUnary = true;
5663 break;
5664 case X86ISD::PSHUFHW:
5665 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5666 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5667 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5668 IsUnary = true;
5669 break;
5670 case X86ISD::PSHUFLW:
5671 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5672 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5673 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5674 IsUnary = true;
5675 break;
5676 case X86ISD::VZEXT_MOVL:
5677 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5678 DecodeZeroMoveLowMask(NumElems, Mask);
5679 IsUnary = true;
5680 break;
5681 case X86ISD::VBROADCAST:
5682 // We only decode broadcasts of same-sized vectors, peeking through to
5683 // extracted subvectors is likely to cause hasOneUse issues with
5684 // SimplifyDemandedBits etc.
5685 if (N.getOperand(0).getValueType() == VT) {
5686 DecodeVectorBroadcast(NumElems, Mask);
5687 IsUnary = true;
5688 break;
5689 }
5690 return false;
5691 case X86ISD::VPERMILPV: {
5692 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5693 IsUnary = true;
5694 SDValue MaskNode = N.getOperand(1);
5695 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5696 RawUndefs)) {
5697 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5698 break;
5699 }
5700 return false;
5701 }
5702 case X86ISD::PSHUFB: {
5703 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5704 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5705 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5706 IsUnary = true;
5707 SDValue MaskNode = N.getOperand(1);
5708 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5709 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5710 break;
5711 }
5712 return false;
5713 }
5714 case X86ISD::VPERMI:
5715 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5716 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5717 DecodeVPERMMask(NumElems, ImmN, Mask);
5718 IsUnary = true;
5719 break;
5720 case X86ISD::MOVSS:
5721 case X86ISD::MOVSD:
5722 case X86ISD::MOVSH:
5723 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5724 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5725 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5726 break;
5727 case X86ISD::VPERM2X128:
5728 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5729 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5730 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5731 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5732 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5733 break;
5734 case X86ISD::SHUF128:
5735 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5736 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5737 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5738 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5739 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5740 break;
5741 case X86ISD::MOVSLDUP:
5742 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5743 DecodeMOVSLDUPMask(NumElems, Mask);
5744 IsUnary = true;
5745 break;
5746 case X86ISD::MOVSHDUP:
5747 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5748 DecodeMOVSHDUPMask(NumElems, Mask);
5749 IsUnary = true;
5750 break;
5751 case X86ISD::MOVDDUP:
5752 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5753 DecodeMOVDDUPMask(NumElems, Mask);
5754 IsUnary = true;
5755 break;
5756 case X86ISD::VPERMIL2: {
5757 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5758 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5759 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5760 SDValue MaskNode = N.getOperand(2);
5761 SDValue CtrlNode = N.getOperand(3);
5762 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5763 unsigned CtrlImm = CtrlOp->getZExtValue();
5764 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5765 RawUndefs)) {
5766 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5767 Mask);
5768 break;
5769 }
5770 }
5771 return false;
5772 }
5773 case X86ISD::VPPERM: {
5774 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5775 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5776 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5777 SDValue MaskNode = N.getOperand(2);
5778 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5779 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5780 break;
5781 }
5782 return false;
5783 }
5784 case X86ISD::VPERMV: {
5785 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5786 IsUnary = true;
5787 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5788 Ops.push_back(N.getOperand(1));
5789 SDValue MaskNode = N.getOperand(0);
5790 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5791 RawUndefs)) {
5792 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5793 break;
5794 }
5795 return false;
5796 }
5797 case X86ISD::VPERMV3: {
5798 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5799 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5800 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5801 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5802 Ops.push_back(N.getOperand(0));
5803 Ops.push_back(N.getOperand(2));
5804 SDValue MaskNode = N.getOperand(1);
5805 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5806 RawUndefs)) {
5807 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5808 break;
5809 }
5810 return false;
5811 }
5812 default:
5813 llvm_unreachable("unknown target shuffle node");
5814 }
5815
5816 // Empty mask indicates the decode failed.
5817 if (Mask.empty())
5818 return false;
5819
5820 // Check if we're getting a shuffle mask with zero'd elements.
5821 if (!AllowSentinelZero && isAnyZero(Mask))
5822 return false;
5823
5824 // If we have a fake unary shuffle, the shuffle mask is spread across two
5825 // inputs that are actually the same node. Re-map the mask to always point
5826 // into the first input.
5827 if (IsFakeUnary)
5828 for (int &M : Mask)
5829 if (M >= (int)Mask.size())
5830 M -= Mask.size();
5831
5832 // If we didn't already add operands in the opcode-specific code, default to
5833 // adding 1 or 2 operands starting at 0.
5834 if (Ops.empty()) {
5835 Ops.push_back(N.getOperand(0));
5836 if (!IsUnary || IsFakeUnary)
5837 Ops.push_back(N.getOperand(1));
5838 }
5839
5840 return true;
5841}
5842
5843 // Wrapper for getTargetShuffleMask that ignores the IsUnary result.
5844static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5845 SmallVectorImpl<SDValue> &Ops,
5846 SmallVectorImpl<int> &Mask) {
5847 bool IsUnary;
5848 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5849}
5850
5851/// Compute whether each element of a shuffle is zeroable.
5852///
5853/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5854/// Either it is an undef element in the shuffle mask, the element of the input
5855/// referenced is undef, or the element of the input referenced is known to be
5856/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5857/// as many lanes with this technique as possible to simplify the remaining
5858/// shuffle.
5859 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5860 SDValue V1, SDValue V2,
5861 APInt &KnownUndef, APInt &KnownZero) {
5862 int Size = Mask.size();
5863 KnownUndef = KnownZero = APInt::getZero(Size);
5864
5865 V1 = peekThroughBitcasts(V1);
5866 V2 = peekThroughBitcasts(V2);
5867
5868 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5869 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5870
5871 int VectorSizeInBits = V1.getValueSizeInBits();
5872 int ScalarSizeInBits = VectorSizeInBits / Size;
5873 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5874
5875 for (int i = 0; i < Size; ++i) {
5876 int M = Mask[i];
5877 // Handle the easy cases.
5878 if (M < 0) {
5879 KnownUndef.setBit(i);
5880 continue;
5881 }
5882 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5883 KnownZero.setBit(i);
5884 continue;
5885 }
5886
5887 // Determine shuffle input and normalize the mask.
5888 SDValue V = M < Size ? V1 : V2;
5889 M %= Size;
5890
5891 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5892 if (V.getOpcode() != ISD::BUILD_VECTOR)
5893 continue;
5894
5895 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
5896 // the (larger) source element must be UNDEF/ZERO.
5897 if ((Size % V.getNumOperands()) == 0) {
5898 int Scale = Size / V->getNumOperands();
5899 SDValue Op = V.getOperand(M / Scale);
5900 if (Op.isUndef())
5901 KnownUndef.setBit(i);
5902 if (X86::isZeroNode(Op))
5903 KnownZero.setBit(i);
5904 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5905 APInt Val = Cst->getAPIntValue();
5906 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5907 if (Val == 0)
5908 KnownZero.setBit(i);
5909 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5910 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5911 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5912 if (Val == 0)
5913 KnownZero.setBit(i);
5914 }
5915 continue;
5916 }
5917
5918 // If the BUILD_VECTOR has more elements, then all the (smaller) source
5919 // elements must be UNDEF or ZERO.
5920 if ((V.getNumOperands() % Size) == 0) {
5921 int Scale = V->getNumOperands() / Size;
5922 bool AllUndef = true;
5923 bool AllZero = true;
5924 for (int j = 0; j < Scale; ++j) {
5925 SDValue Op = V.getOperand((M * Scale) + j);
5926 AllUndef &= Op.isUndef();
5927 AllZero &= X86::isZeroNode(Op);
5928 }
5929 if (AllUndef)
5930 KnownUndef.setBit(i);
5931 if (AllZero)
5932 KnownZero.setBit(i);
5933 continue;
5934 }
5935 }
5936}
5937
5938/// Decode a target shuffle mask and inputs and see if any values are
5939/// known to be undef or zero from their inputs.
5940/// Returns true if the target shuffle mask was decoded.
5941/// FIXME: Merge this with computeZeroableShuffleElements?
5942 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5943 SmallVectorImpl<SDValue> &Ops,
5944 APInt &KnownUndef, APInt &KnownZero) {
5945 bool IsUnary;
5946 if (!isTargetShuffle(N.getOpcode()))
5947 return false;
5948
5949 MVT VT = N.getSimpleValueType();
5950 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5951 return false;
5952
5953 int Size = Mask.size();
5954 SDValue V1 = Ops[0];
5955 SDValue V2 = IsUnary ? V1 : Ops[1];
5956 KnownUndef = KnownZero = APInt::getZero(Size);
5957
5958 V1 = peekThroughBitcasts(V1);
5959 V2 = peekThroughBitcasts(V2);
5960
5961 assert((VT.getSizeInBits() % Size) == 0 &&
5962 "Illegal split of shuffle value type");
5963 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5964
5965 // Extract known constant input data.
5966 APInt UndefSrcElts[2];
5967 SmallVector<APInt, 32> SrcEltBits[2];
5968 bool IsSrcConstant[2] = {
5969 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5970 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5971 /*AllowPartialUndefs*/ false),
5972 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5973 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5974 /*AllowPartialUndefs*/ false)};
5975
5976 for (int i = 0; i < Size; ++i) {
5977 int M = Mask[i];
5978
5979 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5980 if (M < 0) {
5981 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5982 if (SM_SentinelUndef == M)
5983 KnownUndef.setBit(i);
5984 if (SM_SentinelZero == M)
5985 KnownZero.setBit(i);
5986 continue;
5987 }
5988
5989 // Determine shuffle input and normalize the mask.
5990 unsigned SrcIdx = M / Size;
5991 SDValue V = M < Size ? V1 : V2;
5992 M %= Size;
5993
5994 // We are referencing an UNDEF input.
5995 if (V.isUndef()) {
5996 KnownUndef.setBit(i);
5997 continue;
5998 }
5999
6000 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
6001 // TODO: We currently only set UNDEF for integer types - floats use the same
6002 // registers as vectors and many of the scalar folded loads rely on the
6003 // SCALAR_TO_VECTOR pattern.
6004 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6005 (Size % V.getValueType().getVectorNumElements()) == 0) {
6006 int Scale = Size / V.getValueType().getVectorNumElements();
6007 int Idx = M / Scale;
6008 if (Idx != 0 && !VT.isFloatingPoint())
6009 KnownUndef.setBit(i);
6010 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6011 KnownZero.setBit(i);
6012 continue;
6013 }
6014
6015 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
6016 // base vectors.
6017 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
6018 SDValue Vec = V.getOperand(0);
6019 int NumVecElts = Vec.getValueType().getVectorNumElements();
6020 if (Vec.isUndef() && Size == NumVecElts) {
6021 int Idx = V.getConstantOperandVal(2);
6022 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
6023 if (M < Idx || (Idx + NumSubElts) <= M)
6024 KnownUndef.setBit(i);
6025 }
6026 continue;
6027 }
6028
6029 // Attempt to extract from the source's constant bits.
6030 if (IsSrcConstant[SrcIdx]) {
6031 if (UndefSrcElts[SrcIdx][M])
6032 KnownUndef.setBit(i);
6033 else if (SrcEltBits[SrcIdx][M] == 0)
6034 KnownZero.setBit(i);
6035 }
6036 }
6037
6038 assert(VT.getVectorNumElements() == (unsigned)Size &&
6039 "Different mask size from vector size!");
6040 return true;
6041}
6042
6043// Replace target shuffle mask elements with known undef/zero sentinels.
6044 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
6045 const APInt &KnownUndef,
6046 const APInt &KnownZero,
6047 bool ResolveKnownZeros = true) {
6048 unsigned NumElts = Mask.size();
6049 assert(KnownUndef.getBitWidth() == NumElts &&
6050 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
6051
6052 for (unsigned i = 0; i != NumElts; ++i) {
6053 if (KnownUndef[i])
6054 Mask[i] = SM_SentinelUndef;
6055 else if (ResolveKnownZeros && KnownZero[i])
6056 Mask[i] = SM_SentinelZero;
6057 }
6058}
6059
6060// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
6061 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
6062 APInt &KnownUndef,
6063 APInt &KnownZero) {
6064 unsigned NumElts = Mask.size();
6065 KnownUndef = KnownZero = APInt::getZero(NumElts);
6066
6067 for (unsigned i = 0; i != NumElts; ++i) {
6068 int M = Mask[i];
6069 if (SM_SentinelUndef == M)
6070 KnownUndef.setBit(i);
6071 if (SM_SentinelZero == M)
6072 KnownZero.setBit(i);
6073 }
6074}
6075
6076// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
6077 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
6078 SDValue Cond, bool IsBLENDV = false) {
6079 EVT CondVT = Cond.getValueType();
6080 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
6081 unsigned NumElts = CondVT.getVectorNumElements();
6082
6083 APInt UndefElts;
6084 SmallVector<APInt, 32> EltBits;
6085 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
6086 /*AllowWholeUndefs*/ true,
6087 /*AllowPartialUndefs*/ false))
6088 return false;
6089
6090 Mask.resize(NumElts, SM_SentinelUndef);
6091
6092 for (int i = 0; i != (int)NumElts; ++i) {
6093 Mask[i] = i;
6094 // Arbitrarily choose from the 2nd operand if the select condition element
6095 // is undef.
6096 // TODO: Can we do better by matching patterns such as even/odd?
6097 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
6098 (IsBLENDV && EltBits[i].isNonNegative()))
6099 Mask[i] += NumElts;
6100 }
6101
6102 return true;
6103}
6104
6105// Forward declaration (for getFauxShuffleMask recursive check).
6106static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6107 SmallVectorImpl<SDValue> &Inputs,
6108 SmallVectorImpl<int> &Mask,
6109 const SelectionDAG &DAG, unsigned Depth,
6110 bool ResolveKnownElts);
6111
6112// Attempt to decode ops that could be represented as a shuffle mask.
6113 // The decoded shuffle mask may contain a different number of elements than
6114 // the destination value type.
6115// TODO: Merge into getTargetShuffleInputs()
6116static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6117 SmallVectorImpl<int> &Mask,
6118 SmallVectorImpl<SDValue> &Ops,
6119 const SelectionDAG &DAG, unsigned Depth,
6120 bool ResolveKnownElts) {
6121 Mask.clear();
6122 Ops.clear();
6123
6124 MVT VT = N.getSimpleValueType();
6125 unsigned NumElts = VT.getVectorNumElements();
6126 unsigned NumSizeInBits = VT.getSizeInBits();
6127 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6128 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6129 return false;
6130 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6131 unsigned NumSizeInBytes = NumSizeInBits / 8;
6132 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6133
6134 unsigned Opcode = N.getOpcode();
6135 switch (Opcode) {
6136 case ISD::VECTOR_SHUFFLE: {
6137 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6138 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6139 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6140 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6141 Ops.push_back(N.getOperand(0));
6142 Ops.push_back(N.getOperand(1));
6143 return true;
6144 }
6145 return false;
6146 }
6147 case ISD::AND:
6148 case X86ISD::ANDNP: {
6149 // Attempt to decode as a per-byte mask.
6150 APInt UndefElts;
6151 SmallVector<APInt, 32> EltBits;
6152 SDValue N0 = N.getOperand(0);
6153 SDValue N1 = N.getOperand(1);
6154 bool IsAndN = (X86ISD::ANDNP == Opcode);
6155 uint64_t ZeroMask = IsAndN ? 255 : 0;
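// A constant byte equal to ZeroMask forces the result byte to zero; the
// opposite value (0xFF for AND, 0x00 for ANDNP) passes the other operand's
// byte through unchanged.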
6156 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6157 /*AllowWholeUndefs*/ false,
6158 /*AllowPartialUndefs*/ false))
6159 return false;
6160 // We can't assume an undef src element gives an undef dst - the other src
6161 // might be zero.
6162 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6163 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6164 const APInt &ByteBits = EltBits[i];
6165 if (ByteBits != 0 && ByteBits != 255)
6166 return false;
6167 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6168 }
6169 Ops.push_back(IsAndN ? N1 : N0);
6170 return true;
6171 }
6172 case ISD::OR: {
6173 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6174 // is a valid shuffle index.
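// Since OR with zero is a no-op, each element can be taken from the operand
// whose counterpart element is known to be zero, turning the OR into a blend
// of the two source shuffles.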
6175 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6176 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6177 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6178 return false;
6179
6180 SmallVector<int, 64> SrcMask0, SrcMask1;
6181 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6182 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
6183 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
6184 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6185 Depth + 1, true) ||
6186 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6187 Depth + 1, true))
6188 return false;
6189
6190 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6191 SmallVector<int, 64> Mask0, Mask1;
6192 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6193 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6194 for (int i = 0; i != (int)MaskSize; ++i) {
6195 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6196 // loops converting between OR and BLEND shuffles due to
6197 // canWidenShuffleElements merging away undef elements, meaning we
6198 // fail to recognise the OR as the undef element isn't known zero.
6199 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6200 Mask.push_back(SM_SentinelZero);
6201 else if (Mask1[i] == SM_SentinelZero)
6202 Mask.push_back(i);
6203 else if (Mask0[i] == SM_SentinelZero)
6204 Mask.push_back(i + MaskSize);
6205 else
6206 return false;
6207 }
6208 Ops.push_back(N.getOperand(0));
6209 Ops.push_back(N.getOperand(1));
6210 return true;
6211 }
6212 case ISD::CONCAT_VECTORS: {
6213 // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6214 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6215 if (NumBitsPerElt == 64) {
6216 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6217 for (unsigned M = 0; M != NumSubElts; ++M)
6218 Mask.push_back((I * NumElts) + M);
6219 Ops.push_back(N.getOperand(I));
6220 }
6221 return true;
6222 }
6223 return false;
6224 }
6225 case ISD::INSERT_SUBVECTOR: {
6226 SDValue Src = N.getOperand(0);
6227 SDValue Sub = N.getOperand(1);
6228 EVT SubVT = Sub.getValueType();
6229 unsigned NumSubElts = SubVT.getVectorNumElements();
6230 uint64_t InsertIdx = N.getConstantOperandVal(2);
6231 // Subvector isn't demanded - just return the base vector.
6232 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6233 Mask.resize(NumElts);
6234 std::iota(Mask.begin(), Mask.end(), 0);
6235 Ops.push_back(Src);
6236 return true;
6237 }
6238 // Handle CONCAT(SUB0, SUB1).
6239 // Limit to vXi64/splat cases to make the most of cross lane shuffles.
6240 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6241 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6242 Src.getOperand(0).isUndef() &&
6243 Src.getOperand(1).getValueType() == SubVT &&
6244 Src.getConstantOperandVal(2) == 0 &&
6245 (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6246 SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
6247 Mask.resize(NumElts);
6248 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6249 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6250 Ops.push_back(Src.getOperand(1));
6251 Ops.push_back(Sub);
6252 return true;
6253 }
6254 if (!N->isOnlyUserOf(Sub.getNode()))
6255 return false;
6256
6257 SmallVector<int, 64> SubMask;
6258 SmallVector<SDValue, 2> SubInputs;
6259 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
6260 EVT SubSrcVT = SubSrc.getValueType();
6261 if (!SubSrcVT.isVector())
6262 return false;
6263
6264 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6265 if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6266 SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6267 uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
6268 SDValue SubSrcSrc = SubSrc.getOperand(0);
6269 unsigned NumSubSrcSrcElts =
6270 SubSrcSrc.getValueType().getVectorNumElements();
6271 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6272 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6273 "Subvector valuetype mismatch");
6274 InsertIdx *= (MaxElts / NumElts);
6275 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6276 NumSubElts *= (MaxElts / NumElts);
6277 bool SrcIsUndef = Src.isUndef();
6278 for (int i = 0; i != (int)MaxElts; ++i)
6279 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6280 for (int i = 0; i != (int)NumSubElts; ++i)
6281 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6282 if (!SrcIsUndef)
6283 Ops.push_back(Src);
6284 Ops.push_back(SubSrcSrc);
6285 return true;
6286 }
6287
6288 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6289 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6290 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6291 Depth + 1, ResolveKnownElts))
6292 return false;
6293
6294 // Subvector shuffle inputs must not be larger than the subvector.
6295 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6296 return SubVT.getFixedSizeInBits() <
6297 SubInput.getValueSizeInBits().getFixedValue();
6298 }))
6299 return false;
6300
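// Bring the subvector mask and the outer mask to the same element
// granularity before merging them below.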
6301 if (SubMask.size() != NumSubElts) {
6302 assert(((SubMask.size() % NumSubElts) == 0 ||
6303 (NumSubElts % SubMask.size()) == 0) &&
6304 "Illegal submask scale");
6305 if ((NumSubElts % SubMask.size()) == 0) {
6306 int Scale = NumSubElts / SubMask.size();
6307 SmallVector<int, 64> ScaledSubMask;
6308 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6309 SubMask = ScaledSubMask;
6310 } else {
6311 int Scale = SubMask.size() / NumSubElts;
6312 NumSubElts = SubMask.size();
6313 NumElts *= Scale;
6314 InsertIdx *= Scale;
6315 }
6316 }
6317 Ops.push_back(Src);
6318 Ops.append(SubInputs.begin(), SubInputs.end());
6319 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6320 Mask.append(NumElts, SM_SentinelZero);
6321 else
6322 for (int i = 0; i != (int)NumElts; ++i)
6323 Mask.push_back(i);
6324 for (int i = 0; i != (int)NumSubElts; ++i) {
6325 int M = SubMask[i];
6326 if (0 <= M) {
6327 int InputIdx = M / NumSubElts;
6328 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6329 }
6330 Mask[i + InsertIdx] = M;
6331 }
6332 return true;
6333 }
6334 case X86ISD::PINSRB:
6335 case X86ISD::PINSRW:
6336 case ISD::SCALAR_TO_VECTOR:
6337 case ISD::INSERT_VECTOR_ELT: {
6338 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6339 // vector, for matching src/dst vector types.
6340 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6341
6342 unsigned DstIdx = 0;
6343 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6344 // Check we have an in-range constant insertion index.
6345 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6346 N.getConstantOperandAPInt(2).uge(NumElts))
6347 return false;
6348 DstIdx = N.getConstantOperandVal(2);
6349
6350 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6351 if (X86::isZeroNode(Scl)) {
6352 Ops.push_back(N.getOperand(0));
6353 for (unsigned i = 0; i != NumElts; ++i)
6354 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6355 return true;
6356 }
6357 }
6358
6359 // Peek through trunc/aext/zext/bitcast.
6360 // TODO: aext shouldn't require SM_SentinelZero padding.
6361 // TODO: handle shift of scalars.
6362 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6363 while (Scl.getOpcode() == ISD::TRUNCATE ||
6364 Scl.getOpcode() == ISD::ANY_EXTEND ||
6365 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6366 (Scl.getOpcode() == ISD::BITCAST &&
6367 Scl.getScalarValueSizeInBits() ==
6368 Scl.getOperand(0).getScalarValueSizeInBits())) {
6369 Scl = Scl.getOperand(0);
6370 MinBitsPerElt =
6371 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6372 }
6373 if ((MinBitsPerElt % 8) != 0)
6374 return false;
6375
6376 // Attempt to find the source vector the scalar was extracted from.
6377 SDValue SrcExtract;
6378 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6379 Scl.getOpcode() == X86ISD::PEXTRW ||
6380 Scl.getOpcode() == X86ISD::PEXTRB) &&
6381 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6382 SrcExtract = Scl;
6383 }
6384 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6385 return false;
6386
6387 SDValue SrcVec = SrcExtract.getOperand(0);
6388 EVT SrcVT = SrcVec.getValueType();
6389 if (!SrcVT.getScalarType().isByteSized())
6390 return false;
6391 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6392 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6393 unsigned DstByte = DstIdx * NumBytesPerElt;
6394 MinBitsPerElt =
6395 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6396
6397 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6398 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6399 Ops.push_back(SrcVec);
6400 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6401 } else {
6402 Ops.push_back(SrcVec);
6403 Ops.push_back(N.getOperand(0));
6404 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6405 Mask.push_back(NumSizeInBytes + i);
6406 }
6407
6408 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6409 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6410 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6411 Mask[DstByte + i] = SrcByte + i;
6412 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6413 Mask[DstByte + i] = SM_SentinelZero;
6414 return true;
6415 }
6416 case X86ISD::PACKSS:
6417 case X86ISD::PACKUS: {
6418 SDValue N0 = N.getOperand(0);
6419 SDValue N1 = N.getOperand(1);
6420 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6421 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6422 "Unexpected input value type");
6423
6424 APInt EltsLHS, EltsRHS;
6425 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6426
6427 // If we know input saturation won't happen (or we don't care for particular
6428 // lanes), we can treat this as a truncation shuffle.
6429 bool Offset0 = false, Offset1 = false;
6430 if (Opcode == X86ISD::PACKSS) {
6431 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6432 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6433 (!(N1.isUndef() || EltsRHS.isZero()) &&
6434 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6435 return false;
6436 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6437 // PACKSS then it was likely being used for sign-extension for a
6438 // truncation, so just peek through and adjust the mask accordingly.
6439 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6440 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6441 Offset0 = true;
6442 N0 = N0.getOperand(0);
6443 }
6444 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6445 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6446 Offset1 = true;
6447 N1 = N1.getOperand(0);
6448 }
6449 } else {
6450 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6451 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6452 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6453 (!(N1.isUndef() || EltsRHS.isZero()) &&
6454 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6455 return false;
6456 }
6457
6458 bool IsUnary = (N0 == N1);
6459
6460 Ops.push_back(N0);
6461 if (!IsUnary)
6462 Ops.push_back(N1);
6463
6464 createPackShuffleMask(VT, Mask, IsUnary);
6465
6466 if (Offset0 || Offset1) {
6467 for (int &M : Mask)
6468 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6469 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6470 ++M;
6471 }
6472 return true;
6473 }
6474 case ISD::VSELECT:
6475 case X86ISD::BLENDV: {
6476 SDValue Cond = N.getOperand(0);
6477 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6478 Ops.push_back(N.getOperand(1));
6479 Ops.push_back(N.getOperand(2));
6480 return true;
6481 }
6482 return false;
6483 }
6484 case X86ISD::VTRUNC: {
6485 SDValue Src = N.getOperand(0);
6486 EVT SrcVT = Src.getValueType();
6487 if (SrcVT.getSizeInBits() != NumSizeInBits)
6488 return false;
6489 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6490 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6491 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6492 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6493 for (unsigned i = 0; i != NumSrcElts; ++i)
6494 Mask.push_back(i * Scale);
6495 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6496 Ops.push_back(Src);
6497 return true;
6498 }
6499 case ISD::SHL:
6500 case ISD::SRL: {
6501 APInt UndefElts;
6502 SmallVector<APInt, 32> EltBits;
6503 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6504 UndefElts, EltBits,
6505 /*AllowWholeUndefs*/ true,
6506 /*AllowPartialUndefs*/ false))
6507 return false;
6508
6509 // We can only decode 'whole byte' bit shifts as shuffles.
6510 for (unsigned I = 0; I != NumElts; ++I)
6511 if (DemandedElts[I] && !UndefElts[I] &&
6512 (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6513 return false;
6514
6515 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6516 Ops.push_back(N.getOperand(0));
6517
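// For each demanded element, zero out its bytes and then write the byte
// indices shifted left (SHL) or right (SRL) by the element's byte shift.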
6518 for (unsigned I = 0; I != NumElts; ++I) {
6519 if (!DemandedElts[I] || UndefElts[I])
6520 continue;
6521 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6522 unsigned Lo = I * NumBytesPerElt;
6523 unsigned Hi = Lo + NumBytesPerElt;
6524 // Clear mask to all zeros and insert the shifted byte indices.
6525 std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6526 if (ISD::SHL == Opcode)
6527 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6528 else
6529 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6530 Lo + ByteShift);
6531 }
6532 return true;
6533 }
6534 case X86ISD::VSHLI:
6535 case X86ISD::VSRLI: {
6536 uint64_t ShiftVal = N.getConstantOperandVal(1);
6537 // Out of range bit shifts are guaranteed to be zero.
6538 if (NumBitsPerElt <= ShiftVal) {
6539 Mask.append(NumElts, SM_SentinelZero);
6540 return true;
6541 }
6542
6543 // We can only decode 'whole byte' bit shifts as shuffles.
6544 if ((ShiftVal % 8) != 0)
6545 break;
6546
6547 uint64_t ByteShift = ShiftVal / 8;
6548 Ops.push_back(N.getOperand(0));
6549
6550 // Clear mask to all zeros and insert the shifted byte indices.
6551 Mask.append(NumSizeInBytes, SM_SentinelZero);
6552
6553 if (X86ISD::VSHLI == Opcode) {
6554 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6555 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6556 Mask[i + j] = i + j - ByteShift;
6557 } else {
6558 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6559 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6560 Mask[i + j - ByteShift] = i + j;
6561 }
6562 return true;
6563 }
6564 case X86ISD::VROTLI:
6565 case X86ISD::VROTRI: {
6566 // We can only decode 'whole byte' bit rotates as shuffles.
6567 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6568 if ((RotateVal % 8) != 0)
6569 return false;
6570 Ops.push_back(N.getOperand(0));
6571 int Offset = RotateVal / 8;
6572 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
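// As a byte gather, result byte j reads source byte (j + Offset) % NumBytesPerElt
// within the element: a right rotate uses the byte offset directly, a left
// rotate its complement.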
6573 for (int i = 0; i != (int)NumElts; ++i) {
6574 int BaseIdx = i * NumBytesPerElt;
6575 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6576 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6577 }
6578 }
6579 return true;
6580 }
6581 case X86ISD::VBROADCAST: {
6582 SDValue Src = N.getOperand(0);
6583 if (!Src.getSimpleValueType().isVector()) {
6584 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6585 !isNullConstant(Src.getOperand(1)) ||
6586 Src.getOperand(0).getValueType().getScalarType() !=
6587 VT.getScalarType())
6588 return false;
6589 Src = Src.getOperand(0);
6590 }
6591 Ops.push_back(Src);
6592 Mask.append(NumElts, 0);
6593 return true;
6594 }
6595 case ISD::SIGN_EXTEND_VECTOR_INREG: {
6596 SDValue Src = N.getOperand(0);
6597 EVT SrcVT = Src.getValueType();
6598 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6599
6600 // Extended source must be a simple vector.
6601 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6602 (NumBitsPerSrcElt % 8) != 0)
6603 return false;
6604
6605 // We can only handle all-signbits extensions.
6606 APInt DemandedSrcElts =
6607 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6608 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6609 return false;
6610
6611 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6612 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6613 for (unsigned I = 0; I != NumElts; ++I)
6614 Mask.append(Scale, I);
6615 Ops.push_back(Src);
6616 return true;
6617 }
6618 case ISD::ZERO_EXTEND:
6619 case ISD::ANY_EXTEND:
6620 case ISD::ZERO_EXTEND_VECTOR_INREG:
6621 case ISD::ANY_EXTEND_VECTOR_INREG: {
6622 SDValue Src = N.getOperand(0);
6623 EVT SrcVT = Src.getValueType();
6624
6625 // Extended source must be a simple vector.
6626 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6627 (SrcVT.getScalarSizeInBits() % 8) != 0)
6628 return false;
6629
6630 bool IsAnyExtend =
6631 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6632 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6633 IsAnyExtend, Mask);
6634 Ops.push_back(Src);
6635 return true;
6636 }
6637 }
6638
6639 return false;
6640}
6641
6642/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
6643 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6644 SmallVectorImpl<int> &Mask) {
6645 int MaskWidth = Mask.size();
6646 SmallVector<SDValue, 16> UsedInputs;
6647 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6648 int lo = UsedInputs.size() * MaskWidth;
6649 int hi = lo + MaskWidth;
6650
6651 // Strip UNDEF input usage.
6652 if (Inputs[i].isUndef())
6653 for (int &M : Mask)
6654 if ((lo <= M) && (M < hi))
6655 M = SM_SentinelUndef;
6656
6657 // Check for unused inputs.
6658 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6659 for (int &M : Mask)
6660 if (lo <= M)
6661 M -= MaskWidth;
6662 continue;
6663 }
6664
6665 // Check for repeated inputs.
6666 bool IsRepeat = false;
6667 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6668 if (UsedInputs[j] != Inputs[i])
6669 continue;
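// Remap indices of this duplicate input onto the earlier copy, and shift
// down indices that referenced later inputs to account for the removed slot.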
6670 for (int &M : Mask)
6671 if (lo <= M)
6672 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6673 IsRepeat = true;
6674 break;
6675 }
6676 if (IsRepeat)
6677 continue;
6678
6679 UsedInputs.push_back(Inputs[i]);
6680 }
6681 Inputs = UsedInputs;
6682}
6683
6684/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6685/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6686/// Returns true if the target shuffle mask was decoded.
6687static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6688 SmallVectorImpl<SDValue> &Inputs,
6689 SmallVectorImpl<int> &Mask,
6690 APInt &KnownUndef, APInt &KnownZero,
6691 const SelectionDAG &DAG, unsigned Depth,
6692 bool ResolveKnownElts) {
6693 if (Depth >= SelectionDAG::MaxRecursionDepth)
6694 return false; // Limit search depth.
6695
6696 EVT VT = Op.getValueType();
6697 if (!VT.isSimple() || !VT.isVector())
6698 return false;
6699
6700 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6701 if (ResolveKnownElts)
6702 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6703 return true;
6704 }
6705 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6706 ResolveKnownElts)) {
6707 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6708 return true;
6709 }
6710 return false;
6711}
6712
6713static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6714 SmallVectorImpl<SDValue> &Inputs,
6715 SmallVectorImpl<int> &Mask,
6716 const SelectionDAG &DAG, unsigned Depth,
6717 bool ResolveKnownElts) {
6718 APInt KnownUndef, KnownZero;
6719 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6720 KnownZero, DAG, Depth, ResolveKnownElts);
6721}
6722
6723 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6724 SmallVectorImpl<int> &Mask,
6725 const SelectionDAG &DAG, unsigned Depth = 0,
6726 bool ResolveKnownElts = true) {
6727 EVT VT = Op.getValueType();
6728 if (!VT.isSimple() || !VT.isVector())
6729 return false;
6730
6731 unsigned NumElts = Op.getValueType().getVectorNumElements();
6732 APInt DemandedElts = APInt::getAllOnes(NumElts);
6733 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6734 ResolveKnownElts);
6735}
6736
6737// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6738static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6739 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6740 SelectionDAG &DAG) {
6741 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6742 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6743 "Unknown broadcast load type");
6744
6745 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6746 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6747 return SDValue();
6748
6749 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6750 TypeSize::getFixed(Offset), DL);
6751 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6752 SDValue Ops[] = {Mem->getChain(), Ptr};
6753 SDValue BcstLd = DAG.getMemIntrinsicNode(
6754 Opcode, DL, Tys, Ops, MemVT,
6755 DAG.getMachineFunction().getMachineMemOperand(
6756 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6757 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6758 return BcstLd;
6759}
6760
6761/// Returns the scalar element that will make up the i'th
6762/// element of the result of the vector shuffle.
6763static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6764 SelectionDAG &DAG, unsigned Depth) {
6765 if (Depth >= SelectionDAG::MaxRecursionDepth)
6766 return SDValue(); // Limit search depth.
6767
6768 EVT VT = Op.getValueType();
6769 unsigned Opcode = Op.getOpcode();
6770 unsigned NumElems = VT.getVectorNumElements();
6771
6772 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6773 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6774 int Elt = SV->getMaskElt(Index);
6775
6776 if (Elt < 0)
6777 return DAG.getUNDEF(VT.getVectorElementType());
6778
6779 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6780 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6781 }
6782
6783 // Recurse into target specific vector shuffles to find scalars.
6784 if (isTargetShuffle(Opcode)) {
6785 MVT ShufVT = VT.getSimpleVT();
6786 MVT ShufSVT = ShufVT.getVectorElementType();
6787 int NumElems = (int)ShufVT.getVectorNumElements();
6788 SmallVector<int, 16> ShuffleMask;
6789 SmallVector<SDValue, 16> ShuffleOps;
6790 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6791 return SDValue();
6792
6793 int Elt = ShuffleMask[Index];
6794 if (Elt == SM_SentinelZero)
6795 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6796 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6797 if (Elt == SM_SentinelUndef)
6798 return DAG.getUNDEF(ShufSVT);
6799
6800 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6801 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6802 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6803 }
6804
6805 // Recurse into insert_subvector base/sub vector to find scalars.
6806 if (Opcode == ISD::INSERT_SUBVECTOR) {
6807 SDValue Vec = Op.getOperand(0);
6808 SDValue Sub = Op.getOperand(1);
6809 uint64_t SubIdx = Op.getConstantOperandVal(2);
6810 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6811
6812 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6813 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6814 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6815 }
6816
6817 // Recurse into concat_vectors sub vector to find scalars.
6818 if (Opcode == ISD::CONCAT_VECTORS) {
6819 EVT SubVT = Op.getOperand(0).getValueType();
6820 unsigned NumSubElts = SubVT.getVectorNumElements();
6821 uint64_t SubIdx = Index / NumSubElts;
6822 uint64_t SubElt = Index % NumSubElts;
6823 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6824 }
6825
6826 // Recurse into extract_subvector src vector to find scalars.
6827 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6828 SDValue Src = Op.getOperand(0);
6829 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6830 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6831 }
6832
6833 // We only peek through bitcasts of the same vector width.
6834 if (Opcode == ISD::BITCAST) {
6835 SDValue Src = Op.getOperand(0);
6836 EVT SrcVT = Src.getValueType();
6837 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6838 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6839 return SDValue();
6840 }
6841
6842 // Actual nodes that may contain scalar elements
6843
6844 // For insert_vector_elt - either return the index matching scalar or recurse
6845 // into the base vector.
6846 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6847 isa<ConstantSDNode>(Op.getOperand(2))) {
6848 if (Op.getConstantOperandAPInt(2) == Index)
6849 return Op.getOperand(1);
6850 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6851 }
6852
6853 if (Opcode == ISD::SCALAR_TO_VECTOR)
6854 return (Index == 0) ? Op.getOperand(0)
6855 : DAG.getUNDEF(VT.getVectorElementType());
6856
6857 if (Opcode == ISD::BUILD_VECTOR)
6858 return Op.getOperand(Index);
6859
6860 return SDValue();
6861}
6862
6863// Use PINSRB/PINSRW/PINSRD to create a build vector.
6864 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6865 const APInt &NonZeroMask,
6866 unsigned NumNonZero, unsigned NumZero,
6867 SelectionDAG &DAG,
6868 const X86Subtarget &Subtarget) {
6869 MVT VT = Op.getSimpleValueType();
6870 unsigned NumElts = VT.getVectorNumElements();
6871 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6872 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6873 "Illegal vector insertion");
6874
6875 SDValue V;
6876 bool First = true;
6877
6878 for (unsigned i = 0; i < NumElts; ++i) {
6879 bool IsNonZero = NonZeroMask[i];
6880 if (!IsNonZero)
6881 continue;
6882
6883 // If the build vector contains zeros or our first insertion is not the
6884 // first index, then insert into a zero vector to break any register
6885 // dependency; otherwise use SCALAR_TO_VECTOR.
6886 if (First) {
6887 First = false;
6888 if (NumZero || 0 != i)
6889 V = getZeroVector(VT, Subtarget, DAG, DL);
6890 else {
6891 assert(0 == i && "Expected insertion into zero-index");
6892 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6893 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6894 V = DAG.getBitcast(VT, V);
6895 continue;
6896 }
6897 }
6898 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6899 DAG.getVectorIdxConstant(i, DL));
6900 }
6901
6902 return V;
6903}
6904
6905/// Custom lower build_vector of v16i8.
6906 static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6907 const APInt &NonZeroMask,
6908 unsigned NumNonZero, unsigned NumZero,
6909 SelectionDAG &DAG,
6910 const X86Subtarget &Subtarget) {
6911 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6912 return SDValue();
6913
6914 // SSE4.1 - use PINSRB to insert each byte directly.
6915 if (Subtarget.hasSSE41())
6916 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6917 DAG, Subtarget);
6918
6919 SDValue V;
6920
6921 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6922 // If both of the two lowest byte pairs are non-zero, then convert to MOVD.
6923 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6924 !NonZeroMask.extractBits(2, 2).isZero()) {
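// OR the low four bytes into a single i32, move it into the vector with
// VZEXT_MOVL (zeroing the remaining elements), and continue merging byte
// pairs from byte 4 onwards.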
6925 for (unsigned I = 0; I != 4; ++I) {
6926 if (!NonZeroMask[I])
6927 continue;
6928 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6929 if (I != 0)
6930 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6931 DAG.getConstant(I * 8, DL, MVT::i8));
6932 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6933 }
6934 assert(V && "Failed to fold v16i8 vector to zero");
6935 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6936 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6937 V = DAG.getBitcast(MVT::v8i16, V);
6938 }
6939 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6940 bool ThisIsNonZero = NonZeroMask[i];
6941 bool NextIsNonZero = NonZeroMask[i + 1];
6942 if (!ThisIsNonZero && !NextIsNonZero)
6943 continue;
6944
6945 SDValue Elt;
6946 if (ThisIsNonZero) {
6947 if (NumZero || NextIsNonZero)
6948 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6949 else
6950 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6951 }
6952
6953 if (NextIsNonZero) {
6954 SDValue NextElt = Op.getOperand(i + 1);
6955 if (i == 0 && NumZero)
6956 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6957 else
6958 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6959 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6960 DAG.getConstant(8, DL, MVT::i8));
6961 if (ThisIsNonZero)
6962 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6963 else
6964 Elt = NextElt;
6965 }
6966
6967 // If our first insertion is not the first index or zeros are needed, then
6968 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6969 // elements undefined).
6970 if (!V) {
6971 if (i != 0 || NumZero)
6972 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6973 else {
6974 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6975 V = DAG.getBitcast(MVT::v8i16, V);
6976 continue;
6977 }
6978 }
6979 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6980 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6981 DAG.getVectorIdxConstant(i / 2, DL));
6982 }
6983
6984 return DAG.getBitcast(MVT::v16i8, V);
6985}
6986
6987/// Custom lower build_vector of v8i16.
6988static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6989 const APInt &NonZeroMask,
6990 unsigned NumNonZero, unsigned NumZero,
6991 SelectionDAG &DAG,
6992 const X86Subtarget &Subtarget) {
6993 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6994 return SDValue();
6995
6996 // Use PINSRW to insert each byte directly.
6997 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6998 Subtarget);
6999}
7000
7001/// Custom lower build_vector of v4i32 or v4f32.
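/// e.g. the splat-of-a-pair pattern <a,b,a,b> is lowered to a MOVDDUP of the
/// v2f64 bitcast of <a,b,undef,undef>; otherwise the build_vector is matched
/// as a blend with zero or a single INSERTPS.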
7002static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
7003 SelectionDAG &DAG,
7004 const X86Subtarget &Subtarget) {
7005 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
7006 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
7007 // Because we're creating a less complicated build vector here, we may enable
7008 // further folding of the MOVDDUP via shuffle transforms.
7009 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
7010 Op.getOperand(0) == Op.getOperand(2) &&
7011 Op.getOperand(1) == Op.getOperand(3) &&
7012 Op.getOperand(0) != Op.getOperand(1)) {
7013 MVT VT = Op.getSimpleValueType();
7014 MVT EltVT = VT.getVectorElementType();
7015 // Create a new build vector with the first 2 elements followed by undef
7016 // padding, bitcast to v2f64, duplicate, and bitcast back.
7017 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7018 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7019 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7020 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7021 return DAG.getBitcast(VT, Dup);
7022 }
7023
7024 // Find all zeroable elements.
7025 std::bitset<4> Zeroable, Undefs;
7026 for (int i = 0; i < 4; ++i) {
7027 SDValue Elt = Op.getOperand(i);
7028 Undefs[i] = Elt.isUndef();
7029 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7030 }
7031 assert(Zeroable.size() - Zeroable.count() > 1 &&
7032 "We expect at least two non-zero elements!");
7033
7034 // We only know how to deal with build_vector nodes where elements are either
7035 // zeroable or extract_vector_elt with constant index.
7036 SDValue FirstNonZero;
7037 unsigned FirstNonZeroIdx;
7038 for (unsigned i = 0; i < 4; ++i) {
7039 if (Zeroable[i])
7040 continue;
7041 SDValue Elt = Op.getOperand(i);
7042 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7043 !isa<ConstantSDNode>(Elt.getOperand(1)))
7044 return SDValue();
7045 // Make sure that this node is extracting from a 128-bit vector.
7046 MVT VT = Elt.getOperand(0).getSimpleValueType();
7047 if (!VT.is128BitVector())
7048 return SDValue();
7049 if (!FirstNonZero.getNode()) {
7050 FirstNonZero = Elt;
7051 FirstNonZeroIdx = i;
7052 }
7053 }
7054
7055 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7056 SDValue V1 = FirstNonZero.getOperand(0);
7057 MVT VT = V1.getSimpleValueType();
7058
7059 // See if this build_vector can be lowered as a blend with zero.
7060 SDValue Elt;
7061 unsigned EltMaskIdx, EltIdx;
7062 int Mask[4];
7063 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7064 if (Zeroable[EltIdx]) {
7065 // The zero vector will be on the right hand side.
7066 Mask[EltIdx] = EltIdx+4;
7067 continue;
7068 }
7069
7070 Elt = Op->getOperand(EltIdx);
7071 // By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
7072 EltMaskIdx = Elt.getConstantOperandVal(1);
7073 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7074 break;
7075 Mask[EltIdx] = EltIdx;
7076 }
7077
7078 if (EltIdx == 4) {
7079 // Let the shuffle legalizer deal with blend operations.
7080 SDValue VZeroOrUndef = (Zeroable == Undefs)
7081 ? DAG.getUNDEF(VT)
7082 : getZeroVector(VT, Subtarget, DAG, DL);
7083 if (V1.getSimpleValueType() != VT)
7084 V1 = DAG.getBitcast(VT, V1);
7085 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7086 }
7087
7088 // See if we can lower this build_vector to an INSERTPS.
7089 if (!Subtarget.hasSSE41())
7090 return SDValue();
7091
7092 SDValue V2 = Elt.getOperand(0);
7093 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7094 V1 = SDValue();
7095
7096 bool CanFold = true;
7097 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7098 if (Zeroable[i])
7099 continue;
7100
7101 SDValue Current = Op->getOperand(i);
7102 SDValue SrcVector = Current->getOperand(0);
7103 if (!V1.getNode())
7104 V1 = SrcVector;
7105 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7106 }
7107
7108 if (!CanFold)
7109 return SDValue();
7110
7111 assert(V1.getNode() && "Expected at least two non-zero elements!");
7112 if (V1.getSimpleValueType() != MVT::v4f32)
7113 V1 = DAG.getBitcast(MVT::v4f32, V1);
7114 if (V2.getSimpleValueType() != MVT::v4f32)
7115 V2 = DAG.getBitcast(MVT::v4f32, V2);
7116
7117 // Ok, we can emit an INSERTPS instruction.
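 // INSERTPS imm8 layout: bits [7:6] select the source element, bits [5:4] the
 // destination element, and bits [3:0] are the zero mask.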
7118 unsigned ZMask = Zeroable.to_ulong();
7119
7120 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7121 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7122 SDValue Result =
7123 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7124 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
7125 return DAG.getBitcast(VT, Result);
7126}
7127
7128/// Return a vector logical shift node.
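/// The shift is emitted as a whole-register byte shift (VSHLDQ/VSRLDQ, i.e.
/// PSLLDQ/PSRLDQ) of the v16i8 bitcast, so NumBits must be a multiple of 8.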
7129static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7130 SelectionDAG &DAG, const TargetLowering &TLI,
7131 const SDLoc &dl) {
7132 assert(VT.is128BitVector() && "Unknown type for VShift");
7133 MVT ShVT = MVT::v16i8;
7134 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7135 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7136 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7137 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7138 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7139}
7140
7141static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
7142 SelectionDAG &DAG) {
7143
7144 // Check if the scalar load can be widened into a vector load. And if
7145 // the address is "base + cst" see if the cst can be "absorbed" into
7146 // the shuffle mask.
7147 if (auto *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7148 SDValue Ptr = LD->getBasePtr();
7149 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7150 return SDValue();
7151 EVT PVT = LD->getValueType(0);
7152 if (PVT != MVT::i32 && PVT != MVT::f32)
7153 return SDValue();
7154
7155 int FI = -1;
7156 int64_t Offset = 0;
7157 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7158 FI = FINode->getIndex();
7159 Offset = 0;
7160 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7161 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7162 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7163 Offset = Ptr.getConstantOperandVal(1);
7164 Ptr = Ptr.getOperand(0);
7165 } else {
7166 return SDValue();
7167 }
7168
7169 // FIXME: 256-bit vector instructions don't require strict alignment;
7170 // improve this code to support it better.
7171 Align RequiredAlign(VT.getSizeInBits() / 8);
7172 SDValue Chain = LD->getChain();
7173 // Make sure the stack object alignment is at least 16 or 32.
7174 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7175 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7176 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7177 if (MFI.isFixedObjectIndex(FI)) {
7178 // Can't change the alignment. FIXME: It's possible to compute
7179 // the exact stack offset and reference FI + adjusted offset instead,
7180 // if someone *really* cares about this. That's the way to implement it.
7181 return SDValue();
7182 } else {
7183 MFI.setObjectAlignment(FI, RequiredAlign);
7184 }
7185 }
7186
7187 // (Offset % 16 or 32) must be a multiple of 4. The address is then
7188 // Ptr + (Offset & ~15).
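 // e.g. with a 16-byte RequiredAlign and Offset == 20, StartOffset becomes 16
 // and the splatted element index EltNo becomes 1.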
7189 if (Offset < 0)
7190 return SDValue();
7191 if ((Offset % RequiredAlign.value()) & 3)
7192 return SDValue();
7193 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7194 if (StartOffset) {
7195 SDLoc DL(Ptr);
7196 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7197 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7198 }
7199
7200 int EltNo = (Offset - StartOffset) >> 2;
7201 unsigned NumElems = VT.getVectorNumElements();
7202
7203 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7204 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7205 LD->getPointerInfo().getWithOffset(StartOffset));
7206
7207 SmallVector<int, 8> Mask(NumElems, EltNo);
7208
7209 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7210 }
7211
7212 return SDValue();
7213}
7214
7215// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
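// e.g. (trunc (srl (load i64 %p), 32)) resolves to that load with a
// ByteOffset of 4.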
7216static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7217 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7218 auto *BaseLd = cast<LoadSDNode>(Elt);
7219 if (!BaseLd->isSimple())
7220 return false;
7221 Ld = BaseLd;
7222 ByteOffset = 0;
7223 return true;
7224 }
7225
7226 switch (Elt.getOpcode()) {
7227 case ISD::BITCAST:
7228 case ISD::TRUNCATE:
7229 case ISD::SCALAR_TO_VECTOR:
7230 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7231 case ISD::SRL:
7232 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7233 uint64_t Amt = AmtC->getZExtValue();
7234 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7235 ByteOffset += Amt / 8;
7236 return true;
7237 }
7238 }
7239 break;
7240 case ISD::EXTRACT_VECTOR_ELT:
7241 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7242 SDValue Src = Elt.getOperand(0);
7243 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7244 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7245 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7246 findEltLoadSrc(Src, Ld, ByteOffset)) {
7247 uint64_t Idx = IdxC->getZExtValue();
7248 ByteOffset += Idx * (SrcSizeInBits / 8);
7249 return true;
7250 }
7251 }
7252 break;
7253 }
7254
7255 return false;
7256}
7257
7258/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7259/// elements can be replaced by a single large load which has the same value as
7260/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7261///
7262/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7263static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
7264 const SDLoc &DL, SelectionDAG &DAG,
7265 const X86Subtarget &Subtarget,
7266 bool IsAfterLegalize) {
7267 if ((VT.getScalarSizeInBits() % 8) != 0)
7268 return SDValue();
7269
7270 unsigned NumElems = Elts.size();
7271
7272 int LastLoadedElt = -1;
7273 APInt LoadMask = APInt::getZero(NumElems);
7274 APInt ZeroMask = APInt::getZero(NumElems);
7275 APInt UndefMask = APInt::getZero(NumElems);
7276
7277 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7278 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7279
7280 // For each element in the initializer, see if we've found a load, zero or an
7281 // undef.
7282 for (unsigned i = 0; i < NumElems; ++i) {
7283 SDValue Elt = peekThroughBitcasts(Elts[i]);
7284 if (!Elt.getNode())
7285 return SDValue();
7286 if (Elt.isUndef()) {
7287 UndefMask.setBit(i);
7288 continue;
7289 }
7290 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
7291 ZeroMask.setBit(i);
7292 continue;
7293 }
7294
7295 // Each loaded element must be the correct fractional portion of the
7296 // requested vector load.
7297 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7298 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7299 return SDValue();
7300
7301 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7302 return SDValue();
7303 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7304 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7305 return SDValue();
7306
7307 LoadMask.setBit(i);
7308 LastLoadedElt = i;
7309 }
7310 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7311 NumElems &&
7312 "Incomplete element masks");
7313
7314 // Handle Special Cases - all undef or undef/zero.
7315 if (UndefMask.popcount() == NumElems)
7316 return DAG.getUNDEF(VT);
7317 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7318 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7319 : DAG.getConstantFP(0.0, DL, VT);
7320
7321 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7322 int FirstLoadedElt = LoadMask.countr_zero();
7323 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7324 EVT EltBaseVT = EltBase.getValueType();
7325 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7326 "Register/Memory size mismatch");
7327 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7328 assert(LDBase && "Did not find base load for merging consecutive loads");
7329 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7330 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7331 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7332 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7333 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7334
7335 // TODO: Support offsetting the base load.
7336 if (ByteOffsets[FirstLoadedElt] != 0)
7337 return SDValue();
7338
7339 // Check to see if the element's load is consecutive to the base load
7340 // or offset from a previous (already checked) load.
7341 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7342 LoadSDNode *Ld = Loads[EltIdx];
7343 int64_t ByteOffset = ByteOffsets[EltIdx];
7344 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7345 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7346 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7347 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7348 }
7349 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7350 EltIdx - FirstLoadedElt);
7351 };
7352
7353 // Consecutive loads can contain UNDEFS but not ZERO elements.
7354 // Consecutive loads with UNDEF and ZERO elements require an
7355 // additional shuffle stage to clear the ZERO elements.
7356 bool IsConsecutiveLoad = true;
7357 bool IsConsecutiveLoadWithZeros = true;
7358 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7359 if (LoadMask[i]) {
7360 if (!CheckConsecutiveLoad(LDBase, i)) {
7361 IsConsecutiveLoad = false;
7362 IsConsecutiveLoadWithZeros = false;
7363 break;
7364 }
7365 } else if (ZeroMask[i]) {
7366 IsConsecutiveLoad = false;
7367 }
7368 }
7369
7370 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7371 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7372 assert(LDBase->isSimple() &&
7373 "Cannot merge volatile or atomic loads.");
7374 SDValue NewLd =
7375 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7376 LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
7377 for (auto *LD : Loads)
7378 if (LD)
7379 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7380 return NewLd;
7381 };
7382
7383 // Check if the base load is entirely dereferenceable.
7384 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7385 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7386
7387 // LOAD - all consecutive load/undefs (must start/end with a load or be
7388 // entirely dereferenceable). If we have found an entire vector of loads and
7389 // undefs, then return a large load of the entire vector width starting at the
7390 // base pointer. If the vector contains zeros, then attempt to shuffle those
7391 // elements.
7392 if (FirstLoadedElt == 0 &&
7393 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7394 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7395 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7396 return SDValue();
7397
7398 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7399 // will lower to regular temporal loads and use the cache.
7400 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7401 VT.is256BitVector() && !Subtarget.hasInt256())
7402 return SDValue();
7403
7404 if (NumElems == 1)
7405 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7406
7407 if (!ZeroMask)
7408 return CreateLoad(VT, LDBase);
7409
7410 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7411 // vector and a zero vector to clear out the zero elements.
7412 if (!IsAfterLegalize && VT.isVector()) {
7413 unsigned NumMaskElts = VT.getVectorNumElements();
7414 if ((NumMaskElts % NumElems) == 0) {
7415 unsigned Scale = NumMaskElts / NumElems;
7416 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7417 for (unsigned i = 0; i < NumElems; ++i) {
7418 if (UndefMask[i])
7419 continue;
7420 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7421 for (unsigned j = 0; j != Scale; ++j)
7422 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7423 }
7424 SDValue V = CreateLoad(VT, LDBase);
7425 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7426 : DAG.getConstantFP(0.0, DL, VT);
7427 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7428 }
7429 }
7430 }
7431
7432 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7433 if (VT.is256BitVector() || VT.is512BitVector()) {
7434 unsigned HalfNumElems = NumElems / 2;
7435 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7436 EVT HalfVT =
7437 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7438 SDValue HalfLD =
7439 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7440 DAG, Subtarget, IsAfterLegalize);
7441 if (HalfLD)
7442 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7443 HalfLD, DAG.getVectorIdxConstant(0, DL));
7444 }
7445 }
7446
7447 // VZEXT_LOAD - consecutive 16/32/64-bit load/undefs followed by zeros/undefs.
7448 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7449 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7450 LoadSizeInBits == 64) &&
7451 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7452 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7453 : MVT::getIntegerVT(LoadSizeInBits);
7454 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7455 // Allow v4f32 on SSE1 only targets.
7456 // FIXME: Add more isel patterns so we can just use VT directly.
7457 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7458 VecVT = MVT::v4f32;
7459 if (TLI.isTypeLegal(VecVT)) {
7460 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7461 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7462 SDValue ResNode = DAG.getMemIntrinsicNode(
7463 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7464 LDBase->getBaseAlign(), MachineMemOperand::MOLoad);
7465 for (auto *LD : Loads)
7466 if (LD)
7467 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7468 return DAG.getBitcast(VT, ResNode);
7469 }
7470 }
7471
7472 // BROADCAST - match the smallest possible repetition pattern, load that
7473 // scalar/subvector element and then broadcast to the entire vector.
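 // e.g. a v8i32 build vector that repeats the pair <a,b> can be loaded as a
 // single 64-bit scalar and broadcast, subject to the AVX/AVX2 legality
 // checks below.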
7474 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7475 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7476 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7477 unsigned RepeatSize = SubElems * BaseSizeInBits;
7478 unsigned ScalarSize = std::min(RepeatSize, 64u);
7479 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7480 continue;
7481
7482 // Don't attempt a 1:N subvector broadcast - it should be caught by
7483 // combineConcatVectorOps, otherwise it will cause infinite loops.
7484 if (RepeatSize > ScalarSize && SubElems == 1)
7485 continue;
7486
7487 bool Match = true;
7488 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7489 for (unsigned i = 0; i != NumElems && Match; ++i) {
7490 if (!LoadMask[i])
7491 continue;
7492 SDValue Elt = peekThroughBitcasts(Elts[i]);
7493 if (RepeatedLoads[i % SubElems].isUndef())
7494 RepeatedLoads[i % SubElems] = Elt;
7495 else
7496 Match &= (RepeatedLoads[i % SubElems] == Elt);
7497 }
7498
7499 // We must have loads at both ends of the repetition.
7500 Match &= !RepeatedLoads.front().isUndef();
7501 Match &= !RepeatedLoads.back().isUndef();
7502 if (!Match)
7503 continue;
7504
7505 EVT RepeatVT =
7506 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7507 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7508 : EVT::getFloatingPointVT(ScalarSize);
7509 if (RepeatSize > ScalarSize)
7510 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7511 RepeatSize / ScalarSize);
7512 EVT BroadcastVT =
7513 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7514 VT.getSizeInBits() / ScalarSize);
7515 if (TLI.isTypeLegal(BroadcastVT)) {
7516 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7517 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7518 SDValue Broadcast = RepeatLoad;
7519 if (RepeatSize > ScalarSize) {
7520 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7521 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7522 } else {
7523 if (!Subtarget.hasAVX2() &&
7524 !X86::mayFoldLoadIntoBroadcastFromMem(
7525 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7526 Subtarget,
7527 /*AssumeSingleUse=*/true))
7528 return SDValue();
7529 Broadcast =
7530 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7531 }
7532 return DAG.getBitcast(VT, Broadcast);
7533 }
7534 }
7535 }
7536 }
7537
7538 return SDValue();
7539}
7540
7541// Combine a vector op (shuffles etc.) that is equal to build_vector load1,
7542// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7543// are consecutive, non-overlapping, and in the right order.
7544static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7545 SelectionDAG &DAG,
7546 const X86Subtarget &Subtarget,
7547 bool IsAfterLegalize) {
7548 SmallVector<SDValue, 64> Elts;
7549 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7550 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7551 Elts.push_back(Elt);
7552 continue;
7553 }
7554 return SDValue();
7555 }
7556 assert(Elts.size() == VT.getVectorNumElements());
7557 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7558 IsAfterLegalize);
7559}
7560
7561static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7562 const APInt &Undefs, LLVMContext &C) {
7563 unsigned ScalarSize = VT.getScalarSizeInBits();
7564 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7565
7566 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7567 if (VT.isFloatingPoint()) {
7568 if (ScalarSize == 16)
7569 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7570 if (ScalarSize == 32)
7571 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7572 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7573 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7574 }
7575 return Constant::getIntegerValue(Ty, Val);
7576 };
7577
7578 SmallVector<Constant *, 32> ConstantVec;
7579 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7580 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7581 : getConstantScalar(Bits[I]));
7582
7583 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7584}
7585
7586static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7587 unsigned SplatBitSize, LLVMContext &C) {
7588 unsigned ScalarSize = VT.getScalarSizeInBits();
7589
7590 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7591 if (VT.isFloatingPoint()) {
7592 if (ScalarSize == 16)
7593 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7594 if (ScalarSize == 32)
7595 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7596 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7597 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7598 }
7599 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7600 };
7601
7602 if (ScalarSize == SplatBitSize)
7603 return getConstantScalar(SplatValue);
7604
7605 unsigned NumElm = SplatBitSize / ScalarSize;
7606 SmallVector<Constant *, 32> ConstantVec;
7607 for (unsigned I = 0; I != NumElm; ++I) {
7608 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7609 ConstantVec.push_back(getConstantScalar(Val));
7610 }
7611 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7612}
7613
7614static bool isFoldableUseOfShuffle(SDNode *N) {
7615 for (auto *U : N->users()) {
7616 unsigned Opc = U->getOpcode();
7617 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7618 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7619 return false;
7620 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7621 return false;
7622 if (isTargetShuffle(Opc))
7623 return true;
7624 if (Opc == ISD::BITCAST) // Ignore bitcasts
7625 return isFoldableUseOfShuffle(U);
7626 if (N->hasOneUse()) {
7627 // TODO: There may be some general way to know if an SDNode can
7628 // be folded. We now only know whether an MI is foldable.
7629 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7630 return false;
7631 return true;
7632 }
7633 }
7634 return false;
7635}
7636
7637// If the node has a single use by a VSELECT then AVX512 targets may be able to
7638// fold as a predicated instruction.
7639static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) {
7640 unsigned SizeInBits = V.getValueSizeInBits();
7641 if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
7642 (SizeInBits >= 128 && Subtarget.hasVLX())) {
7643 if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
7644 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
7645 return true;
7646 }
7647 }
7648 return false;
7649}
7650
7651/// Attempt to use the vbroadcast instruction to generate a splat value
7652/// from a splat BUILD_VECTOR which uses:
7653/// a. A single scalar load, or a constant.
7654/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7655///
7656/// The VBROADCAST node is returned when a pattern is found,
7657/// or SDValue() otherwise.
7658static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7659 const SDLoc &dl,
7660 const X86Subtarget &Subtarget,
7661 SelectionDAG &DAG) {
7662 // VBROADCAST requires AVX.
7663 // TODO: Splats could be generated for non-AVX CPUs using SSE
7664 // instructions, but there's less potential gain for only 128-bit vectors.
7665 if (!Subtarget.hasAVX())
7666 return SDValue();
7667
7668 MVT VT = BVOp->getSimpleValueType(0);
7669 unsigned NumElts = VT.getVectorNumElements();
7670 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7671 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7672 "Unsupported vector type for broadcast.");
7673
7674 // See if the build vector is a repeating sequence of scalars (inc. splat).
7675 SDValue Ld;
7676 BitVector UndefElements;
7677 SmallVector<SDValue, 16> Sequence;
7678 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7679 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7680 if (Sequence.size() == 1)
7681 Ld = Sequence[0];
7682 }
7683
7684 // Attempt to use VBROADCASTM
7685 // From this pattern:
7686 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7687 // b. t1 = (build_vector t0 t0)
7688 //
7689 // Create (VBROADCASTM v2i1 X)
7690 if (!Sequence.empty() && Subtarget.hasCDI()) {
7691 // If not a splat, are the upper sequence values zeroable?
7692 unsigned SeqLen = Sequence.size();
7693 bool UpperZeroOrUndef =
7694 SeqLen == 1 ||
7695 llvm::all_of(ArrayRef(Sequence).drop_front(),
7696 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7697 SDValue Op0 = Sequence[0];
7698 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7699 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7700 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7701 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7702 ? Op0.getOperand(0)
7703 : Op0.getOperand(0).getOperand(0);
7704 MVT MaskVT = BOperand.getSimpleValueType();
7705 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7706 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7707 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7708 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7709 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7710 unsigned Scale = 512 / VT.getSizeInBits();
7711 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7712 }
7713 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7714 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7715 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7716 return DAG.getBitcast(VT, Bcst);
7717 }
7718 }
7719 }
7720
7721 unsigned NumUndefElts = UndefElements.count();
7722 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7723 APInt SplatValue, Undef;
7724 unsigned SplatBitSize;
7725 bool HasUndef;
7726 // Check if this is a repeated constant pattern suitable for broadcasting.
7727 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7728 SplatBitSize > VT.getScalarSizeInBits() &&
7729 SplatBitSize < VT.getSizeInBits()) {
7730 // Avoid replacing with broadcast when it's a use of a shuffle
7731 // instruction to preserve the present custom lowering of shuffles.
7732 if (isFoldableUseOfShuffle(BVOp))
7733 return SDValue();
7734 // replace BUILD_VECTOR with broadcast of the repeated constants.
7735 LLVMContext *Ctx = DAG.getContext();
7736 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7737 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7738 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7739 // Load the constant scalar/subvector and broadcast it.
7740 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7741 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7742 SDValue CP = DAG.getConstantPool(C, PVT);
7743 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7744
7745 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7746 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7747 SDValue Ops[] = {DAG.getEntryNode(), CP};
7748 MachinePointerInfo MPI =
7749 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7750 SDValue Brdcst =
7751 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7752 MPI, Alignment, MachineMemOperand::MOLoad);
7753 return DAG.getBitcast(VT, Brdcst);
7754 }
7755 if (SplatBitSize > 64) {
7756 // Load the vector of constants and broadcast it.
7757 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7758 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7759 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7760 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7761 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7762 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7763 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7764 MachinePointerInfo MPI =
7765 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7766 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7767 Ops, VVT, MPI, Alignment,
7768 MachineMemOperand::MOLoad);
7769 }
7770 }
7771
7772 // If we are moving a scalar into a vector (Ld must be set and all elements
7773 // but 1 are undef) and that operation is not obviously supported by
7774 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7775 // That's better than general shuffling and may eliminate a load to GPR and
7776 // move from scalar to vector register.
7777 if (!Ld || NumElts - NumUndefElts != 1)
7778 return SDValue();
7779 unsigned ScalarSize = Ld.getValueSizeInBits();
7780 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7781 return SDValue();
7782 }
7783
7784 bool ConstSplatVal =
7785 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7786 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7787
7788 // TODO: Handle broadcasts of non-constant sequences.
7789
7790 // Make sure that all of the users of a non-constant load are from the
7791 // BUILD_VECTOR node.
7792 // FIXME: Is the use count needed for non-constant, non-load case?
7793 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7794 return SDValue();
7795
7796 unsigned ScalarSize = Ld.getValueSizeInBits();
7797 bool IsGE256 = (VT.getSizeInBits() >= 256);
7798
7799 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7800 // instruction to save 8 or more bytes of constant pool data.
7801 // TODO: If multiple splats are generated to load the same constant,
7802 // it may be detrimental to overall size. There needs to be a way to detect
7803 // that condition to know if this is truly a size win.
7804 bool OptForSize = DAG.shouldOptForSize();
7805
7806 // Handle broadcasting a single constant scalar from the constant pool
7807 // into a vector.
7808 // On Sandybridge (no AVX2), it is still better to load a constant vector
7809 // from the constant pool and not to broadcast it from a scalar.
7810 // But override that restriction when optimizing for size.
7811 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7812 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7813 EVT CVT = Ld.getValueType();
7814 assert(!CVT.isVector() && "Must not broadcast a vector type");
7815
7816 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7817 // For size optimization, also splat v2f64 and v2i64, and for size opt
7818 // with AVX2, also splat i8 and i16.
7819 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7820 if (ScalarSize == 32 ||
7821 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7822 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7823 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7824 const Constant *C = nullptr;
7825 if (auto *CI = dyn_cast<ConstantSDNode>(Ld))
7826 C = CI->getConstantIntValue();
7827 else if (auto *CF = dyn_cast<ConstantFPSDNode>(Ld))
7828 C = CF->getConstantFPValue();
7829
7830 assert(C && "Invalid constant type");
7831
7832 SDValue CP =
7833 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7834 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7835
7836 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7837 SDValue Ops[] = {DAG.getEntryNode(), CP};
7838 MachinePointerInfo MPI =
7839 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7840 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7841 MPI, Alignment, MachineMemOperand::MOLoad);
7842 }
7843 }
7844
7845 // Handle AVX2 in-register broadcasts.
7846 if (!IsLoad && Subtarget.hasInt256() &&
7847 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7848 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7849
7850 // The scalar source must be a normal load.
7851 if (!IsLoad)
7852 return SDValue();
7853
7854 // Make sure the non-chain result is only used by this build vector.
7855 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7856 return SDValue();
7857
7858 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7859 (Subtarget.hasVLX() && ScalarSize == 64)) {
7860 auto *LN = cast<LoadSDNode>(Ld);
7861 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7862 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7863 SDValue BCast =
7864 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7865 LN->getMemoryVT(), LN->getMemOperand());
7866 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7867 return BCast;
7868 }
7869
7870 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
7871 // match double, since there is no vbroadcastsd xmm.
7872 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7873 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7874 auto *LN = cast<LoadSDNode>(Ld);
7875 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7876 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7877 SDValue BCast =
7878 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7879 LN->getMemoryVT(), LN->getMemOperand());
7880 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7881 return BCast;
7882 }
7883
7884 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7885 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7886
7887 // Unsupported broadcast.
7888 return SDValue();
7889}
7890
7891/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7892/// underlying vector and index.
7893///
7894/// Modifies \p ExtractedFromVec to the real vector and returns the real
7895/// index.
7896static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7897 SDValue ExtIdx) {
7898 int Idx = ExtIdx->getAsZExtVal();
7899 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7900 return Idx;
7901
7902 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7903 // lowered this:
7904 // (extract_vector_elt (v8f32 %1), Constant<6>)
7905 // to:
7906 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7907 // (extract_subvector (v8f32 %0), Constant<4>),
7908 // undef)
7909 // Constant<0>)
7910 // In this case the vector is the extract_subvector expression and the index
7911 // is 2, as specified by the shuffle.
7912 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7913 SDValue ShuffleVec = SVOp->getOperand(0);
7914 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7915 assert(ShuffleVecVT.getVectorElementType() ==
7916 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7917
7918 int ShuffleIdx = SVOp->getMaskElt(Idx);
7919 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7920 ExtractedFromVec = ShuffleVec;
7921 return ShuffleIdx;
7922 }
7923 return Idx;
7924}
7925
7926static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7927 SelectionDAG &DAG) {
7928 MVT VT = Op.getSimpleValueType();
7929
7930 // Skip if insert_vec_elt is not supported.
7931 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7932 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7933 return SDValue();
7934
7935 unsigned NumElems = Op.getNumOperands();
7936 SDValue VecIn1;
7937 SDValue VecIn2;
7938 SmallVector<unsigned, 4> InsertIndices;
7939 SmallVector<int, 8> Mask(NumElems, -1);
7940
7941 for (unsigned i = 0; i != NumElems; ++i) {
7942 unsigned Opc = Op.getOperand(i).getOpcode();
7943
7944 if (Opc == ISD::UNDEF)
7945 continue;
7946
7947 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7948 // Quit if more than 1 element needs inserting.
7949 if (InsertIndices.size() > 1)
7950 return SDValue();
7951
7952 InsertIndices.push_back(i);
7953 continue;
7954 }
7955
7956 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7957 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7958
7959 // Quit if non-constant index.
7960 if (!isa<ConstantSDNode>(ExtIdx))
7961 return SDValue();
7962 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7963
7964 // Quit if extracted from vector of different type.
7965 if (ExtractedFromVec.getValueType() != VT)
7966 return SDValue();
7967
7968 if (!VecIn1.getNode())
7969 VecIn1 = ExtractedFromVec;
7970 else if (VecIn1 != ExtractedFromVec) {
7971 if (!VecIn2.getNode())
7972 VecIn2 = ExtractedFromVec;
7973 else if (VecIn2 != ExtractedFromVec)
7974 // Quit if more than 2 vectors to shuffle
7975 return SDValue();
7976 }
7977
7978 if (ExtractedFromVec == VecIn1)
7979 Mask[i] = Idx;
7980 else if (ExtractedFromVec == VecIn2)
7981 Mask[i] = Idx + NumElems;
7982 }
7983
7984 if (!VecIn1.getNode())
7985 return SDValue();
7986
7987 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7988 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7989
7990 for (unsigned Idx : InsertIndices)
7991 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7992 DAG.getVectorIdxConstant(Idx, DL));
7993
7994 return NV;
7995}
7996
7997// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
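// bf16 elements are built in the i16 domain (or f16 when FP16 is available)
// and the result is bitcast back to the bf16 vector type.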
7998static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7999 const X86Subtarget &Subtarget) {
8000 MVT VT = Op.getSimpleValueType();
8001 MVT IVT =
8002 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
8003 SmallVector<SDValue, 16> NewOps;
8004 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
8005 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
8006 Op.getOperand(I)));
8007 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
8008 return DAG.getBitcast(VT, Res);
8009}
8010
8011// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
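// Constant elements are packed into an immediate, bitcast to the mask vector
// type, and any remaining non-constant elements are inserted individually;
// splats become a scalar select that is bitcast to the mask vector.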
8012static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
8013 SelectionDAG &DAG,
8014 const X86Subtarget &Subtarget) {
8015
8016 MVT VT = Op.getSimpleValueType();
8017 assert((VT.getVectorElementType() == MVT::i1) &&
8018 "Unexpected type in LowerBUILD_VECTORvXi1!");
8019 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8020 ISD::isBuildVectorAllOnes(Op.getNode()))
8021 return Op;
8022
8023 uint64_t Immediate = 0;
8024 SmallVector<unsigned, 16> NonConstIdx;
8025 bool IsSplat = true;
8026 bool HasConstElts = false;
8027 int SplatIdx = -1;
8028 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8029 SDValue In = Op.getOperand(idx);
8030 if (In.isUndef())
8031 continue;
8032 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
8033 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8034 HasConstElts = true;
8035 } else {
8036 NonConstIdx.push_back(idx);
8037 }
8038 if (SplatIdx < 0)
8039 SplatIdx = idx;
8040 else if (In != Op.getOperand(SplatIdx))
8041 IsSplat = false;
8042 }
8043
8044 // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
8045 if (IsSplat) {
8046 // The build_vector allows the scalar element to be larger than the vector
8047 // element type. We need to mask it to use as a condition unless we know
8048 // the upper bits are zero.
8049 // FIXME: Use computeKnownBits instead of checking specific opcode?
8050 SDValue Cond = Op.getOperand(SplatIdx);
8051 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8052 if (Cond.getOpcode() != ISD::SETCC)
8053 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8054 DAG.getConstant(1, dl, MVT::i8));
8055
8056 // Perform the select in the scalar domain so we can use cmov.
8057 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8058 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8059 DAG.getAllOnesConstant(dl, MVT::i32),
8060 DAG.getConstant(0, dl, MVT::i32));
8061 Select = DAG.getBitcast(MVT::v32i1, Select);
8062 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8063 } else {
8064 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8065 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8066 DAG.getAllOnesConstant(dl, ImmVT),
8067 DAG.getConstant(0, dl, ImmVT));
8068 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8069 Select = DAG.getBitcast(VecVT, Select);
8070 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8071 DAG.getVectorIdxConstant(0, dl));
8072 }
8073 }
8074
8075 // insert elements one by one
8076 SDValue DstVec;
8077 if (HasConstElts) {
8078 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8079 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8080 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8081 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8082 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8083 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8084 } else {
8085 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8086 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8087 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8088 DstVec = DAG.getBitcast(VecVT, Imm);
8089 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8090 DAG.getVectorIdxConstant(0, dl));
8091 }
8092 } else
8093 DstVec = DAG.getUNDEF(VT);
8094
8095 for (unsigned InsertIdx : NonConstIdx) {
8096 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8097 Op.getOperand(InsertIdx),
8098 DAG.getVectorIdxConstant(InsertIdx, dl));
8099 }
8100 return DstVec;
8101}
8102
8103LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
8104 switch (Opcode) {
8105 case X86ISD::PACKSS:
8106 case X86ISD::PACKUS:
8107 case X86ISD::FHADD:
8108 case X86ISD::FHSUB:
8109 case X86ISD::HADD:
8110 case X86ISD::HSUB:
8111 return true;
8112 }
8113 return false;
8114}
8115
8116/// This is a helper function of LowerToHorizontalOp().
8117/// This function checks that the input build_vector \p N implements a
8118/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8119/// may not match the layout of an x86 256-bit horizontal instruction.
8120/// In other words, if this returns true, then some extraction/insertion will
8121/// be required to produce a valid horizontal instruction.
8122///
8123/// Parameter \p Opcode defines the kind of horizontal operation to match.
8124/// For example, if \p Opcode is equal to ISD::ADD, then this function
8125/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8126/// is equal to ISD::SUB, then this function checks if this is a horizontal
8127/// arithmetic sub.
8128///
8129/// This function only analyzes elements of \p N whose indices are
8130/// in range [BaseIdx, LastIdx).
8131///
8132/// TODO: This function was originally used to match both real and fake partial
8133/// horizontal operations, but the index-matching logic is incorrect for that.
8134/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8135/// code because it is only used for partial h-op matching now?
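/// e.g. with Opcode == ISD::ADD and [BaseIdx, LastIdx) covering elements
/// 0..3, the first two elements must be (add (extractelt A, 0), (extractelt
/// A, 1)) and (add (extractelt A, 2), (extractelt A, 3)), and the last two
/// must follow the same pattern extracting from a second vector B.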
8136static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8137 const SDLoc &DL, SelectionDAG &DAG,
8138 unsigned BaseIdx, unsigned LastIdx,
8139 SDValue &V0, SDValue &V1) {
8140 EVT VT = N->getValueType(0);
8141 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8142 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8143 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8144 "Invalid Vector in input!");
8145
8146 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8147 bool CanFold = true;
8148 unsigned ExpectedVExtractIdx = BaseIdx;
8149 unsigned NumElts = LastIdx - BaseIdx;
8150 V0 = DAG.getUNDEF(VT);
8151 V1 = DAG.getUNDEF(VT);
8152
8153 // Check if N implements a horizontal binop.
8154 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8155 SDValue Op = N->getOperand(i + BaseIdx);
8156
8157 // Skip UNDEFs.
8158 if (Op->isUndef()) {
8159 // Update the expected vector extract index.
8160 if (i * 2 == NumElts)
8161 ExpectedVExtractIdx = BaseIdx;
8162 ExpectedVExtractIdx += 2;
8163 continue;
8164 }
8165
8166 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8167
8168 if (!CanFold)
8169 break;
8170
8171 SDValue Op0 = Op.getOperand(0);
8172 SDValue Op1 = Op.getOperand(1);
8173
8174 // Try to match the following pattern:
8175 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8176 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8177 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8178 Op0.getOperand(0) == Op1.getOperand(0) &&
8179 isa<ConstantSDNode>(Op0.getOperand(1)) &&
8180 isa<ConstantSDNode>(Op1.getOperand(1)));
8181 if (!CanFold)
8182 break;
8183
8184 unsigned I0 = Op0.getConstantOperandVal(1);
8185 unsigned I1 = Op1.getConstantOperandVal(1);
8186
8187 if (i * 2 < NumElts) {
8188 if (V0.isUndef()) {
8189 V0 = Op0.getOperand(0);
8190 if (V0.getValueType() != VT)
8191 return false;
8192 }
8193 } else {
8194 if (V1.isUndef()) {
8195 V1 = Op0.getOperand(0);
8196 if (V1.getValueType() != VT)
8197 return false;
8198 }
8199 if (i * 2 == NumElts)
8200 ExpectedVExtractIdx = BaseIdx;
8201 }
8202
8203 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8204 if (I0 == ExpectedVExtractIdx)
8205 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8206 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8207 // Try to match the following dag sequence:
8208 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8209 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8210 } else
8211 CanFold = false;
8212
8213 ExpectedVExtractIdx += 2;
8214 }
8215
8216 return CanFold;
8217}
8218
8219/// Emit a sequence of two 128-bit horizontal add/sub followed by
8220/// a concat_vector.
8221///
8222/// This is a helper function of LowerToHorizontalOp().
8223/// This function expects two 256-bit vectors called V0 and V1.
8224/// At first, each vector is split into two separate 128-bit vectors.
8225/// Then, the resulting 128-bit vectors are used to implement two
8226/// horizontal binary operations.
8227///
8228/// The kind of horizontal binary operation is defined by \p X86Opcode.
8229///
8230/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
8231/// the two new horizontal binops.
8232/// When Mode is set, the first horizontal binop dag node would take as input
8233/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8234/// horizontal binop dag node would take as input the lower 128-bit of V1
8235/// and the upper 128-bit of V1.
8236/// Example:
8237/// HADD V0_LO, V0_HI
8238/// HADD V1_LO, V1_HI
8239///
8240/// Otherwise, the first horizontal binop dag node takes as input the lower
8241/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8242/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8243/// Example:
8244/// HADD V0_LO, V1_LO
8245/// HADD V0_HI, V1_HI
8246///
8247/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8248/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8249/// the upper 128-bits of the result.
8250static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8251 const SDLoc &DL, SelectionDAG &DAG,
8252 unsigned X86Opcode, bool Mode,
8253 bool isUndefLO, bool isUndefHI) {
8254 MVT VT = V0.getSimpleValueType();
8255 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8256 "Invalid nodes in input!");
8257
8258 unsigned NumElts = VT.getVectorNumElements();
8259 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8260 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8261 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8262 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8263 MVT NewVT = V0_LO.getSimpleValueType();
8264
8265 SDValue LO = DAG.getUNDEF(NewVT);
8266 SDValue HI = DAG.getUNDEF(NewVT);
8267
8268 if (Mode) {
8269 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8270 if (!isUndefLO && !V0->isUndef())
8271 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8272 if (!isUndefHI && !V1->isUndef())
8273 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8274 } else {
8275 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8276 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8277 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8278
8279 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8280 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8281 }
8282
8283 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8284}
8285
8286/// Returns true iff \p BV builds a vector with the result equivalent to
8287/// the result of ADDSUB/SUBADD operation.
8288/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8289/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8290/// \p Opnd0 and \p Opnd1.
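/// e.g. <(fsub a0, b0), (fadd a1, b1), (fsub a2, b2), (fadd a3, b3)> matches
/// with Opnd0 = A, Opnd1 = B and IsSubAdd == false; if the even lanes are
/// FADD and the odd lanes FSUB, IsSubAdd is set instead.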
8291static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
8292 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8293 SDValue &Opnd0, SDValue &Opnd1,
8294 unsigned &NumExtracts, bool &IsSubAdd,
8295 bool &HasAllowContract) {
8296 using namespace SDPatternMatch;
8297
8298 MVT VT = BV->getSimpleValueType(0);
8299 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8300 return false;
8301
8302 unsigned NumElts = VT.getVectorNumElements();
8303 SDValue InVec0 = DAG.getUNDEF(VT);
8304 SDValue InVec1 = DAG.getUNDEF(VT);
8305
8306 NumExtracts = 0;
8307 HasAllowContract = NumElts != 0;
8308
8309 // Odd-numbered elements in the input build vector are obtained from
8310 // adding/subtracting two integer/float elements.
8311 // Even-numbered elements in the input build vector are obtained from
8312 // subtracting/adding two integer/float elements.
8313 unsigned Opc[2] = {0, 0};
8314 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8315 SDValue Op = BV->getOperand(i);
8316
8317 // Skip 'undef' values.
8318 unsigned Opcode = Op.getOpcode();
8319 if (Opcode == ISD::UNDEF)
8320 continue;
8321
8322 // Early exit if we found an unexpected opcode.
8323 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8324 return false;
8325
8326 SDValue Op0 = Op.getOperand(0);
8327 SDValue Op1 = Op.getOperand(1);
8328
8329 // Try to match the following pattern:
8330 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8331 // Early exit if we cannot match that sequence.
8332 if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
8333 !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
8334 return false;
8335
8336 // We found a valid add/sub node; make sure it has the same opcode as the
8337 // previous elements of this parity.
8338 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8339 return false;
8340 Opc[i % 2] = Opcode;
8341
8342 // Update InVec0 and InVec1.
8343 if (InVec0.isUndef())
8344 InVec0 = Op0.getOperand(0);
8345 if (InVec1.isUndef())
8346 InVec1 = Op1.getOperand(0);
8347
8348 // Make sure that the input operands of each add/sub node always
8349 // come from the same pair of vectors.
8350 if (InVec0 != Op0.getOperand(0)) {
8351 if (Opcode == ISD::FSUB)
8352 return false;
8353
8354 // FADD is commutable. Try to commute the operands
8355 // and then test again.
8356 std::swap(Op0, Op1);
8357 if (InVec0 != Op0.getOperand(0))
8358 return false;
8359 }
8360
8361 if (InVec1 != Op1.getOperand(0))
8362 return false;
8363
8364 // Increment the number of extractions done.
8365 ++NumExtracts;
8366 HasAllowContract &= Op->getFlags().hasAllowContract();
8367 }
8368
8369 // Ensure we have found an opcode for both parities and that they are
8370 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8371 // inputs are undef.
8372 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8373 InVec0.isUndef() || InVec1.isUndef())
8374 return false;
8375
8376 IsSubAdd = Opc[0] == ISD::FADD;
8377
8378 Opnd0 = InVec0;
8379 Opnd1 = InVec1;
8380 return true;
8381}
8382
8383/// Returns true if it is possible to fold MUL and an idiom that has already been
8384/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8385/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8386/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8387///
8388/// Prior to calling this function it should be known that there is some
8389/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8390/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8391/// before replacement of such SDNode with ADDSUB operation. Thus the number
8392/// of \p Opnd0 uses is expected to be equal to 2.
8393/// For example, this function may be called for the following IR:
8394/// %AB = fmul fast <2 x double> %A, %B
8395/// %Sub = fsub fast <2 x double> %AB, %C
8396/// %Add = fadd fast <2 x double> %AB, %C
8397/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8398/// <2 x i32> <i32 0, i32 3>
8399/// There is a def for %Addsub here, which potentially can be replaced by
8400/// X86ISD::ADDSUB operation:
8401/// %Addsub = X86ISD::ADDSUB %AB, %C
8402/// and such ADDSUB can further be replaced with FMADDSUB:
8403/// %Addsub = FMADDSUB %A, %B, %C.
8404///
8405/// The main reason why this method is called before the replacement of the
8406/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8407/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8408/// FMADDSUB is.
8409static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8410 SelectionDAG &DAG, SDValue &Opnd0,
8411 SDValue &Opnd1, SDValue &Opnd2,
8412 unsigned ExpectedUses,
8413 bool AllowSubAddOrAddSubContract) {
8414 if (Opnd0.getOpcode() != ISD::FMUL ||
8415 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8416 return false;
8417
8418 // FIXME: These checks must match the similar ones in
8419 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8420 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8421 // or MUL + ADDSUB to FMADDSUB.
8422 const TargetOptions &Options = DAG.getTarget().Options;
8423 bool AllowFusion =
8424 Options.AllowFPOpFusion == FPOpFusion::Fast ||
8425 (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
8426 if (!AllowFusion)
8427 return false;
8428
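 // E.g. with Opnd0 == (fmul A, B) and Opnd1 == C, the operands are rewritten
 // below to (A, B, C), which is the FMADDSUB/FMSUBADD form (A * B) +/- C.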
8429 Opnd2 = Opnd1;
8430 Opnd1 = Opnd0.getOperand(1);
8431 Opnd0 = Opnd0.getOperand(0);
8432
8433 return true;
8434}
8435
8436 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8437 /// 'fsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
8438 /// X86ISD::FMSUBADD node accordingly.
8439 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8440 const SDLoc &DL,
8441 const X86Subtarget &Subtarget,
8442 SelectionDAG &DAG) {
8443 SDValue Opnd0, Opnd1;
8444 unsigned NumExtracts;
8445 bool IsSubAdd;
8446 bool HasAllowContract;
8447 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8448 HasAllowContract))
8449 return SDValue();
8450
8451 MVT VT = BV->getSimpleValueType(0);
8452
8453 // Try to generate X86ISD::FMADDSUB node here.
8454 SDValue Opnd2;
8455 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
8456 HasAllowContract)) {
8457 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8458 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8459 }
8460
8461 // We only support ADDSUB.
8462 if (IsSubAdd)
8463 return SDValue();
8464
8465 // There are no known X86 targets with 512-bit ADDSUB instructions!
8466 // Convert to blend(fsub,fadd).
8467 if (VT.is512BitVector()) {
8468 SmallVector<int> Mask;
8469 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8470 Mask.push_back(I);
8471 Mask.push_back(I + E + 1);
8472 }
8473 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8474 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8475 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8476 }
8477
8478 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8479}
8480
8481 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8482 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8483 // Initialize outputs to known values.
8484 MVT VT = BV->getSimpleValueType(0);
8485 HOpcode = ISD::DELETED_NODE;
8486 V0 = DAG.getUNDEF(VT);
8487 V1 = DAG.getUNDEF(VT);
8488
8489 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8490 // half of the result is calculated independently from the 128-bit halves of
8491 // the inputs, so that makes the index-checking logic below more complicated.
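 // For example, a 256-bit v8f32 HADD of inputs A and B roughly produces:
 //   { A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3],
 //     A[4]+A[5], A[6]+A[7], B[4]+B[5], B[6]+B[7] }
 // i.e. each 128-bit half of the result only uses the matching halves of A and B.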
8492 unsigned NumElts = VT.getVectorNumElements();
8493 unsigned GenericOpcode = ISD::DELETED_NODE;
8494 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8495 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8496 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8497 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8498 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8499 // Ignore undef elements.
8500 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8501 if (Op.isUndef())
8502 continue;
8503
8504 // If there's an opcode mismatch, we're done.
8505 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8506 return false;
8507
8508 // Initialize horizontal opcode.
8509 if (HOpcode == ISD::DELETED_NODE) {
8510 GenericOpcode = Op.getOpcode();
8511 switch (GenericOpcode) {
8512 // clang-format off
8513 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8514 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8515 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8516 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8517 default: return false;
8518 // clang-format on
8519 }
8520 }
8521
8522 SDValue Op0 = Op.getOperand(0);
8523 SDValue Op1 = Op.getOperand(1);
8524 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8525 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8526 Op0.getOperand(0) != Op1.getOperand(0) ||
8527 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8528 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8529 return false;
8530
8531 // The source vector is chosen based on which 64-bit half of the
8532 // destination vector is being calculated.
8533 if (j < NumEltsIn64Bits) {
8534 if (V0.isUndef())
8535 V0 = Op0.getOperand(0);
8536 } else {
8537 if (V1.isUndef())
8538 V1 = Op0.getOperand(0);
8539 }
8540
8541 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8542 if (SourceVec != Op0.getOperand(0))
8543 return false;
8544
8545 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8546 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8547 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8548 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8549 (j % NumEltsIn64Bits) * 2;
8550 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8551 continue;
8552
8553 // If this is not a commutative op, this does not match.
8554 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8555 return false;
8556
8557 // Addition is commutative, so try swapping the extract indexes.
8558 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8559 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8560 continue;
8561
8562 // Extract indexes do not match horizontal requirement.
8563 return false;
8564 }
8565 }
8566 // We matched. Opcode and operands are returned by reference as arguments.
8567 return true;
8568}
8569
8570 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8571 const SDLoc &DL, SelectionDAG &DAG,
8572 unsigned HOpcode, SDValue V0, SDValue V1) {
8573 // If either input vector is not the same size as the build vector,
8574 // extract/insert the low bits to the correct size.
8575 // This is free (examples: zmm --> xmm, xmm --> ymm).
8576 MVT VT = BV->getSimpleValueType(0);
8577 unsigned Width = VT.getSizeInBits();
8578 if (V0.getValueSizeInBits() > Width)
8579 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8580 else if (V0.getValueSizeInBits() < Width)
8581 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8582
8583 if (V1.getValueSizeInBits() > Width)
8584 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8585 else if (V1.getValueSizeInBits() < Width)
8586 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8587
8588 unsigned NumElts = VT.getVectorNumElements();
8589 APInt DemandedElts = APInt::getAllOnes(NumElts);
8590 for (unsigned i = 0; i != NumElts; ++i)
8591 if (BV->getOperand(i).isUndef())
8592 DemandedElts.clearBit(i);
8593
8594 // If we don't need the upper xmm, then perform as an xmm hop.
8595 unsigned HalfNumElts = NumElts / 2;
8596 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8597 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8598 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8599 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8600 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8601 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8602 }
8603
8604 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8605}
8606
8607/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
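/// For example, a v4f32 build_vector of
///   (fadd (extractelt A, 0), (extractelt A, 1)),
///   (fadd (extractelt A, 2), (extractelt A, 3)),
///   (fadd (extractelt B, 0), (extractelt B, 1)),
///   (fadd (extractelt B, 2), (extractelt B, 3))
/// can be lowered to a single (X86ISD::FHADD A, B).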
8608 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8609 const X86Subtarget &Subtarget,
8610 SelectionDAG &DAG) {
8611 // We need at least 2 non-undef elements to make this worthwhile by default.
8612 unsigned NumNonUndefs =
8613 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8614 if (NumNonUndefs < 2)
8615 return SDValue();
8616
8617 // There are 4 sets of horizontal math operations distinguished by type:
8618 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8619 // subtarget feature. Try to match those "native" patterns first.
8620 MVT VT = BV->getSimpleValueType(0);
8621 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8622 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8623 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8624 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8625 unsigned HOpcode;
8626 SDValue V0, V1;
8627 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8628 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8629 }
8630
8631 // Try harder to match 256-bit ops by using extract/concat.
8632 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8633 return SDValue();
8634
8635 // Count the number of UNDEF operands in the input build_vector.
8636 unsigned NumElts = VT.getVectorNumElements();
8637 unsigned Half = NumElts / 2;
8638 unsigned NumUndefsLO = 0;
8639 unsigned NumUndefsHI = 0;
8640 for (unsigned i = 0, e = Half; i != e; ++i)
8641 if (BV->getOperand(i)->isUndef())
8642 NumUndefsLO++;
8643
8644 for (unsigned i = Half, e = NumElts; i != e; ++i)
8645 if (BV->getOperand(i)->isUndef())
8646 NumUndefsHI++;
8647
8648 SDValue InVec0, InVec1;
8649 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8650 SDValue InVec2, InVec3;
8651 unsigned X86Opcode;
8652 bool CanFold = true;
8653
8654 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8655 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8656 InVec3) &&
8657 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8658 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8659 X86Opcode = X86ISD::HADD;
8660 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8661 InVec1) &&
8662 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8663 InVec3) &&
8664 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8665 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8666 X86Opcode = X86ISD::HSUB;
8667 else
8668 CanFold = false;
8669
8670 if (CanFold) {
8671 // Do not try to expand this build_vector into a pair of horizontal
8672 // add/sub if we can emit a pair of scalar add/sub.
8673 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8674 return SDValue();
8675
8676 // Convert this build_vector into a pair of horizontal binops followed by
8677 // a concat vector. We must adjust the outputs from the partial horizontal
8678 // matching calls above to account for undefined vector halves.
8679 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8680 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8681 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8682 bool isUndefLO = NumUndefsLO == Half;
8683 bool isUndefHI = NumUndefsHI == Half;
8684 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8685 isUndefHI);
8686 }
8687 }
8688
8689 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8690 VT == MVT::v16i16) {
8691 unsigned X86Opcode;
8692 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8693 InVec1))
8694 X86Opcode = X86ISD::HADD;
8695 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8696 InVec1))
8697 X86Opcode = X86ISD::HSUB;
8698 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8699 InVec1))
8700 X86Opcode = X86ISD::FHADD;
8701 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8702 InVec1))
8703 X86Opcode = X86ISD::FHSUB;
8704 else
8705 return SDValue();
8706
8707 // Don't try to expand this build_vector into a pair of horizontal add/sub
8708 // if we can simply emit a pair of scalar add/sub.
8709 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8710 return SDValue();
8711
8712 // Convert this build_vector into two horizontal add/sub followed by
8713 // a concat vector.
8714 bool isUndefLO = NumUndefsLO == Half;
8715 bool isUndefHI = NumUndefsHI == Half;
8716 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8717 isUndefLO, isUndefHI);
8718 }
8719
8720 return SDValue();
8721}
8722
8723static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8724 SelectionDAG &DAG);
8725
8726 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
8727 /// one of their operands is constant, lower to a pair of BUILD_VECTORs and
8728 /// just apply the bit operation to the vectors.
8729 /// NOTE: It's not in our interest to start making a general purpose vectorizer
8730 /// from this, but enough scalar bit operations are created from the later
8731 /// legalization + scalarization stages to need basic support.
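/// For example:
///   (build_vector (and x0, 1), (and x1, 2), (and x2, 4), (and x3, 8))
/// becomes
///   (and (build_vector x0, x1, x2, x3), (build_vector 1, 2, 4, 8))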
8732 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8733 const X86Subtarget &Subtarget,
8734 SelectionDAG &DAG) {
8735 MVT VT = Op->getSimpleValueType(0);
8736 unsigned NumElems = VT.getVectorNumElements();
8737 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8738
8739 // Check that all elements have the same opcode.
8740 // TODO: Should we allow UNDEFS and if so how many?
8741 unsigned Opcode = Op->getOperand(0).getOpcode();
8742 for (unsigned i = 1; i < NumElems; ++i)
8743 if (Opcode != Op->getOperand(i).getOpcode())
8744 return SDValue();
8745
8746 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8747 bool IsShift = false;
8748 switch (Opcode) {
8749 default:
8750 return SDValue();
8751 case ISD::SHL:
8752 case ISD::SRL:
8753 case ISD::SRA:
8754 IsShift = true;
8755 break;
8756 case ISD::AND:
8757 case ISD::XOR:
8758 case ISD::OR:
8759 // Don't do this if the buildvector is a splat - we'd replace one
8760 // constant with an entire vector.
8761 if (Op->getSplatValue())
8762 return SDValue();
8763 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8764 return SDValue();
8765 break;
8766 }
8767
8768 SmallVector<SDValue, 4> LHSElts, RHSElts;
8769 for (SDValue Elt : Op->ops()) {
8770 SDValue LHS = Elt.getOperand(0);
8771 SDValue RHS = Elt.getOperand(1);
8772
8773 // We expect the canonicalized RHS operand to be the constant.
8774 if (!isa<ConstantSDNode>(RHS) && !isa<ConstantFPSDNode>(RHS))
8775 return SDValue();
8776
8777 // Extend shift amounts.
8778 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8779 if (!IsShift)
8780 return SDValue();
8781 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8782 }
8783
8784 LHSElts.push_back(LHS);
8785 RHSElts.push_back(RHS);
8786 }
8787
8788 // Limit to shifts by uniform immediates.
8789 // TODO: Only accept vXi8/vXi64 special cases?
8790 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8791 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8792 return SDValue();
8793
8794 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8795 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8796 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8797
8798 if (!IsShift)
8799 return Res;
8800
8801 // Immediately lower the shift to ensure the constant build vector doesn't
8802 // get converted to a constant pool before the shift is lowered.
8803 return LowerShift(Res, Subtarget, DAG);
8804}
8805
8806static bool isShuffleFoldableLoad(SDValue);
8807
8808/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
8809/// representing a blend.
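/// For example, a v4f64 build_vector (A, B, A, B) becomes
///   (vector_shuffle (splat A), (splat B), <0, 5, 2, 7>)
/// where each splat can be emitted as a broadcast and the shuffle as a blend.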
8810 static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
8811 X86Subtarget const &Subtarget,
8812 SelectionDAG &DAG) {
8813 MVT VT = BVOp->getSimpleValueType(0u);
8814
8815 if (VT != MVT::v4f64)
8816 return SDValue();
8817
8818 // Collect unique operands.
8819 auto UniqueOps = SmallSet<SDValue, 16u>();
8820 for (SDValue Op : BVOp->ops()) {
8821 if (isIntOrFPConstant(Op) || Op.isUndef())
8822 return SDValue();
8823 UniqueOps.insert(Op);
8824 }
8825
8826 // Candidate BUILD_VECTOR must have 2 unique operands.
8827 if (UniqueOps.size() != 2u)
8828 return SDValue();
8829
8830 SDValue Op0 = BVOp->getOperand(0u);
8831 UniqueOps.erase(Op0);
8832 SDValue Op1 = *UniqueOps.begin();
8833
8834 if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
8835 isShuffleFoldableLoad(Op1)) {
8836 // Create shuffle mask.
8837 auto const NumElems = VT.getVectorNumElements();
8838 SmallVector<int, 16u> Mask(NumElems);
8839 for (auto I = 0u; I < NumElems; ++I) {
8840 SDValue Op = BVOp->getOperand(I);
8841 Mask[I] = Op == Op0 ? I : I + NumElems;
8842 }
8843 // Create shuffle of splats.
8844 SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
8845 SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
8846 return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
8847 }
8848
8849 return SDValue();
8850}
8851
8852/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8853/// functionality to do this, so it's all zeros, all ones, or some derivation
8854/// that is cheap to calculate.
8855 static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8856 SelectionDAG &DAG,
8857 const X86Subtarget &Subtarget) {
8858 MVT VT = Op.getSimpleValueType();
8859
8860 // Vectors containing all zeros can be matched by pxor and xorps.
8861 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8862 return Op;
8863
8864 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8865 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8866 // vpcmpeqd on 256-bit vectors.
8867 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8868 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8869 return Op;
8870
8871 return getOnesVector(VT, DAG, DL);
8872 }
8873
8874 return SDValue();
8875}
8876
8877/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8878/// from a vector of source values and a vector of extraction indices.
8879/// The vectors might be manipulated to match the type of the permute op.
8880static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8881 const SDLoc &DL, SelectionDAG &DAG,
8882 const X86Subtarget &Subtarget) {
8883 MVT ShuffleVT = VT;
8884 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8885 unsigned NumElts = VT.getVectorNumElements();
8886 unsigned SizeInBits = VT.getSizeInBits();
8887
8888 // Adjust IndicesVec to match VT size.
8889 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8890 "Illegal variable permute mask size");
8891 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8892 // Narrow/widen the indices vector to the correct size.
8893 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8894 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8895 NumElts * VT.getScalarSizeInBits());
8896 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8897 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8898 SDLoc(IndicesVec), SizeInBits);
8899 // Zero-extend the index elements within the vector.
8900 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8901 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8902 IndicesVT, IndicesVec);
8903 }
8904 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8905
8906 // Handle a SrcVec whose type doesn't match VT.
8907 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8908 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8909 // Handle larger SrcVec by treating it as a larger permute.
8910 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8911 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8912 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8913 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8914 Subtarget, DAG, SDLoc(IndicesVec));
8915 SDValue NewSrcVec =
8916 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8917 if (NewSrcVec)
8918 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8919 return SDValue();
8920 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8921 // Widen smaller SrcVec to match VT.
8922 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8923 } else
8924 return SDValue();
8925 }
8926
8927 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8928 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8929 EVT SrcVT = Idx.getValueType();
8930 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8931 uint64_t IndexScale = 0;
8932 uint64_t IndexOffset = 0;
8933
8934 // If we're scaling a smaller permute op, then we need to repeat the
8935 // indices, scaling and offsetting them as well.
8936 // e.g. v4i32 -> v16i8 (Scale = 4)
8937 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8938 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
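 // For instance, an i32 index with value 2 then becomes the byte indices
 // <8, 9, 10, 11> (i.e. 2*4+0 .. 2*4+3) once the multiply and add below
 // are applied.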
8939 for (uint64_t i = 0; i != Scale; ++i) {
8940 IndexScale |= Scale << (i * NumDstBits);
8941 IndexOffset |= i << (i * NumDstBits);
8942 }
8943
8944 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8945 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8946 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8947 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8948 return Idx;
8949 };
8950
8951 unsigned Opcode = 0;
8952 switch (VT.SimpleTy) {
8953 default:
8954 break;
8955 case MVT::v16i8:
8956 if (Subtarget.hasSSSE3())
8957 Opcode = X86ISD::PSHUFB;
8958 break;
8959 case MVT::v8i16:
8960 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8961 Opcode = X86ISD::VPERMV;
8962 else if (Subtarget.hasSSSE3()) {
8963 Opcode = X86ISD::PSHUFB;
8964 ShuffleVT = MVT::v16i8;
8965 }
8966 break;
8967 case MVT::v4f32:
8968 case MVT::v4i32:
8969 if (Subtarget.hasAVX()) {
8970 Opcode = X86ISD::VPERMILPV;
8971 ShuffleVT = MVT::v4f32;
8972 } else if (Subtarget.hasSSSE3()) {
8973 Opcode = X86ISD::PSHUFB;
8974 ShuffleVT = MVT::v16i8;
8975 }
8976 break;
8977 case MVT::v2f64:
8978 case MVT::v2i64:
8979 if (Subtarget.hasAVX()) {
8980 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8981 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8982 Opcode = X86ISD::VPERMILPV;
8983 ShuffleVT = MVT::v2f64;
8984 } else if (Subtarget.hasSSE41()) {
8985 // SSE41 can compare v2i64 - select between indices 0 and 1.
8986 return DAG.getSelectCC(
8987 DL, IndicesVec,
8988 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8989 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8990 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8991 ISD::CondCode::SETEQ);
8992 }
8993 break;
8994 case MVT::v32i8:
8995 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8996 Opcode = X86ISD::VPERMV;
8997 else if (Subtarget.hasXOP()) {
8998 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8999 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
9000 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
9001 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
9002 return DAG.getNode(
9003 ISD::CONCAT_VECTORS, DL, MVT::v32i8,
9004 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
9005 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
9006 } else if (Subtarget.hasAVX()) {
9007 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
9008 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
9009 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9010 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9011 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
9012 ArrayRef<SDValue> Ops) {
9013 // Permute Lo and Hi and then select based on index range.
9014 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
9015 // care about bit[7] as it's just an index vector.
9016 SDValue Idx = Ops[2];
9017 EVT VT = Idx.getValueType();
9018 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9019 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9020 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9021 ISD::CondCode::SETGT);
9022 };
9023 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9024 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9025 PSHUFBBuilder);
9026 }
9027 break;
9028 case MVT::v16i16:
9029 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9030 Opcode = X86ISD::VPERMV;
9031 else if (Subtarget.hasAVX()) {
9032 // Scale to v32i8 and perform as v32i8.
9033 IndicesVec = ScaleIndices(IndicesVec, 2);
9034 return DAG.getBitcast(
9035 VT, createVariablePermute(
9036 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9037 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9038 }
9039 break;
9040 case MVT::v8f32:
9041 case MVT::v8i32:
9042 if (Subtarget.hasAVX2())
9043 Opcode = X86ISD::VPERMV;
9044 else if (Subtarget.hasAVX()) {
9045 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9046 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9047 {0, 1, 2, 3, 0, 1, 2, 3});
9048 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9049 {4, 5, 6, 7, 4, 5, 6, 7});
9050 if (Subtarget.hasXOP())
9051 return DAG.getBitcast(
9052 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9053 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9054 // Permute Lo and Hi and then select based on index range.
9055 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
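 // E.g. an index of 6 selects HiHi (which repeats elements 4..7), and
 // VPERMILPS then uses 6 & 3 == 2 within the lane, i.e. original element 6.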
9056 SDValue Res = DAG.getSelectCC(
9057 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9058 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9059 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9060 ISD::CondCode::SETGT);
9061 return DAG.getBitcast(VT, Res);
9062 }
9063 break;
9064 case MVT::v4i64:
9065 case MVT::v4f64:
9066 if (Subtarget.hasAVX512()) {
9067 if (!Subtarget.hasVLX()) {
9068 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9069 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9070 SDLoc(SrcVec));
9071 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9072 DAG, SDLoc(IndicesVec));
9073 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9074 DAG, Subtarget);
9075 return extract256BitVector(Res, 0, DAG, DL);
9076 }
9077 Opcode = X86ISD::VPERMV;
9078 } else if (Subtarget.hasAVX()) {
9079 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9080 SDValue LoLo =
9081 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9082 SDValue HiHi =
9083 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9084 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9085 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9086 if (Subtarget.hasXOP())
9087 return DAG.getBitcast(
9088 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9089 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9090 // Permute Lo and Hi and then select based on index range.
9091 // This works as VPERMILPD only uses index bit[1] to permute elements.
9092 SDValue Res = DAG.getSelectCC(
9093 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9094 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9095 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9096 ISD::CondCode::SETGT);
9097 return DAG.getBitcast(VT, Res);
9098 }
9099 break;
9100 case MVT::v64i8:
9101 if (Subtarget.hasVBMI())
9102 Opcode = X86ISD::VPERMV;
9103 break;
9104 case MVT::v32i16:
9105 if (Subtarget.hasBWI())
9106 Opcode = X86ISD::VPERMV;
9107 break;
9108 case MVT::v16f32:
9109 case MVT::v16i32:
9110 case MVT::v8f64:
9111 case MVT::v8i64:
9112 if (Subtarget.hasAVX512())
9113 Opcode = X86ISD::VPERMV;
9114 break;
9115 }
9116 if (!Opcode)
9117 return SDValue();
9118
9119 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9120 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9121 "Illegal variable permute shuffle type");
9122
9123 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9124 if (Scale > 1)
9125 IndicesVec = ScaleIndices(IndicesVec, Scale);
9126
9127 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9128 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9129
9130 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9131 SDValue Res = Opcode == X86ISD::VPERMV
9132 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9133 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9134 return DAG.getBitcast(VT, Res);
9135}
9136
9137// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9138// reasoned to be a permutation of a vector by indices in a non-constant vector.
9139// (build_vector (extract_elt V, (extract_elt I, 0)),
9140// (extract_elt V, (extract_elt I, 1)),
9141// ...
9142// ->
9143// (vpermv I, V)
9144//
9145// TODO: Handle undefs
9146// TODO: Utilize pshufb and zero mask blending to support more efficient
9147// construction of vectors with constant-0 elements.
9148static SDValue
9149 LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
9150 SelectionDAG &DAG,
9151 const X86Subtarget &Subtarget) {
9152 SDValue SrcVec, IndicesVec;
9153
9154 auto PeekThroughFreeze = [](SDValue N) {
9155 if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
9156 return N->getOperand(0);
9157 return N;
9158 };
9159 // Check for a match of the permute source vector and permute index elements.
9160 // This is done by checking that the i-th build_vector operand is of the form:
9161 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9162 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9163 SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
9164 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9165 return SDValue();
9166
9167 // If this is the first extract encountered in V, set the source vector,
9168 // otherwise verify the extract is from the previously defined source
9169 // vector.
9170 if (!SrcVec)
9171 SrcVec = Op.getOperand(0);
9172 else if (SrcVec != Op.getOperand(0))
9173 return SDValue();
9174 SDValue ExtractedIndex = Op->getOperand(1);
9175 // Peek through extends.
9176 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9177 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9178 ExtractedIndex = ExtractedIndex.getOperand(0);
9179 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9180 return SDValue();
9181
9182 // If this is the first extract from the index vector candidate, set the
9183 // indices vector, otherwise verify the extract is from the previously
9184 // defined indices vector.
9185 if (!IndicesVec)
9186 IndicesVec = ExtractedIndex.getOperand(0);
9187 else if (IndicesVec != ExtractedIndex.getOperand(0))
9188 return SDValue();
9189
9190 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9191 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9192 return SDValue();
9193 }
9194
9195 MVT VT = V.getSimpleValueType();
9196 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9197}
9198
9199SDValue
9200X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9201 SDLoc dl(Op);
9202
9203 MVT VT = Op.getSimpleValueType();
9204 MVT EltVT = VT.getVectorElementType();
9205 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9206 unsigned NumElems = Op.getNumOperands();
9207
9208 // Generate vectors for predicate vectors.
9209 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9210 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9211
9212 if (VT.getVectorElementType() == MVT::bf16 &&
9213 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9214 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9215
9216 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9217 return VectorCst;
9218
9219 unsigned EVTBits = EltVT.getSizeInBits();
9220 APInt UndefMask = APInt::getZero(NumElems);
9221 APInt FrozenUndefMask = APInt::getZero(NumElems);
9222 APInt ZeroMask = APInt::getZero(NumElems);
9223 APInt NonZeroMask = APInt::getZero(NumElems);
9224 bool IsAllConstants = true;
9225 bool OneUseFrozenUndefs = true;
9226 SmallSet<SDValue, 8> Values;
9227 unsigned NumConstants = NumElems;
9228 for (unsigned i = 0; i < NumElems; ++i) {
9229 SDValue Elt = Op.getOperand(i);
9230 if (Elt.isUndef()) {
9231 UndefMask.setBit(i);
9232 continue;
9233 }
9234 if (ISD::isFreezeUndef(Elt.getNode())) {
9235 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9236 FrozenUndefMask.setBit(i);
9237 continue;
9238 }
9239 Values.insert(Elt);
9240 if (!isIntOrFPConstant(Elt)) {
9241 IsAllConstants = false;
9242 NumConstants--;
9243 }
9244 if (X86::isZeroNode(Elt)) {
9245 ZeroMask.setBit(i);
9246 } else {
9247 NonZeroMask.setBit(i);
9248 }
9249 }
9250
9251 // All undef vector. Return an UNDEF.
9252 if (UndefMask.isAllOnes())
9253 return DAG.getUNDEF(VT);
9254
9255 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9256 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9257 return DAG.getFreeze(DAG.getUNDEF(VT));
9258
9259 // All undef/freeze(undef)/zero vector. Return a zero vector.
9260 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9261 return getZeroVector(VT, Subtarget, DAG, dl);
9262
9263 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9264 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9265 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9266 // and blend the FREEZE-UNDEF operands back in.
9267 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9268 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9269 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9270 SmallVector<int, 16> BlendMask(NumElems, -1);
9271 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9272 for (unsigned i = 0; i < NumElems; ++i) {
9273 if (UndefMask[i]) {
9274 BlendMask[i] = -1;
9275 continue;
9276 }
9277 BlendMask[i] = i;
9278 if (!FrozenUndefMask[i])
9279 Elts[i] = Op.getOperand(i);
9280 else
9281 BlendMask[i] += NumElems;
9282 }
9283 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9284 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9285 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9286 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9287 }
9288
9289 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9290
9291 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9292 // be better off lowering to a smaller build vector and padding with
9293 // undef/zero.
9294 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9296 unsigned UpperElems = NumElems / 2;
9297 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9298 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9299 if (NumUpperUndefsOrZeros >= UpperElems) {
9300 if (VT.is512BitVector() &&
9301 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9302 UpperElems = NumElems - (NumElems / 4);
9303 // If freeze(undef) is in any upper elements, force to zero.
9304 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9305 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9306 SDValue NewBV =
9307 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9308 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9309 }
9310 }
9311
9312 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9313 return AddSub;
9314 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9315 return HorizontalOp;
9316 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9317 return Broadcast;
9318 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9319 return BitOp;
9320 if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
9321 return Blend;
9322
9323 unsigned NumZero = ZeroMask.popcount();
9324 unsigned NumNonZero = NonZeroMask.popcount();
9325
9326 // If we are inserting one variable into a vector of non-zero constants, try
9327 // to avoid loading each constant element as a scalar. Load the constants as a
9328 // vector and then insert the variable scalar element. If insertion is not
9329 // supported, fall back to a shuffle to get the scalar blended with the
9330 // constants. Insertion into a zero vector is handled as a special-case
9331 // somewhere below here.
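 // For example, (build_vector 1, 2, X, 4) can be lowered as a constant-pool
 // load of <1, 2, undef, 4> followed by inserting X at index 2, rather than
 // scalarizing every element.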
9332 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9333 FrozenUndefMask.isZero() &&
9334 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9335 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9336 // Create an all-constant vector. The variable element in the old
9337 // build vector is replaced by undef in the constant vector. Save the
9338 // variable scalar element and its index for use in the insertelement.
9339 LLVMContext &Context = *DAG.getContext();
9340 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9341 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9342 SDValue VarElt;
9343 SDValue InsIndex;
9344 for (unsigned i = 0; i != NumElems; ++i) {
9345 SDValue Elt = Op.getOperand(i);
9346 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9347 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9348 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9349 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9350 else if (!Elt.isUndef()) {
9351 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9352 "Expected one variable element in this vector");
9353 VarElt = Elt;
9354 InsIndex = DAG.getVectorIdxConstant(i, dl);
9355 }
9356 }
9357 Constant *CV = ConstantVector::get(ConstVecOps);
9358 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9359
9360 // The constants we just created may not be legal (e.g., floating point). We
9361 // must lower the vector right here because we cannot guarantee that we'll
9362 // legalize it before loading it. This is also why we could not just create
9363 // a new build vector here. If the build vector contains illegal constants,
9364 // it could get split back up into a series of insert elements.
9365 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9366 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9367 MachineFunction &MF = DAG.getMachineFunction();
9368 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
9369 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9370 unsigned InsertC = InsIndex->getAsZExtVal();
9371 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9372 if (InsertC < NumEltsInLow128Bits)
9373 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9374
9375 // There's no good way to insert into the high elements of a >128-bit
9376 // vector, so use shuffles to avoid an extract/insert sequence.
9377 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9378 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9379 SmallVector<int, 8> ShuffleMask;
9380 unsigned NumElts = VT.getVectorNumElements();
9381 for (unsigned i = 0; i != NumElts; ++i)
9382 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9383 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9384 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9385 }
9386
9387 // Special case for single non-zero, non-undef, element.
9388 if (NumNonZero == 1) {
9389 unsigned Idx = NonZeroMask.countr_zero();
9390 SDValue Item = Op.getOperand(Idx);
9391
9392 // If we have a constant or non-constant insertion into the low element of
9393 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9394 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9395 // depending on what the source datatype is.
9396 if (Idx == 0) {
9397 if (NumZero == 0)
9398 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9399
9400 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9401 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9402 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9403 assert((VT.is128BitVector() || VT.is256BitVector() ||
9404 VT.is512BitVector()) &&
9405 "Expected an SSE value type!");
9406 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9407 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9408 // zero vector.
9409 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9410 }
9411
9412 // We can't directly insert an i8 or i16 into a vector, so zero extend
9413 // it to i32 first.
9414 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9415 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9416 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9417 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9418 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9419 return DAG.getBitcast(VT, Item);
9420 }
9421 }
9422
9423 // Is it a vector logical left shift?
9424 if (NumElems == 2 && Idx == 1 &&
9425 X86::isZeroNode(Op.getOperand(0)) &&
9426 !X86::isZeroNode(Op.getOperand(1))) {
9427 unsigned NumBits = VT.getSizeInBits();
9428 return getVShift(true, VT,
9429 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9430 VT, Op.getOperand(1)),
9431 NumBits/2, DAG, *this, dl);
9432 }
9433
9434 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9435 return SDValue();
9436
9437 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9438 // is a non-constant being inserted into an element other than the low one,
9439 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9440 // movd/movss) to move this into the low element, then shuffle it into
9441 // place.
9442 if (EVTBits == 32) {
9443 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9444 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9445 }
9446 }
9447
9448 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9449 if (Values.size() == 1) {
9450 if (EVTBits == 32) {
9451 // Instead of a shuffle like this:
9452 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9453 // Check if it's possible to issue this instead.
9454 // shuffle (vload ptr), undef, <1, 1, 1, 1>
9455 unsigned Idx = NonZeroMask.countr_zero();
9456 SDValue Item = Op.getOperand(Idx);
9457 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9458 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9459 }
9460 return SDValue();
9461 }
9462
9463 // A vector full of immediates; various special cases are already
9464 // handled, so this is best done with a single constant-pool load.
9465 if (IsAllConstants)
9466 return SDValue();
9467
9468 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9469 return V;
9470
9471 // See if we can use a vector load to get all of the elements.
9472 {
9473 SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
9474 if (SDValue LD =
9475 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9476 return LD;
9477 }
9478
9479 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9480 // build_vector and broadcast it.
9481 // TODO: We could probably generalize this more.
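 // For example, v8i32 <a, b, a, b, a, b, a, b> is built as a v4i32
 // <a, b, undef, undef>, bitcast to v2i64, broadcast to v4i64 and bitcast
 // back to v8i32.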
9482 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9483 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9484 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9485 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9486 // Make sure all the even/odd operands match.
9487 for (unsigned i = 2; i != NumElems; ++i)
9488 if (Ops[i % 2] != Op.getOperand(i))
9489 return false;
9490 return true;
9491 };
9492 if (CanSplat(Op, NumElems, Ops)) {
9493 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9494 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9495 // Create a new build vector and cast to v2i64/v2f64.
9496 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9497 DAG.getBuildVector(NarrowVT, dl, Ops));
9498 // Broadcast from v2i64/v2f64 and cast to final VT.
9499 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9500 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9501 NewBV));
9502 }
9503 }
9504
9505 // For AVX-length vectors, build the individual 128-bit pieces and use
9506 // shuffles to put them in place.
9507 if (VT.getSizeInBits() > 128) {
9508 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9509
9510 // Build both the lower and upper subvector.
9511 SDValue Lower =
9512 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9513 SDValue Upper = DAG.getBuildVector(
9514 HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
9515
9516 // Recreate the wider vector with the lower and upper part.
9517 return concatSubVectors(Lower, Upper, DAG, dl);
9518 }
9519
9520 // Let legalizer expand 2-wide build_vectors.
9521 if (EVTBits == 64) {
9522 if (NumNonZero == 1) {
9523 // One half is zero or undef.
9524 unsigned Idx = NonZeroMask.countr_zero();
9525 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9526 Op.getOperand(Idx));
9527 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9528 }
9529 return SDValue();
9530 }
9531
9532 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9533 if (EVTBits == 8 && NumElems == 16)
9534 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9535 NumZero, DAG, Subtarget))
9536 return V;
9537
9538 if (EltVT == MVT::i16 && NumElems == 8)
9539 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9540 NumZero, DAG, Subtarget))
9541 return V;
9542
9543 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9544 if (EVTBits == 32 && NumElems == 4)
9545 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9546 return V;
9547
9548 // If element VT is == 32 bits, turn it into a number of shuffles.
9549 if (NumElems == 4 && NumZero > 0) {
9550 SmallVector<SDValue, 8> Ops(NumElems);
9551 for (unsigned i = 0; i < 4; ++i) {
9552 bool isZero = !NonZeroMask[i];
9553 if (isZero)
9554 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9555 else
9556 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9557 }
9558
9559 for (unsigned i = 0; i < 2; ++i) {
9560 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9561 default: llvm_unreachable("Unexpected NonZero count");
9562 case 0:
9563 Ops[i] = Ops[i*2]; // Must be a zero vector.
9564 break;
9565 case 1:
9566 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9567 break;
9568 case 2:
9569 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9570 break;
9571 case 3:
9572 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9573 break;
9574 }
9575 }
9576
9577 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9578 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9579 int MaskVec[] = {
9580 Reverse1 ? 1 : 0,
9581 Reverse1 ? 0 : 1,
9582 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9583 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9584 };
9585 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9586 }
9587
9588 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9589
9590 // Check for a build vector from mostly shuffle plus few inserting.
9591 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9592 return Sh;
9593
9594 // For SSE 4.1, use insertps to put the high elements into the low element.
9595 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9596 SDValue Result;
9597 if (!Op.getOperand(0).isUndef())
9598 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9599 else
9600 Result = DAG.getUNDEF(VT);
9601
9602 for (unsigned i = 1; i < NumElems; ++i) {
9603 if (Op.getOperand(i).isUndef()) continue;
9604 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9605 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9606 }
9607 return Result;
9608 }
9609
9610 // Otherwise, expand into a number of unpckl*, starting by extending each of
9611 // our (non-undef) elements to the full vector width with the element in the
9612 // bottom slot of the vector (which generates no code for SSE).
9613 SmallVector<SDValue, 8> Ops(NumElems);
9614 for (unsigned i = 0; i < NumElems; ++i) {
9615 if (!Op.getOperand(i).isUndef())
9616 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9617 else
9618 Ops[i] = DAG.getUNDEF(VT);
9619 }
9620
9621 // Next, we iteratively mix elements, e.g. for v4f32:
9622 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9623 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9624 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9625 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9626 // Generate scaled UNPCKL shuffle mask.
9627 SmallVector<int, 16> Mask;
9628 for(unsigned i = 0; i != Scale; ++i)
9629 Mask.push_back(i);
9630 for (unsigned i = 0; i != Scale; ++i)
9631 Mask.push_back(NumElems+i);
9632 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9633
9634 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9635 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9636 }
9637 return Ops[0];
9638}
9639
9640// 256-bit AVX can use the vinsertf128 instruction
9641// to create 256-bit vectors from two other 128-bit ones.
9642// TODO: Detect subvector broadcast here instead of DAG combine?
9643 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
9644 SelectionDAG &DAG,
9645 const X86Subtarget &Subtarget) {
9646 MVT ResVT = Op.getSimpleValueType();
9647 assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
9648 "Value type must be 256-/512-bit wide");
9649
9650 unsigned NumOperands = Op.getNumOperands();
9651 unsigned NumFreezeUndef = 0;
9652 unsigned NumZero = 0;
9653 unsigned NumNonZero = 0;
9654 unsigned NonZeros = 0;
9655 SmallSet<SDValue, 4> Undefs;
9656 for (unsigned i = 0; i != NumOperands; ++i) {
9657 SDValue SubVec = Op.getOperand(i);
9658 if (SubVec.isUndef())
9659 continue;
9660 if (ISD::isFreezeUndef(SubVec.getNode())) {
9661 // If the freeze(undef) has multiple uses then we must fold to zero.
9662 if (SubVec.hasOneUse()) {
9663 ++NumFreezeUndef;
9664 } else {
9665 ++NumZero;
9666 Undefs.insert(SubVec);
9667 }
9668 }
9669 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9670 ++NumZero;
9671 else {
9672 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9673 NonZeros |= 1 << i;
9674 ++NumNonZero;
9675 }
9676 }
9677
9678 // If we have more than 2 non-zeros, build each half separately.
9679 if (NumNonZero > 2) {
9680 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9681 ArrayRef<SDUse> Ops = Op->ops();
9682 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9683 Ops.slice(0, NumOperands/2));
9684 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9685 Ops.slice(NumOperands/2));
9686 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9687 }
9688
9689 // Otherwise, build it up through insert_subvectors.
9690 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9691 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9692 : DAG.getUNDEF(ResVT));
9693
9694 // Replace Undef operands with ZeroVector.
9695 for (SDValue U : Undefs)
9697 U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
9698
9699 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9700 unsigned NumSubElems = SubVT.getVectorNumElements();
9701 for (unsigned i = 0; i != NumOperands; ++i) {
9702 if ((NonZeros & (1 << i)) == 0)
9703 continue;
9704
9705 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9706 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9707 }
9708
9709 return Vec;
9710}
9711
9712// Returns true if the given node is a type promotion (by concatenating i1
9713// zeros) of the result of a node that already zeros all upper bits of
9714// k-register.
9715// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9716 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
9717 const X86Subtarget &Subtarget,
9718 SelectionDAG & DAG) {
9719 MVT ResVT = Op.getSimpleValueType();
9720 unsigned NumOperands = Op.getNumOperands();
9721 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9722 "Unexpected number of operands in CONCAT_VECTORS");
9723
9724 uint64_t Zeros = 0;
9725 uint64_t NonZeros = 0;
9726 for (unsigned i = 0; i != NumOperands; ++i) {
9727 SDValue SubVec = Op.getOperand(i);
9728 if (SubVec.isUndef())
9729 continue;
9730 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9731 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9732 Zeros |= (uint64_t)1 << i;
9733 else
9734 NonZeros |= (uint64_t)1 << i;
9735 }
9736
9737 unsigned NumElems = ResVT.getVectorNumElements();
9738
9739 // If we are inserting a non-zero vector and there are zeros in the LSBs and
9740 // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
9741 // insert_subvector will give us two kshifts.
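 // E.g. concat (v4i1 zero), X, undef, undef can roughly be handled by
 // widening X to the shift type and emitting a single KSHIFTL by 4 bits.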
9742 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9743 Log2_64(NonZeros) != NumOperands - 1) {
9744 unsigned Idx = Log2_64(NonZeros);
9745 SDValue SubVec = Op.getOperand(Idx);
9746 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9747 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9748 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9749 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9750 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9751 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9752 DAG.getVectorIdxConstant(0, dl));
9753 }
9754
9755 // If there are zero or one non-zeros we can handle this very simply.
9756 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9757 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9758 if (!NonZeros)
9759 return Vec;
9760 unsigned Idx = Log2_64(NonZeros);
9761 SDValue SubVec = Op.getOperand(Idx);
9762 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9763 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9764 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
9765 }
9766
9767 if (NumOperands > 2) {
9768 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9769 ArrayRef<SDUse> Ops = Op->ops();
9770 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9771 Ops.slice(0, NumOperands / 2));
9772 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9773 Ops.slice(NumOperands / 2));
9774 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9775 }
9776
9777 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9778
9779 if (ResVT.getVectorNumElements() >= 16)
9780 return Op; // The operation is legal with KUNPCK
9781
9782 SDValue Vec =
9783 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
9784 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
9785 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9786 DAG.getVectorIdxConstant(NumElems / 2, dl));
9787}
9788
9789 static SDValue LowerCONCAT_VECTORS(SDValue Op,
9790 const X86Subtarget &Subtarget,
9791 SelectionDAG &DAG) {
9792 SDLoc DL(Op);
9793 MVT VT = Op.getSimpleValueType();
9794 if (VT.getVectorElementType() == MVT::i1)
9795 return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
9796
9797 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9798 // from two other 128-bit ones.
9799 // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9800 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9801 (VT.is512BitVector() &&
9802 (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
9803 return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
9804}
9805
9806//===----------------------------------------------------------------------===//
9807// Vector shuffle lowering
9808//
9809// This is an experimental code path for lowering vector shuffles on x86. It is
9810// designed to handle arbitrary vector shuffles and blends, gracefully
9811// degrading performance as necessary. It works hard to recognize idiomatic
9812// shuffles and lower them to optimal instruction patterns without leaving
9813// a framework that allows reasonably efficient handling of all vector shuffle
9814// patterns.
9815//===----------------------------------------------------------------------===//
9816
9817/// Checks whether the vector elements referenced by two shuffle masks are
9818/// equivalent.
9819static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9820 int Idx, int ExpectedIdx) {
9821 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9822 ExpectedIdx < MaskSize && "Out of range element index");
9823 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9824 return false;
9825
9826 EVT VT = Op.getValueType();
9827 EVT ExpectedVT = ExpectedOp.getValueType();
9828
9829 // Sources must be vectors and match the mask's element count.
9830 if (!VT.isVector() || !ExpectedVT.isVector() ||
9831 (int)VT.getVectorNumElements() != MaskSize ||
9832 (int)ExpectedVT.getVectorNumElements() != MaskSize)
9833 return false;
9834
9835 // Exact match.
9836 if (Idx == ExpectedIdx && Op == ExpectedOp)
9837 return true;
9838
9839 switch (Op.getOpcode()) {
9840 case ISD::BUILD_VECTOR:
9841 // If the values are build vectors, we can look through them to find
9842 // equivalent inputs that make the shuffles equivalent.
9843 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9844 case ISD::BITCAST: {
9845 SDValue Src = Op.getOperand(0);
9846 EVT SrcVT = Src.getValueType();
9847 if (Op == ExpectedOp && SrcVT.isVector()) {
9848 if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
9849 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
9850 return (Idx % Scale) == (ExpectedIdx % Scale) &&
9851 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9852 Idx / Scale, ExpectedIdx / Scale);
9853 }
9854 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
9855 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
9856 for (unsigned I = 0; I != Scale; ++I)
9857 if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9858 (Idx * Scale) + I,
9859 (ExpectedIdx * Scale) + I))
9860 return false;
9861 return true;
9862 }
9863 }
9864 break;
9865 }
9866 case ISD::VECTOR_SHUFFLE: {
9867 auto *SVN = cast<ShuffleVectorSDNode>(Op);
9868 return Op == ExpectedOp &&
9869 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
9870 }
9871 case X86ISD::VBROADCAST:
9873 return Op == ExpectedOp;
9875 if (Op == ExpectedOp) {
9876 auto *MemOp = cast<MemSDNode>(Op);
9877 unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
9878 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
9879 }
9880 break;
9881 case X86ISD::VPERMI: {
9882 if (Op == ExpectedOp) {
9883 SmallVector<int, 8> Mask;
9884 DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
9885 SDValue Src = Op.getOperand(0);
9886 return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
9887 Mask[ExpectedIdx]);
9888 }
9889 break;
9890 }
9891 case X86ISD::HADD:
9892 case X86ISD::HSUB:
9893 case X86ISD::FHADD:
9894 case X86ISD::FHSUB:
9895 case X86ISD::PACKSS:
9896 case X86ISD::PACKUS:
9897 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9898 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9899 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9900 int NumElts = VT.getVectorNumElements();
9901 int NumLanes = VT.getSizeInBits() / 128;
9902 int NumEltsPerLane = NumElts / NumLanes;
9903 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9904 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9905 bool SameElt =
9906 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9907 return SameLane && SameElt;
9908 }
9909 break;
9910 }
9911
9912 return false;
9913}
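// Illustrative example (not from the LLVM source; values chosen for
// exposition): in the ISD::BITCAST case above, let Op be a v4i32 bitcast of a
// v2i64 value (MaskSize = 4), so Scale = 64 / 32 = 2. Elements 0 and 2 of the
// v4i32 view are equivalent only if (0 % 2) == (2 % 2) holds and the
// underlying i64 elements 0 and 1 are themselves equivalent (e.g. a splat);
// elements 0 and 1 can never match via this path because they are different
// halves of the same i64 element.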
9914
9915/// Tiny helper function to identify a no-op mask.
9916///
9917/// This is a somewhat boring predicate function. It checks whether the mask
9918/// array input, which is assumed to be a single-input shuffle mask of the kind
9919/// used by the X86 shuffle instructions (not a fully general
9920/// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
9921/// in-place shuffle are 'no-op's.
9922static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9923 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9924 assert(Mask[i] >= -1 && "Out of bound mask element!");
9925 if (Mask[i] >= 0 && Mask[i] != i)
9926 return false;
9927 }
9928 return true;
9929}
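// Illustrative example (not from the LLVM source): for a 4-element mask,
// {0, -1, 2, 3} is a no-op (every defined element stays in place), while
// {1, 0, 2, 3} is not, because element 0 moves.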
9930
9931/// Test whether there are elements crossing LaneSizeInBits lanes in this
9932/// shuffle mask.
9933///
9934/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9935/// and we routinely test for these.
9936static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9937 unsigned ScalarSizeInBits,
9938 ArrayRef<int> Mask) {
9939 assert(LaneSizeInBits && ScalarSizeInBits &&
9940 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9941 "Illegal shuffle lane size");
9942 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9943 int Size = Mask.size();
9944 for (int i = 0; i < Size; ++i)
9945 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9946 return true;
9947 return false;
9948}
9949
9950/// Test whether there are elements crossing 128-bit lanes in this
9951/// shuffle mask.
9952static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9953 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9954}
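// Illustrative example (not from the LLVM source): for v8f32 (two 128-bit
// lanes of four elements), the mask {4,5,6,7,0,1,2,3} is lane-crossing in
// every slot, whereas {1,0,3,2,5,4,7,6} only permutes elements within each
// lane and therefore is not.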
9955
9956/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9957/// from multiple lanes - this differs from isLaneCrossingShuffleMask in order to
9958/// better support 'repeated mask + lane permute' style shuffles.
9959static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9960 unsigned ScalarSizeInBits,
9961 ArrayRef<int> Mask) {
9962 assert(LaneSizeInBits && ScalarSizeInBits &&
9963 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9964 "Illegal shuffle lane size");
9965 int NumElts = Mask.size();
9966 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9967 int NumLanes = NumElts / NumEltsPerLane;
9968 if (NumLanes > 1) {
9969 for (int i = 0; i != NumLanes; ++i) {
9970 int SrcLane = -1;
9971 for (int j = 0; j != NumEltsPerLane; ++j) {
9972 int M = Mask[(i * NumEltsPerLane) + j];
9973 if (M < 0)
9974 continue;
9975 int Lane = (M % NumElts) / NumEltsPerLane;
9976 if (SrcLane >= 0 && SrcLane != Lane)
9977 return true;
9978 SrcLane = Lane;
9979 }
9980 }
9981 }
9982 return false;
9983}
9984
9985/// Test whether a shuffle mask is equivalent within each sub-lane.
9986///
9987/// This checks a shuffle mask to see if it is performing the same
9988/// lane-relative shuffle in each sub-lane. This trivially implies
9989/// that it is also not lane-crossing. It may however involve a blend from the
9990/// same lane of a second vector.
9991///
9992/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9993/// non-trivial to compute in the face of undef lanes. The representation is
9994/// suitable for use with existing 128-bit shuffles as entries from the second
9995/// vector have been remapped to [LaneSize, 2*LaneSize).
9996static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9997 ArrayRef<int> Mask,
9998 SmallVectorImpl<int> &RepeatedMask) {
9999 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10000 RepeatedMask.assign(LaneSize, -1);
10001 int Size = Mask.size();
10002 for (int i = 0; i < Size; ++i) {
10003 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10004 if (Mask[i] < 0)
10005 continue;
10006 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10007 // This entry crosses lanes, so there is no way to model this shuffle.
10008 return false;
10009
10010 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10011 // Adjust second vector indices to start at LaneSize instead of Size.
10012 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10013 : Mask[i] % LaneSize + LaneSize;
10014 if (RepeatedMask[i % LaneSize] < 0)
10015 // This is the first non-undef entry in this slot of a 128-bit lane.
10016 RepeatedMask[i % LaneSize] = LocalM;
10017 else if (RepeatedMask[i % LaneSize] != LocalM)
10018 // Found a mismatch with the repeated mask.
10019 return false;
10020 }
10021 return true;
10022}
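// Illustrative example (not from the LLVM source): for v8f32 with 128-bit
// lanes (LaneSize = 4), the two-input mask {0,8,1,9, 4,12,5,13} performs the
// same relative shuffle in both lanes, so RepeatedMask becomes {0,4,1,5}
// (second-vector indices remapped into [4,8)), i.e. a per-lane UNPCKL pattern.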
10023
10024/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10025static bool
10026is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10027 SmallVectorImpl<int> &RepeatedMask) {
10028 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10029}
10030
10031static bool
10032is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10033 SmallVector<int, 32> RepeatedMask;
10034 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10035}
10036
10037/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10038static bool
10039is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10040 SmallVectorImpl<int> &RepeatedMask) {
10041 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10042}
10043
10044/// Test whether a target shuffle mask is equivalent within each sub-lane.
10045/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10046static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10047 unsigned EltSizeInBits,
10048 ArrayRef<int> Mask,
10049 SmallVectorImpl<int> &RepeatedMask) {
10050 int LaneSize = LaneSizeInBits / EltSizeInBits;
10051 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10052 int Size = Mask.size();
10053 for (int i = 0; i < Size; ++i) {
10054 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10055 if (Mask[i] == SM_SentinelUndef)
10056 continue;
10057 if (Mask[i] == SM_SentinelZero) {
10058 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10059 return false;
10060 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10061 continue;
10062 }
10063 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10064 // This entry crosses lanes, so there is no way to model this shuffle.
10065 return false;
10066
10067 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10068 // later vector indices to start at multiples of LaneSize instead of Size.
10069 int LaneM = Mask[i] / Size;
10070 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
10071 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10072 // This is the first non-undef entry in this slot of a 128-bit lane.
10073 RepeatedMask[i % LaneSize] = LocalM;
10074 else if (RepeatedMask[i % LaneSize] != LocalM)
10075 // Found a mismatch with the repeated mask.
10076 return false;
10077 }
10078 return true;
10079}
10080
10081/// Test whether a target shuffle mask is equivalent within each sub-lane.
10082/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10083static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10084 ArrayRef<int> Mask,
10085 SmallVectorImpl<int> &RepeatedMask) {
10086 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10087 Mask, RepeatedMask);
10088}
10089
10090/// Checks whether a shuffle mask is equivalent to an explicit list of
10091/// arguments.
10092///
10093/// This is a fast way to test a shuffle mask against a fixed pattern:
10094///
10095/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
10096///
10097/// It returns true if the mask is exactly as wide as the argument list, and
10098/// each element of the mask is either -1 (signifying undef) or the value given
10099/// in the argument.
10100static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10101 SDValue V1 = SDValue(),
10102 SDValue V2 = SDValue()) {
10103 int Size = Mask.size();
10104 if (Size != (int)ExpectedMask.size())
10105 return false;
10106
10107 for (int i = 0; i < Size; ++i) {
10108 assert(Mask[i] >= -1 && "Out of bound mask element!");
10109 int MaskIdx = Mask[i];
10110 int ExpectedIdx = ExpectedMask[i];
10111 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10112 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10113 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10114 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10115 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10116 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10117 return false;
10118 }
10119 }
10120 return true;
10121}
10122
10123/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10124///
10125/// The masks must be exactly the same width.
10126///
10127/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10128/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10129///
10130/// SM_SentinelZero is accepted as a valid negative index but must match in
10131/// both, or via a known bits test.
10132static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
10133 ArrayRef<int> ExpectedMask,
10134 const SelectionDAG &DAG,
10135 SDValue V1 = SDValue(),
10136 SDValue V2 = SDValue()) {
10137 int Size = Mask.size();
10138 if (Size != (int)ExpectedMask.size())
10139 return false;
10140 assert(llvm::all_of(ExpectedMask,
10141 [Size](int M) {
10142 return M == SM_SentinelZero ||
10143 isInRange(M, 0, 2 * Size);
10144 }) &&
10145 "Illegal target shuffle mask");
10146
10147 // Check for out-of-range target shuffle mask indices.
10148 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10149 return false;
10150
10151 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10152 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
10153 !V1.getValueType().isVector()))
10154 V1 = SDValue();
10155 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
10156 !V2.getValueType().isVector()))
10157 V2 = SDValue();
10158
10159 APInt ZeroV1 = APInt::getZero(Size);
10160 APInt ZeroV2 = APInt::getZero(Size);
10161
10162 for (int i = 0; i < Size; ++i) {
10163 int MaskIdx = Mask[i];
10164 int ExpectedIdx = ExpectedMask[i];
10165 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10166 continue;
10167 // If we failed to match an expected SM_SentinelZero then early out.
10168 if (ExpectedIdx < 0)
10169 return false;
10170 if (MaskIdx == SM_SentinelZero) {
10171 // If we need this expected index to be a zero element, then update the
10172 // relevant zero mask and perform the known bits at the end to minimize
10173 // repeated computes.
10174 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10175 if (ExpectedV &&
10176 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
10177 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10178 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
10179 ZeroMask.setBit(BitIdx);
10180 continue;
10181 }
10182 }
10183 if (MaskIdx >= 0) {
10184 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10185 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10186 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10187 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10188 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10189 continue;
10190 }
10191 return false;
10192 }
10193 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
10194 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
10195}
10196
10197// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10198// instructions.
10199static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
10200 const SelectionDAG &DAG) {
10201 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10202 return false;
10203
10204 SmallVector<int, 8> Unpcklwd;
10205 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10206 /* Unary = */ false);
10207 SmallVector<int, 8> Unpckhwd;
10208 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10209 /* Unary = */ false);
10210 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
10211 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
10212 return IsUnpackwdMask;
10213}
10214
10215static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
10216 const SelectionDAG &DAG) {
10217 // Create 128-bit vector type based on mask size.
10218 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10219 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10220
10221 // We can't assume a canonical shuffle mask, so try the commuted version too.
10222 SmallVector<int, 4> CommutedMask(Mask);
10223  ShuffleVectorSDNode::commuteMask(CommutedMask);
10224
10225 // Match any of unary/binary or low/high.
10226 for (unsigned i = 0; i != 4; ++i) {
10227 SmallVector<int, 16> UnpackMask;
10228 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10229 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
10230 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
10231 return true;
10232 }
10233 return false;
10234}
10235
10236/// Return true if a shuffle mask chooses elements identically in its top and
10237/// bottom halves. For example, any splat mask has the same top and bottom
10238/// halves. If an element is undefined in only one half of the mask, the halves
10239/// are not considered identical.
10240static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
10241 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10242 unsigned HalfSize = Mask.size() / 2;
10243 for (unsigned i = 0; i != HalfSize; ++i) {
10244 if (Mask[i] != Mask[i + HalfSize])
10245 return false;
10246 }
10247 return true;
10248}
10249
10250/// Get a 4-lane 8-bit shuffle immediate for a mask.
10251///
10252/// This helper function produces an 8-bit shuffle immediate corresponding to
10253/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10254/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10255/// example.
10256///
10257/// NB: We rely heavily on "undef" masks preserving the input lane.
10258static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10259 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10260 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10261 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10262 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10263 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10264
10265 // If the mask only uses one non-undef element, then fully 'splat' it to
10266 // improve later broadcast matching.
10267 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10268 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10269
10270 int FirstElt = Mask[FirstIndex];
10271 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10272 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10273
10274 unsigned Imm = 0;
10275 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10276 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10277 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10278 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10279 return Imm;
10280}
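// Illustrative example (not from the LLVM source):
//   getV4X86ShuffleImm({1, 0, 3, 2}) == (2 << 6) | (3 << 4) | (0 << 2) | 1
//                                    == 0xB1,
// the PSHUFD/SHUFPS-style immediate that swaps adjacent elements, while a mask
// such as {-1, 2, -1, -1} takes the splat path and yields 0xAA (element 2
// replicated into all four positions).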
10281
10282static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10283 SelectionDAG &DAG) {
10284 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10285}
10286
10287// Canonicalize SHUFPD mask to improve chances of further folding.
10288// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10289static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10290 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10291 "Unexpected SHUFPD mask size");
10292 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10293 "Unexpected SHUFPD mask elements");
10294
10295 // If the mask only uses one non-undef element, then fully 'splat' it to
10296 // improve later broadcast matching.
10297 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10298 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10299 "All undef shuffle mask");
10300
10301 int FirstElt = Mask[FirstIndex];
10302 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10303 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10304 unsigned Imm = 0;
10305 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10306 Imm |= FirstElt << I;
10307 return Imm;
10308 }
10309
10310 // Attempt to keep any undef elements in place to improve chances of the
10311 // shuffle becoming a (commutative) blend.
10312 unsigned Imm = 0;
10313 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10314 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10315
10316 return Imm;
10317}
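// Illustrative example (not from the LLVM source): for a v4f64 SHUFPD-style
// mask {0, 1, 0, 1} the immediate is (1 << 1) | (1 << 3) = 0xA, while
// {-1, 1, -1, 1} hits the splat path (element 1 appears more than once) and
// yields 0xF.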
10318
10319static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
10320 SelectionDAG &DAG) {
10321 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10322}
10323
10324// The shuffle result has the form:
10325//   0*, a[0], 0*, a[1], ..., 0*, a[n] with n >= 0, where the a[] elements
10326//   appear in ascending order. Each element of Zeroable corresponds to a
10327//   particular Mask element, as described in computeZeroableShuffleElements.
10328//
10329// The function looks for a sub-mask whose non-zero elements are in
10330// increasing order; if such a sub-mask exists, it returns true.
10331static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10332 ArrayRef<int> Mask, const EVT &VectorType,
10333 bool &IsZeroSideLeft) {
10334 int NextElement = -1;
10335 // Check if the Mask's nonzero elements are in increasing order.
10336 for (int i = 0, e = Mask.size(); i < e; i++) {
10337    // Check that the mask's zero elements are built from only zeros.
10338 assert(Mask[i] >= -1 && "Out of bound mask element!");
10339 if (Mask[i] < 0)
10340 return false;
10341 if (Zeroable[i])
10342 continue;
10343    // Find the lowest non-zero element.
10344 if (NextElement < 0) {
10345 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10346 IsZeroSideLeft = NextElement != 0;
10347 }
10348    // Exit if the mask's non-zero elements are not in increasing order.
10349 if (NextElement != Mask[i])
10350 return false;
10351 NextElement++;
10352 }
10353 return true;
10354}
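// Illustrative example (not from the LLVM source): for a v4i32 shuffle with
// Mask = {4, 0, 4, 1} where elements 0 and 2 are zeroable, the non-zero mask
// elements select source elements 0 and then 1 in increasing order, so the
// function returns true and the shuffle is a VEXPAND candidate (the low source
// elements are scattered into the non-zero result positions).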
10355
10356static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
10357                                      ArrayRef<SDValue> Ops, SelectionDAG &DAG,
10358 const X86Subtarget &Subtarget,
10359 unsigned Depth = 0);
10360
10361/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10362static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10363 ArrayRef<int> Mask, SDValue V1,
10364 SDValue V2, const APInt &Zeroable,
10365 const X86Subtarget &Subtarget,
10366 SelectionDAG &DAG) {
10367 int Size = Mask.size();
10368 int LaneSize = 128 / VT.getScalarSizeInBits();
10369 const int NumBytes = VT.getSizeInBits() / 8;
10370 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10371
10372 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10373 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10374 (Subtarget.hasBWI() && VT.is512BitVector()));
10375
10376 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10377 // Sign bit set in i8 mask means zero element.
10378 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10379
10380 SDValue V;
10381 for (int i = 0; i < NumBytes; ++i) {
10382 int M = Mask[i / NumEltBytes];
10383 if (M < 0) {
10384 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10385 continue;
10386 }
10387 if (Zeroable[i / NumEltBytes]) {
10388 PSHUFBMask[i] = ZeroMask;
10389 continue;
10390 }
10391
10392 // We can only use a single input of V1 or V2.
10393 SDValue SrcV = (M >= Size ? V2 : V1);
10394 if (V && V != SrcV)
10395 return SDValue();
10396 V = SrcV;
10397 M %= Size;
10398
10399 // PSHUFB can't cross lanes, ensure this doesn't happen.
10400 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10401 return SDValue();
10402
10403 M = M % LaneSize;
10404 M = M * NumEltBytes + (i % NumEltBytes);
10405 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10406 }
10407 assert(V && "Failed to find a source input");
10408
10409 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10410 return DAG.getBitcast(
10411 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10412 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10413}
10414
10415static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10416 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10417 const SDLoc &dl);
10418
10419// X86 has a dedicated shuffle that can be lowered to VEXPAND.
10420static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
10421 SDValue V2, ArrayRef<int> Mask,
10422 const APInt &Zeroable,
10423 const X86Subtarget &Subtarget,
10424 SelectionDAG &DAG) {
10425 bool IsLeftZeroSide = true;
10426 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10427 IsLeftZeroSide))
10428 return SDValue();
10429 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10430  MVT IntegerType =
10431 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10432 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10433 unsigned NumElts = VT.getVectorNumElements();
10434 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10435 "Unexpected number of vector elements");
10436 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10437 Subtarget, DAG, DL);
10438 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10439 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10440 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10441}
10442
10443static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10444 unsigned &UnpackOpcode, bool IsUnary,
10445 ArrayRef<int> TargetMask, const SDLoc &DL,
10446 SelectionDAG &DAG,
10447 const X86Subtarget &Subtarget) {
10448 int NumElts = VT.getVectorNumElements();
10449
10450 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10451 for (int i = 0; i != NumElts; i += 2) {
10452 int M1 = TargetMask[i + 0];
10453 int M2 = TargetMask[i + 1];
10454 Undef1 &= (SM_SentinelUndef == M1);
10455 Undef2 &= (SM_SentinelUndef == M2);
10456 Zero1 &= isUndefOrZero(M1);
10457 Zero2 &= isUndefOrZero(M2);
10458 }
10459 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10460 "Zeroable shuffle detected");
10461
10462 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10463 SmallVector<int, 64> Unpckl, Unpckh;
10464 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10465 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10466 (IsUnary ? V1 : V2))) {
10467 UnpackOpcode = X86ISD::UNPCKL;
10468 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10469 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10470 return true;
10471 }
10472
10473 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10474 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10475 (IsUnary ? V1 : V2))) {
10476 UnpackOpcode = X86ISD::UNPCKH;
10477 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10478 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10479 return true;
10480 }
10481
10482 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
10483 if (IsUnary && (Zero1 || Zero2)) {
10484 // Don't bother if we can blend instead.
10485 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10486 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10487 return false;
10488
10489 bool MatchLo = true, MatchHi = true;
10490 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10491 int M = TargetMask[i];
10492
10493 // Ignore if the input is known to be zero or the index is undef.
10494 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10495 (M == SM_SentinelUndef))
10496 continue;
10497
10498 MatchLo &= (M == Unpckl[i]);
10499 MatchHi &= (M == Unpckh[i]);
10500 }
10501
10502 if (MatchLo || MatchHi) {
10503 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10504 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10505 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10506 return true;
10507 }
10508 }
10509
10510 // If a binary shuffle, commute and try again.
10511 if (!IsUnary) {
10512    ShuffleVectorSDNode::commuteMask(Unpckl);
10513 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10514 UnpackOpcode = X86ISD::UNPCKL;
10515 std::swap(V1, V2);
10516 return true;
10517 }
10518
10519    ShuffleVectorSDNode::commuteMask(Unpckh);
10520 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10521 UnpackOpcode = X86ISD::UNPCKH;
10522 std::swap(V1, V2);
10523 return true;
10524 }
10525 }
10526
10527 return false;
10528}
10529
10530// X86 has dedicated unpack instructions that can handle specific blend
10531// operations: UNPCKH and UNPCKL.
10532static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
10533 SDValue V2, ArrayRef<int> Mask,
10534 SelectionDAG &DAG) {
10535 SmallVector<int, 8> Unpckl;
10536 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10537 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10538 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10539
10540 SmallVector<int, 8> Unpckh;
10541 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10542 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10543 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10544
10545 // Commute and try again.
10546  ShuffleVectorSDNode::commuteMask(Unpckl);
10547 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10548 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10549
10550  ShuffleVectorSDNode::commuteMask(Unpckh);
10551 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10552 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10553
10554 return SDValue();
10555}
10556
10557/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10558/// followed by unpack 256-bit.
10559static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1,
10560 SDValue V2, ArrayRef<int> Mask,
10561 SelectionDAG &DAG) {
10562 SmallVector<int, 32> Unpckl, Unpckh;
10563 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10564 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10565
10566 unsigned UnpackOpcode;
10567 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10568 UnpackOpcode = X86ISD::UNPCKL;
10569 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10570 UnpackOpcode = X86ISD::UNPCKH;
10571 else
10572 return SDValue();
10573
10574 // This is a "natural" unpack operation (rather than the 128-bit sectored
10575 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10576 // input in order to use the x86 instruction.
10577 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10578 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10579 V1 = DAG.getBitcast(VT, V1);
10580 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10581}
10582
10583// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10584// source into the lower elements and zeroing the upper elements.
10585static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10586 ArrayRef<int> Mask, const APInt &Zeroable,
10587 const X86Subtarget &Subtarget) {
10588 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10589 return false;
10590
10591 unsigned NumElts = Mask.size();
10592 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10593 unsigned MaxScale = 64 / EltSizeInBits;
10594
10595 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10596 unsigned SrcEltBits = EltSizeInBits * Scale;
10597 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10598 continue;
10599 unsigned NumSrcElts = NumElts / Scale;
10600 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10601 continue;
10602 unsigned UpperElts = NumElts - NumSrcElts;
10603 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10604 continue;
10605 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10606 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10607 DstVT = MVT::getIntegerVT(EltSizeInBits);
10608 if ((NumSrcElts * EltSizeInBits) >= 128) {
10609 // ISD::TRUNCATE
10610 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10611 } else {
10612 // X86ISD::VTRUNC
10613 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10614 }
10615 return true;
10616 }
10617
10618 return false;
10619}
10620
10621// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10622// element padding to the final DstVT.
10623static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10624 const X86Subtarget &Subtarget,
10625 SelectionDAG &DAG, bool ZeroUppers) {
10626 MVT SrcVT = Src.getSimpleValueType();
10627 MVT DstSVT = DstVT.getScalarType();
10628 unsigned NumDstElts = DstVT.getVectorNumElements();
10629 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10630 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10631
10632 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10633 return SDValue();
10634
10635 // Perform a direct ISD::TRUNCATE if possible.
10636 if (NumSrcElts == NumDstElts)
10637 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10638
10639 if (NumSrcElts > NumDstElts) {
10640 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10641 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10642 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10643 }
10644
10645 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10646 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10647 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10648 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10649 DstVT.getSizeInBits());
10650 }
10651
10652 // Non-VLX targets must truncate from a 512-bit type, so we need to
10653 // widen, truncate and then possibly extract the original subvector.
10654 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10655 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10656 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10657 }
10658
10659 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10660 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10661 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10662 if (DstVT != TruncVT)
10663 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10664 DstVT.getSizeInBits());
10665 return Trunc;
10666}
10667
10668// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10669//
10670// An example is the following:
10671//
10672// t0: ch = EntryToken
10673// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10674// t25: v4i32 = truncate t2
10675// t41: v8i16 = bitcast t25
10676// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10677// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10678// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10679// t18: v2i64 = bitcast t51
10680//
10681// One can just use a single vpmovdw instruction; without avx512vl we need to
10682// use the zmm variant and extract the lower subvector, padding with zeroes.
10683// TODO: Merge with lowerShuffleAsVTRUNC.
10684static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10685 SDValue V2, ArrayRef<int> Mask,
10686 const APInt &Zeroable,
10687 const X86Subtarget &Subtarget,
10688 SelectionDAG &DAG) {
10689 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10690 if (!Subtarget.hasAVX512())
10691 return SDValue();
10692
10693 unsigned NumElts = VT.getVectorNumElements();
10694 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10695 unsigned MaxScale = 64 / EltSizeInBits;
10696 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10697 unsigned SrcEltBits = EltSizeInBits * Scale;
10698 unsigned NumSrcElts = NumElts / Scale;
10699 unsigned UpperElts = NumElts - NumSrcElts;
10700 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10701 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10702 continue;
10703
10704 // Attempt to find a matching source truncation, but as a fall back VLX
10705 // cases can use the VPMOV directly.
10706 SDValue Src = peekThroughBitcasts(V1);
10707 if (Src.getOpcode() == ISD::TRUNCATE &&
10708 Src.getScalarValueSizeInBits() == SrcEltBits) {
10709 Src = Src.getOperand(0);
10710 } else if (Subtarget.hasVLX()) {
10711 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10712 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10713 Src = DAG.getBitcast(SrcVT, Src);
10714 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10715 if (Scale == 2 &&
10716 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10717 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10718 return SDValue();
10719 } else
10720 return SDValue();
10721
10722 // VPMOVWB is only available with avx512bw.
10723 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10724 return SDValue();
10725
10726 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10727 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10728 }
10729
10730 return SDValue();
10731}
10732
10733// Attempt to match binary shuffle patterns as a truncate.
10734static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10735 SDValue V2, ArrayRef<int> Mask,
10736 const APInt &Zeroable,
10737 const X86Subtarget &Subtarget,
10738 SelectionDAG &DAG) {
10739 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10740 "Unexpected VTRUNC type");
10741 if (!Subtarget.hasAVX512() ||
10742 (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
10743 return SDValue();
10744
10745 unsigned NumElts = VT.getVectorNumElements();
10746 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10747 unsigned MaxScale = 64 / EltSizeInBits;
10748 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10749 // TODO: Support non-BWI VPMOVWB truncations?
10750 unsigned SrcEltBits = EltSizeInBits * Scale;
10751 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10752 continue;
10753
10754 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10755 // Bail if the V2 elements are undef.
10756 unsigned NumHalfSrcElts = NumElts / Scale;
10757 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10758 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10759 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10760 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10761 continue;
10762
10763 // The elements beyond the truncation must be undef/zero.
10764 unsigned UpperElts = NumElts - NumSrcElts;
10765 if (UpperElts > 0 &&
10766 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10767 continue;
10768 bool UndefUppers =
10769 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10770
10771 // As we're using both sources then we need to concat them together
10772 // and truncate from the double-sized src.
10773 MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
10774
10775 // For offset truncations, ensure that the concat is cheap.
10776 SDValue Src =
10777 combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
10778 if (!Src) {
10779 if (Offset)
10780 continue;
10781 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10782 }
10783
10784 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10785 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10786 Src = DAG.getBitcast(SrcVT, Src);
10787
10788 // Shift the offset'd elements into place for the truncation.
10789 // TODO: Use getTargetVShiftByConstNode.
10790 if (Offset)
10791 Src = DAG.getNode(
10792 X86ISD::VSRLI, DL, SrcVT, Src,
10793 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10794
10795 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10796 }
10797 }
10798
10799 return SDValue();
10800}
10801
10802/// Check whether a compaction lowering can be done by dropping even/odd
10803/// elements and compute how many times even/odd elements must be dropped.
10804///
10805/// This handles shuffles which take every Nth element where N is a power of
10806/// two. Example shuffle masks:
10807///
10808/// (even)
10809/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10810/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10811/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10812/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10813/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10814/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10815///
10816/// (odd)
10817/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10818/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10819///
10820/// Any of these lanes can of course be undef.
10821///
10822/// This routine only supports N <= 3.
10823/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10824/// for larger N.
10825///
10826/// \returns N above, or the number of times even/odd elements must be dropped
10827/// if there is such a number. Otherwise returns zero.
10828static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10829 bool IsSingleInput) {
10830 // The modulus for the shuffle vector entries is based on whether this is
10831 // a single input or not.
10832 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10833 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10834 "We should only be called with masks with a power-of-2 size!");
10835
10836 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10837 int Offset = MatchEven ? 0 : 1;
10838
10839 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10840 // and 2^3 simultaneously. This is because we may have ambiguity with
10841 // partially undef inputs.
10842 bool ViableForN[3] = {true, true, true};
10843
10844 for (int i = 0, e = Mask.size(); i < e; ++i) {
10845 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10846 // want.
10847 if (Mask[i] < 0)
10848 continue;
10849
10850 bool IsAnyViable = false;
10851 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10852 if (ViableForN[j]) {
10853 uint64_t N = j + 1;
10854
10855 // The shuffle mask must be equal to (i * 2^N) % M.
10856 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10857 IsAnyViable = true;
10858 else
10859 ViableForN[j] = false;
10860 }
10861 // Early exit if we exhaust the possible powers of two.
10862 if (!IsAnyViable)
10863 break;
10864 }
10865
10866 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10867 if (ViableForN[j])
10868 return j + 1;
10869
10870 // Return 0 as there is no viable power of two.
10871 return 0;
10872}
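// Illustrative example (not from the LLVM source): for a two-input 8-element
// shuffle, the mask {0,2,4,6,8,10,12,14} keeps every other (even) element, so
// the routine returns N = 1 (a single PACK-style compaction stage), whereas
// {0,4,8,12,0,4,8,12} returns N = 2.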
10873
10874// X86 has dedicated pack instructions that can handle specific truncation
10875// operations: PACKSS and PACKUS.
10876// Checks for compaction shuffle masks if MaxStages > 1.
10877// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10878static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10879 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10880 const SelectionDAG &DAG,
10881 const X86Subtarget &Subtarget,
10882 unsigned MaxStages = 1) {
10883 unsigned NumElts = VT.getVectorNumElements();
10884 unsigned BitSize = VT.getScalarSizeInBits();
10885 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10886 "Illegal maximum compaction");
10887
10888 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10889 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10890 unsigned NumPackedBits = NumSrcBits - BitSize;
10891 N1 = peekThroughBitcasts(N1);
10892 N2 = peekThroughBitcasts(N2);
10893 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10894 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10895 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10896 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10897 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10898 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10899 return false;
10900 if (Subtarget.hasSSE41() || BitSize == 8) {
10901 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10902 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10903 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10904 V1 = N1;
10905 V2 = N2;
10906 SrcVT = PackVT;
10907 PackOpcode = X86ISD::PACKUS;
10908 return true;
10909 }
10910 }
10911 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10912 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10913 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10914 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10915 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10916 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10917 V1 = N1;
10918 V2 = N2;
10919 SrcVT = PackVT;
10920 PackOpcode = X86ISD::PACKSS;
10921 return true;
10922 }
10923 return false;
10924 };
10925
10926 // Attempt to match against wider and wider compaction patterns.
10927 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10928 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10929 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10930
10931 // Try binary shuffle.
10932 SmallVector<int, 32> BinaryMask;
10933 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10934 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10935 if (MatchPACK(V1, V2, PackVT))
10936 return true;
10937
10938 // Try unary shuffle.
10939 SmallVector<int, 32> UnaryMask;
10940 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10941 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10942 if (MatchPACK(V1, V1, PackVT))
10943 return true;
10944 }
10945
10946 return false;
10947}
10948
10949static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1,
10950 SDValue V2, ArrayRef<int> Mask,
10951 const X86Subtarget &Subtarget,
10952 SelectionDAG &DAG) {
10953 MVT PackVT;
10954 unsigned PackOpcode;
10955 unsigned SizeBits = VT.getSizeInBits();
10956 unsigned EltBits = VT.getScalarSizeInBits();
10957 unsigned MaxStages = Log2_32(64 / EltBits);
10958 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10959 Subtarget, MaxStages))
10960 return SDValue();
10961
10962 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10963 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10964
10965 // Don't lower multi-stage packs on AVX512, truncation is better.
10966 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10967 return SDValue();
10968
10969 // Pack to the largest type possible:
10970 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10971 unsigned MaxPackBits = 16;
10972 if (CurrentEltBits > 16 &&
10973 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10974 MaxPackBits = 32;
10975
10976 // Repeatedly pack down to the target size.
10977 SDValue Res;
10978 for (unsigned i = 0; i != NumStages; ++i) {
10979 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10980 unsigned NumSrcElts = SizeBits / SrcEltBits;
10981 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10982 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10983 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10984 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10985 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10986 DAG.getBitcast(SrcVT, V2));
10987 V1 = V2 = Res;
10988 CurrentEltBits /= 2;
10989 }
10990 assert(Res && Res.getValueType() == VT &&
10991 "Failed to lower compaction shuffle");
10992 return Res;
10993}
10994
10995/// Try to emit a bitmask instruction for a shuffle.
10996///
10997/// This handles cases where we can model a blend exactly as a bitmask due to
10998/// one of the inputs being zeroable.
10999static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
11000 SDValue V2, ArrayRef<int> Mask,
11001 const APInt &Zeroable,
11002 const X86Subtarget &Subtarget,
11003 SelectionDAG &DAG) {
11004 MVT MaskVT = VT;
11005 MVT EltVT = VT.getVectorElementType();
11006 SDValue Zero, AllOnes;
11007 // Use f64 if i64 isn't legal.
11008 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11009 EltVT = MVT::f64;
11010 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11011 }
11012
11013 MVT LogicVT = VT;
11014 if (EltVT.isFloatingPoint()) {
11015 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11016 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
11017 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11018 LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size());
11019 } else {
11020 Zero = DAG.getConstant(0, DL, EltVT);
11021 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11022 }
11023
11024 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11025 SDValue V;
11026 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11027 if (Zeroable[i])
11028 continue;
11029 if (Mask[i] % Size != i)
11030 return SDValue(); // Not a blend.
11031 if (!V)
11032 V = Mask[i] < Size ? V1 : V2;
11033 else if (V != (Mask[i] < Size ? V1 : V2))
11034 return SDValue(); // Can only let one input through the mask.
11035
11036 VMaskOps[i] = AllOnes;
11037 }
11038 if (!V)
11039 return SDValue(); // No non-zeroable elements!
11040
11041 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11042 VMask = DAG.getBitcast(LogicVT, VMask);
11043 V = DAG.getBitcast(LogicVT, V);
11044 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11045 return DAG.getBitcast(VT, And);
11046}
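// Illustrative example (not from the LLVM source): for a v4i32 shuffle with
// Mask = {0, 1, 6, 7} where elements 2 and 3 are zeroable, only V1's elements
// 0 and 1 survive in place, so the shuffle is emitted as an AND of V1 with the
// constant vector {-1, -1, 0, 0}.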
11047
11048/// Try to emit a blend instruction for a shuffle using bit math.
11049///
11050/// This is used as a fallback approach when first class blend instructions are
11051/// unavailable. Currently it is only suitable for integer vectors, but could
11052/// be generalized for floating point vectors if desirable.
11053static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
11054 SDValue V2, ArrayRef<int> Mask,
11055 SelectionDAG &DAG) {
11056 assert(VT.isInteger() && "Only supports integer vector types!");
11057 MVT EltVT = VT.getVectorElementType();
11058 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11059 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11060  SmallVector<SDValue, 16> MaskOps;
11061 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11062 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11063 return SDValue(); // Shuffled input!
11064 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11065 }
11066
11067 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11068 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
11069}
11070
11071static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
11072 SDValue PreservedSrc,
11073 const X86Subtarget &Subtarget,
11074 SelectionDAG &DAG);
11075
11076static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
11077                                MutableArrayRef<int> Mask,
11078 const APInt &Zeroable, bool &ForceV1Zero,
11079 bool &ForceV2Zero, uint64_t &BlendMask) {
11080 bool V1IsZeroOrUndef =
11081      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
11082 bool V2IsZeroOrUndef =
11083      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
11084
11085 BlendMask = 0;
11086 ForceV1Zero = false, ForceV2Zero = false;
11087 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11088
11089 int NumElts = Mask.size();
11090 int NumLanes = VT.getSizeInBits() / 128;
11091 int NumEltsPerLane = NumElts / NumLanes;
11092 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
11093
11094 // For 32/64-bit elements, if we only reference one input (plus any undefs),
11095 // then ensure the blend mask part for that lane just references that input.
11096 bool ForceWholeLaneMasks =
11097 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
11098
11099 // Attempt to generate the binary blend mask. If an input is zero then
11100 // we can use any lane.
11101 for (int Lane = 0; Lane != NumLanes; ++Lane) {
11102 // Keep track of the inputs used per lane.
11103 bool LaneV1InUse = false;
11104 bool LaneV2InUse = false;
11105 uint64_t LaneBlendMask = 0;
11106 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
11107 int Elt = (Lane * NumEltsPerLane) + LaneElt;
11108 int M = Mask[Elt];
11109 if (M == SM_SentinelUndef)
11110 continue;
11111 if (M == Elt || (0 <= M && M < NumElts &&
11112 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
11113 Mask[Elt] = Elt;
11114 LaneV1InUse = true;
11115 continue;
11116 }
11117 if (M == (Elt + NumElts) ||
11118 (NumElts <= M &&
11119 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
11120 LaneBlendMask |= 1ull << LaneElt;
11121 Mask[Elt] = Elt + NumElts;
11122 LaneV2InUse = true;
11123 continue;
11124 }
11125 if (Zeroable[Elt]) {
11126 if (V1IsZeroOrUndef) {
11127 ForceV1Zero = true;
11128 Mask[Elt] = Elt;
11129 LaneV1InUse = true;
11130 continue;
11131 }
11132 if (V2IsZeroOrUndef) {
11133 ForceV2Zero = true;
11134 LaneBlendMask |= 1ull << LaneElt;
11135 Mask[Elt] = Elt + NumElts;
11136 LaneV2InUse = true;
11137 continue;
11138 }
11139 }
11140 return false;
11141 }
11142
11143 // If we only used V2 then splat the lane blend mask to avoid any demanded
11144 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
11145 // blend mask bit).
11146 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
11147 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
11148
11149 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
11150 }
11151 return true;
11152}
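// Illustrative example (not from the LLVM source): for a v4i32 shuffle with
// Mask = {0, 5, 2, 7}, elements 1 and 3 are taken in place from V2, so the
// match succeeds with BlendMask = 0b1010 and no forced zeroing - exactly the
// immediate form consumed by the blend lowering below.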
11153
11154/// Try to emit a blend instruction for a shuffle.
11155///
11156/// This doesn't do any checks for the availability of instructions for blending
11157/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11158/// be matched in the backend with the type given. What it does check for is
11159/// that the shuffle mask is a blend, or convertible into a blend with zero.
11160static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
11161 SDValue V2, ArrayRef<int> Original,
11162 const APInt &Zeroable,
11163 const X86Subtarget &Subtarget,
11164 SelectionDAG &DAG) {
11165 uint64_t BlendMask = 0;
11166 bool ForceV1Zero = false, ForceV2Zero = false;
11167 SmallVector<int, 64> Mask(Original);
11168 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11169 BlendMask))
11170 return SDValue();
11171
11172 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11173 if (ForceV1Zero)
11174 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11175 if (ForceV2Zero)
11176 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11177
11178 unsigned NumElts = VT.getVectorNumElements();
11179
11180 switch (VT.SimpleTy) {
11181 case MVT::v4i64:
11182 case MVT::v8i32:
11183 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11184 [[fallthrough]];
11185 case MVT::v4f64:
11186 case MVT::v8f32:
11187 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11188 [[fallthrough]];
11189 case MVT::v2f64:
11190 case MVT::v2i64:
11191 case MVT::v4f32:
11192 case MVT::v4i32:
11193 case MVT::v8i16:
11194 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11195 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11196 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11197 case MVT::v16i16: {
11198 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11199 SmallVector<int, 8> RepeatedMask;
11200 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11201 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11202 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11203 BlendMask = 0;
11204 for (int i = 0; i < 8; ++i)
11205 if (RepeatedMask[i] >= 8)
11206 BlendMask |= 1ull << i;
11207 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11208 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11209 }
11210 // Use PBLENDW for lower/upper lanes and then blend lanes.
11211 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11212 // merge to VSELECT where useful.
11213 uint64_t LoMask = BlendMask & 0xFF;
11214 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11215 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11216 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11217 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11218 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11219 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11220 return DAG.getVectorShuffle(
11221 MVT::v16i16, DL, Lo, Hi,
11222 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11223 }
11224 [[fallthrough]];
11225 }
11226 case MVT::v32i8:
11227 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11228 [[fallthrough]];
11229 case MVT::v16i8: {
11230 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11231
11232 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11233 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11234 Subtarget, DAG))
11235 return Masked;
11236
11237 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11238 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11239 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11240 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11241 }
11242
11243 // If we have VPTERNLOG, we can use that as a bit blend.
11244 if (Subtarget.hasVLX())
11245 if (SDValue BitBlend =
11246 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11247 return BitBlend;
11248
11249 // Scale the blend by the number of bytes per element.
11250 int Scale = VT.getScalarSizeInBits() / 8;
11251
11252 // This form of blend is always done on bytes. Compute the byte vector
11253 // type.
11254 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11255
11256 // x86 allows load folding with blendvb from the 2nd source operand. But
11257 // we are still using LLVM select here (see comment below), so that's V1.
11258 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11259 // allow that load-folding possibility.
11260 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11261      ShuffleVectorSDNode::commuteMask(Mask);
11262 std::swap(V1, V2);
11263 }
11264
11265 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11266 // mix of LLVM's code generator and the x86 backend. We tell the code
11267 // generator that boolean values in the elements of an x86 vector register
11268 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11269 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11270 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11271 // of the element (the remaining are ignored) and 0 in that high bit would
11272 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11273 // the LLVM model for boolean values in vector elements gets the relevant
11274 // bit set, it is set backwards and over constrained relative to x86's
11275 // actual model.
11276 SmallVector<SDValue, 32> VSELECTMask;
11277 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11278 for (int j = 0; j < Scale; ++j)
11279 VSELECTMask.push_back(
11280 Mask[i] < 0
11281 ? DAG.getUNDEF(MVT::i8)
11282 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11283
11284 V1 = DAG.getBitcast(BlendVT, V1);
11285 V2 = DAG.getBitcast(BlendVT, V2);
11286 return DAG.getBitcast(
11287 VT,
11288 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11289 V1, V2));
11290 }
11291 case MVT::v16f32:
11292 case MVT::v8f64:
11293 case MVT::v8i64:
11294 case MVT::v16i32:
11295 case MVT::v32i16:
11296 case MVT::v64i8: {
11297 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11298 bool OptForSize = DAG.shouldOptForSize();
11299 if (!OptForSize) {
11300 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11301 Subtarget, DAG))
11302 return Masked;
11303 }
11304
11305 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11306 // masked move.
11307 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11308 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11309 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11310 }
11311 default:
11312 llvm_unreachable("Not a supported integer vector type!");
11313 }
11314}
11315
11316/// Try to lower as a blend of elements from two inputs followed by
11317/// a single-input permutation.
11318///
11319/// This matches the pattern where we can blend elements from two inputs and
11320/// then reduce the shuffle to a single-input permutation.
11321static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11322 SDValue V1, SDValue V2,
11323 ArrayRef<int> Mask,
11324 SelectionDAG &DAG,
11325 bool ImmBlends = false) {
11326 // We build up the blend mask while checking whether a blend is a viable way
11327 // to reduce the shuffle.
11328 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11329 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11330
11331 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11332 if (Mask[i] < 0)
11333 continue;
11334
11335 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11336
11337 if (BlendMask[Mask[i] % Size] < 0)
11338 BlendMask[Mask[i] % Size] = Mask[i];
11339 else if (BlendMask[Mask[i] % Size] != Mask[i])
11340 return SDValue(); // Can't blend in the needed input!
11341
11342 PermuteMask[i] = Mask[i] % Size;
11343 }
11344
11345 // If only immediate blends, then bail if the blend mask can't be widened to
11346 // i16.
11347 unsigned EltSize = VT.getScalarSizeInBits();
11348 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11349 return SDValue();
11350
11351 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11352 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11353}
11354
11355/// Try to lower as an unpack of elements from two inputs followed by
11356/// a single-input permutation.
11357///
11358/// This matches the pattern where we can unpack elements from two inputs and
11359/// then reduce the shuffle to a single-input (wider) permutation.
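/// Illustrative example (editor's addition, not part of the original source):
/// a v4i32 mask <1, 5, 0, 4> first unpacks as UNPCKL(V1, V2), producing
/// <V1[0], V2[0], V1[1], V2[1]>, and then applies the single-input permute
/// <2, 3, 0, 1> to restore the order requested by the original mask.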
11360 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11361 SDValue V1, SDValue V2,
11362 ArrayRef<int> Mask,
11363 SelectionDAG &DAG) {
11364 int NumElts = Mask.size();
11365 int NumLanes = VT.getSizeInBits() / 128;
11366 int NumLaneElts = NumElts / NumLanes;
11367 int NumHalfLaneElts = NumLaneElts / 2;
11368
11369 bool MatchLo = true, MatchHi = true;
11370 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11371
11372 // Determine UNPCKL/UNPCKH type and operand order.
11373 for (int Elt = 0; Elt != NumElts; ++Elt) {
11374 int M = Mask[Elt];
11375 if (M < 0)
11376 continue;
11377
11378 // Normalize the mask value depending on whether it's V1 or V2.
11379 int NormM = M;
11380 SDValue &Op = Ops[Elt & 1];
11381 if (M < NumElts && (Op.isUndef() || Op == V1))
11382 Op = V1;
11383 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11384 Op = V2;
11385 NormM -= NumElts;
11386 } else
11387 return SDValue();
11388
11389 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11390 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11391 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11392 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11393 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11394 if (MatchLoAnyLane || MatchHiAnyLane) {
11395 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11396 "Failed to match UNPCKLO/UNPCKHI");
11397 break;
11398 }
11399 }
11400 MatchLo &= MatchLoAnyLane;
11401 MatchHi &= MatchHiAnyLane;
11402 if (!MatchLo && !MatchHi)
11403 return SDValue();
11404 }
11405 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11406
11407 // Element indices have changed after unpacking. Calculate permute mask
11408 // so that they will be put back to the position as dictated by the
11409 // original shuffle mask indices.
11410 SmallVector<int, 32> PermuteMask(NumElts, -1);
11411 for (int Elt = 0; Elt != NumElts; ++Elt) {
11412 int M = Mask[Elt];
11413 if (M < 0)
11414 continue;
11415 int NormM = M;
11416 if (NumElts <= M)
11417 NormM -= NumElts;
11418 bool IsFirstOp = M < NumElts;
11419 int BaseMaskElt =
11420 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11421 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11422 PermuteMask[Elt] = BaseMaskElt;
11423 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11424 PermuteMask[Elt] = BaseMaskElt + 1;
11425 assert(PermuteMask[Elt] != -1 &&
11426 "Input mask element is defined but failed to assign permute mask");
11427 }
11428
11429 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11430 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11431 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11432}
11433
11434/// Try to lower a shuffle as a permute of the inputs followed by an
11435/// UNPCK instruction.
11436///
11437/// This specifically targets cases where we end up with alternating between
11438/// the two inputs, and so can permute them into something that feeds a single
11439/// UNPCK instruction. Note that this routine only targets integer vectors
11440/// because for floating point vectors we have a generalized SHUFPS lowering
11441/// strategy that handles everything that doesn't *exactly* match an unpack,
11442/// making this clever lowering unnecessary.
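/// Illustrative example (editor's addition, not part of the original source):
/// for a v8i16 mask <4,8,5,9,6,10,7,11>, V1 is first permuted to move its
/// high half into its low half, and a single UNPCKL of the permuted V1 with
/// V2 then interleaves exactly the requested elements.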
11443 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11444 SDValue V1, SDValue V2,
11445 ArrayRef<int> Mask,
11446 const X86Subtarget &Subtarget,
11447 SelectionDAG &DAG) {
11448 int Size = Mask.size();
11449 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11450
11451 // This routine only supports 128-bit integer dual input vectors.
11452 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11453 return SDValue();
11454
11455 int NumLoInputs =
11456 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11457 int NumHiInputs =
11458 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11459
11460 bool UnpackLo = NumLoInputs >= NumHiInputs;
11461
11462 auto TryUnpack = [&](int ScalarSize, int Scale) {
11463 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11464 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11465
11466 for (int i = 0; i < Size; ++i) {
11467 if (Mask[i] < 0)
11468 continue;
11469
11470 // Each element of the unpack contains Scale elements from this mask.
11471 int UnpackIdx = i / Scale;
11472
11473 // We only handle the case where V1 feeds the first slots of the unpack.
11474 // We rely on canonicalization to ensure this is the case.
11475 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11476 return SDValue();
11477
11478 // Setup the mask for this input. The indexing is tricky as we have to
11479 // handle the unpack stride.
11480 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11481 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11482 Mask[i] % Size;
11483 }
11484
11485 // If we will have to shuffle both inputs to use the unpack, check whether
11486 // we can just unpack first and shuffle the result. If so, skip this unpack.
11487 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11488 !isNoopShuffleMask(V2Mask))
11489 return SDValue();
11490
11491 // Shuffle the inputs into place.
11492 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11493 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11494
11495 // Cast the inputs to the type we will use to unpack them.
11496 MVT UnpackVT =
11497 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11498 V1 = DAG.getBitcast(UnpackVT, V1);
11499 V2 = DAG.getBitcast(UnpackVT, V2);
11500
11501 // Unpack the inputs and cast the result back to the desired type.
11502 return DAG.getBitcast(
11503 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11504 UnpackVT, V1, V2));
11505 };
11506
11507 // We try each unpack from the largest to the smallest to try and find one
11508 // that fits this mask.
11509 int OrigScalarSize = VT.getScalarSizeInBits();
11510 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11511 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11512 return Unpack;
11513
11514 // If we're shuffling with a zero vector then we're better off not doing
11515 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
11516 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
11517 ISD::isBuildVectorAllZeros(V2.getNode()))
11518 return SDValue();
11519
11520 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11521 // initial unpack.
11522 if (NumLoInputs == 0 || NumHiInputs == 0) {
11523 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11524 "We have to have *some* inputs!");
11525 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11526
11527 // FIXME: We could consider the total complexity of the permute of each
11528 // possible unpacking. Or at the least we should consider how many
11529 // half-crossings are created.
11530 // FIXME: We could consider commuting the unpacks.
11531
11532 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11533 for (int i = 0; i < Size; ++i) {
11534 if (Mask[i] < 0)
11535 continue;
11536
11537 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11538
11539 PermMask[i] =
11540 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11541 }
11542 return DAG.getVectorShuffle(
11543 VT, DL,
11544 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11545 V1, V2),
11546 DAG.getUNDEF(VT), PermMask);
11547 }
11548
11549 return SDValue();
11550}
11551
11552/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11553/// permuting the elements of the result in place.
11554 static SDValue lowerShuffleAsByteRotateAndPermute(
11555 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11556 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11557 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11558 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11559 (VT.is512BitVector() && !Subtarget.hasBWI()))
11560 return SDValue();
11561
11562 // We don't currently support lane crossing permutes.
11563 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11564 return SDValue();
11565
11566 int Scale = VT.getScalarSizeInBits() / 8;
11567 int NumLanes = VT.getSizeInBits() / 128;
11568 int NumElts = VT.getVectorNumElements();
11569 int NumEltsPerLane = NumElts / NumLanes;
11570
11571 // Determine range of mask elts.
11572 bool Blend1 = true;
11573 bool Blend2 = true;
11574 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11575 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11576 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11577 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11578 int M = Mask[Lane + Elt];
11579 if (M < 0)
11580 continue;
11581 if (M < NumElts) {
11582 Blend1 &= (M == (Lane + Elt));
11583 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11584 M = M % NumEltsPerLane;
11585 Range1.first = std::min(Range1.first, M);
11586 Range1.second = std::max(Range1.second, M);
11587 } else {
11588 M -= NumElts;
11589 Blend2 &= (M == (Lane + Elt));
11590 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11591 M = M % NumEltsPerLane;
11592 Range2.first = std::min(Range2.first, M);
11593 Range2.second = std::max(Range2.second, M);
11594 }
11595 }
11596 }
11597
11598 // Bail if we don't need both elements.
11599 // TODO - it might be worth doing this for unary shuffles if the permute
11600 // can be widened.
11601 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11602 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11603 return SDValue();
11604
11605 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11606 return SDValue();
11607
11608 // Rotate the 2 ops so we can access both ranges, then permute the result.
11609 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11610 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11611 SDValue Rotate = DAG.getBitcast(
11612 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11613 DAG.getBitcast(ByteVT, Lo),
11614 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11615 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11616 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11617 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11618 int M = Mask[Lane + Elt];
11619 if (M < 0)
11620 continue;
11621 if (M < NumElts)
11622 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11623 else
11624 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11625 }
11626 }
11627 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11628 };
11629
11630 // Check if the ranges are small enough to rotate from either direction.
11631 if (Range2.second < Range1.first)
11632 return RotateAndPermute(V1, V2, Range1.first, 0);
11633 if (Range1.second < Range2.first)
11634 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11635 return SDValue();
11636}
11637
11638 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11639 return isUndefOrEqual(Mask, 0);
11640}
11641
11642 static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11643 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11644}
11645
11646/// Check if the Mask consists of the same element repeated multiple times.
11647 static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11648 size_t NumUndefs = 0;
11649 std::optional<int> UniqueElt;
11650 for (int Elt : Mask) {
11651 if (Elt == SM_SentinelUndef) {
11652 NumUndefs++;
11653 continue;
11654 }
11655 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11656 return false;
11657 UniqueElt = Elt;
11658 }
11659 // Make sure the element is repeated enough times by checking the number of
11660 // undefs is small.
11661 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11662}
11663
11664/// Generic routine to decompose a shuffle and blend into independent
11665/// blends and permutes.
11666///
11667/// This matches the extremely common pattern for handling combined
11668/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11669/// operations. It will try to pick the best arrangement of shuffles and
11670/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
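/// Illustrative sketch (editor's addition, not part of the original source):
/// a v4i32 mask <3, 5, 1, 6> decomposes into the single-input shuffles
/// V1' = shuffle(V1, <3, -1, 1, -1>) and V2' = shuffle(V2, <-1, 1, -1, 2>),
/// followed by the blend <0, 5, 2, 7> of V1' and V2'.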
11671 static SDValue lowerShuffleAsDecomposedShuffleMerge(
11672 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11673 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11674 int NumElts = Mask.size();
11675 int NumLanes = VT.getSizeInBits() / 128;
11676 int NumEltsPerLane = NumElts / NumLanes;
11677
11678 // Shuffle the input elements into the desired positions in V1 and V2 and
11679 // unpack/blend them together.
11680 bool IsAlternating = true;
11681 bool V1Zero = true, V2Zero = true;
11682 SmallVector<int, 32> V1Mask(NumElts, -1);
11683 SmallVector<int, 32> V2Mask(NumElts, -1);
11684 SmallVector<int, 32> FinalMask(NumElts, -1);
11685 for (int i = 0; i < NumElts; ++i) {
11686 int M = Mask[i];
11687 if (M >= 0 && M < NumElts) {
11688 V1Mask[i] = M;
11689 FinalMask[i] = i;
11690 V1Zero &= Zeroable[i];
11691 IsAlternating &= (i & 1) == 0;
11692 } else if (M >= NumElts) {
11693 V2Mask[i] = M - NumElts;
11694 FinalMask[i] = i + NumElts;
11695 V2Zero &= Zeroable[i];
11696 IsAlternating &= (i & 1) == 1;
11697 }
11698 }
11699
11700 // If we effectively only demand the 0'th element of \p Input, though not
11701 // necessarily only at the 0'th position, then broadcast said input,
11702 // and change \p InputMask to be a no-op (identity) mask.
11703 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11704 &DAG](SDValue &Input,
11705 MutableArrayRef<int> InputMask) {
11706 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11707 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11708 !X86::mayFoldLoad(Input, Subtarget)))
11709 return;
11710 if (isNoopShuffleMask(InputMask))
11711 return;
11712 assert(isBroadcastShuffleMask(InputMask) &&
11713 "Expected to demand only the 0'th element.");
11714 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11715 for (auto I : enumerate(InputMask)) {
11716 int &InputMaskElt = I.value();
11717 if (InputMaskElt >= 0)
11718 InputMaskElt = I.index();
11719 }
11720 };
11721
11722 // Currently, we may need to produce one shuffle per input, and blend results.
11723 // It is possible that the shuffle for one of the inputs is already a no-op.
11724 // See if we can simplify non-no-op shuffles into broadcasts,
11725 // which we consider to be strictly better than an arbitrary shuffle.
11726 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11727 isNoopOrBroadcastShuffleMask(V2Mask)) {
11728 canonicalizeBroadcastableInput(V1, V1Mask);
11729 canonicalizeBroadcastableInput(V2, V2Mask);
11730 }
11731
11732 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11733 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11734 // the shuffle may be able to fold with a load or other benefit. However, when
11735 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11736 // pre-shuffle first is a better strategy.
11737 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11738 // If we don't have blends, see if we can create a cheap unpack.
11739 if (!Subtarget.hasSSE41() && VT.is128BitVector() &&
11740 (is128BitUnpackShuffleMask(V1Mask, DAG) ||
11741 is128BitUnpackShuffleMask(V2Mask, DAG)))
11742 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11743 DL, VT, V1, V2, Mask, Subtarget, DAG))
11744 return PermUnpack;
11745
11746 // Only prefer immediate blends to unpack/rotate.
11747 if (SDValue BlendPerm =
11748 lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
11749 return BlendPerm;
11750
11751 // If either input vector provides only a single element which is repeated
11752 // multiple times, unpacking from both input vectors would generate worse
11753 // code. e.g. for
11754 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11755 // it is better to process t4 first to create a vector of t4[0], then unpack
11756 // that vector with t2.
11757 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
11758 !isSingleElementRepeatedMask(V2Mask))
11759 if (SDValue UnpackPerm =
11760 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11761 return UnpackPerm;
11762
11763 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11764 DL, VT, V1, V2, Mask, Subtarget, DAG))
11765 return RotatePerm;
11766
11767 // Unpack/rotate failed - try again with variable blends.
11768 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11769 DAG))
11770 return BlendPerm;
11771
11772 if (VT.getScalarSizeInBits() >= 32)
11773 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11774 DL, VT, V1, V2, Mask, Subtarget, DAG))
11775 return PermUnpack;
11776 }
11777
11778 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11779 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11780 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11781 // than half the elements coming from each source.
11782 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11783 V1Mask.assign(NumElts, -1);
11784 V2Mask.assign(NumElts, -1);
11785 FinalMask.assign(NumElts, -1);
11786 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11787 for (int j = 0; j != NumEltsPerLane; ++j) {
11788 int M = Mask[i + j];
11789 if (M >= 0 && M < NumElts) {
11790 V1Mask[i + (j / 2)] = M;
11791 FinalMask[i + j] = i + (j / 2);
11792 } else if (M >= NumElts) {
11793 V2Mask[i + (j / 2)] = M - NumElts;
11794 FinalMask[i + j] = i + (j / 2) + NumElts;
11795 }
11796 }
11797 }
11798
11799 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11800 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11801 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11802}
11803
11804static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11805 const X86Subtarget &Subtarget,
11806 ArrayRef<int> Mask) {
11807 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11808 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11809
11810 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11811 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11812 int MaxSubElts = 64 / EltSizeInBits;
11813 unsigned RotateAmt, NumSubElts;
11814 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11815 MaxSubElts, NumSubElts, RotateAmt))
11816 return -1;
11817 unsigned NumElts = Mask.size();
11818 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11819 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11820 return RotateAmt;
11821}
11822
11823/// Lower shuffle using X86ISD::VROTLI rotations.
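/// Illustrative example (editor's addition, not part of the original source):
/// a v16i8 mask that swaps every byte pair, <1,0,3,2,...,15,14>, is a bit
/// rotation by 8 within each 16-bit group; on an XOP target this becomes a
/// VROTLI of the input bitcast to v8i16 with a rotation amount of 8.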
11824 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11825 ArrayRef<int> Mask,
11826 const X86Subtarget &Subtarget,
11827 SelectionDAG &DAG) {
11828 // Only XOP + AVX512 targets have bit rotation instructions.
11829 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11830 bool IsLegal =
11831 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11832 if (!IsLegal && Subtarget.hasSSE3())
11833 return SDValue();
11834
11835 MVT RotateVT;
11836 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11837 Subtarget, Mask);
11838 if (RotateAmt < 0)
11839 return SDValue();
11840
11841 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11842 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11843 // widen to vXi16 or more then the existing lowering will be better.
11844 if (!IsLegal) {
11845 if ((RotateAmt % 16) == 0)
11846 return SDValue();
11847 // TODO: Use getTargetVShiftByConstNode.
11848 unsigned ShlAmt = RotateAmt;
11849 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11850 V1 = DAG.getBitcast(RotateVT, V1);
11851 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11852 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11853 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11854 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11855 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11856 return DAG.getBitcast(VT, Rot);
11857 }
11858
11859 SDValue Rot =
11860 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11861 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11862 return DAG.getBitcast(VT, Rot);
11863}
11864
11865/// Try to match a vector shuffle as an element rotation.
11866///
11867 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11868 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11869 ArrayRef<int> Mask) {
11870 int NumElts = Mask.size();
11871
11872 // We need to detect various ways of spelling a rotation:
11873 // [11, 12, 13, 14, 15, 0, 1, 2]
11874 // [-1, 12, 13, 14, -1, -1, 1, -1]
11875 // [-1, -1, -1, -1, -1, -1, 1, 2]
11876 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11877 // [-1, 4, 5, 6, -1, -1, 9, -1]
11878 // [-1, 4, 5, 6, -1, -1, -1, -1]
11879 int Rotation = 0;
11880 SDValue Lo, Hi;
11881 for (int i = 0; i < NumElts; ++i) {
11882 int M = Mask[i];
11883 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11884 "Unexpected mask index.");
11885 if (M < 0)
11886 continue;
11887
11888 // Determine where a rotated vector would have started.
11889 int StartIdx = i - (M % NumElts);
11890 if (StartIdx == 0)
11891 // The identity rotation isn't interesting, stop.
11892 return -1;
11893
11894 // If we found the tail of a vector the rotation must be the missing
11895 // front. If we found the head of a vector, it must be how much of the
11896 // head.
11897 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11898
11899 if (Rotation == 0)
11900 Rotation = CandidateRotation;
11901 else if (Rotation != CandidateRotation)
11902 // The rotations don't match, so we can't match this mask.
11903 return -1;
11904
11905 // Compute which value this mask is pointing at.
11906 SDValue MaskV = M < NumElts ? V1 : V2;
11907
11908 // Compute which of the two target values this index should be assigned
11909 // to. This reflects whether the high elements are remaining or the low
11910 // elements are remaining.
11911 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11912
11913 // Either set up this value if we've not encountered it before, or check
11914 // that it remains consistent.
11915 if (!TargetV)
11916 TargetV = MaskV;
11917 else if (TargetV != MaskV)
11918 // This may be a rotation, but it pulls from the inputs in some
11919 // unsupported interleaving.
11920 return -1;
11921 }
11922
11923 // Check that we successfully analyzed the mask, and normalize the results.
11924 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11925 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11926 if (!Lo)
11927 Lo = Hi;
11928 else if (!Hi)
11929 Hi = Lo;
11930
11931 V1 = Lo;
11932 V2 = Hi;
11933
11934 return Rotation;
11935}
11936
11937/// Try to lower a vector shuffle as a byte rotation.
11938///
11939/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11940/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11941/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11942 /// try to generically lower a vector shuffle through such a pattern. It
11943/// does not check for the profitability of lowering either as PALIGNR or
11944/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11945/// This matches shuffle vectors that look like:
11946///
11947/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11948///
11949/// Essentially it concatenates V1 and V2, shifts right by some number of
11950/// elements, and takes the low elements as the result. Note that while this is
11951/// specified as a *right shift* because x86 is little-endian, it is a *left
11952/// rotate* of the vector lanes.
11953 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11954 ArrayRef<int> Mask) {
11955 // Don't accept any shuffles with zero elements.
11956 if (isAnyZero(Mask))
11957 return -1;
11958
11959 // PALIGNR works on 128-bit lanes.
11960 SmallVector<int, 16> RepeatedMask;
11961 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11962 return -1;
11963
11964 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11965 if (Rotation <= 0)
11966 return -1;
11967
11968 // PALIGNR rotates bytes, so we need to scale the
11969 // rotation based on how many bytes are in the vector lane.
11970 int NumElts = RepeatedMask.size();
11971 int Scale = 16 / NumElts;
11972 return Rotation * Scale;
11973}
11974
11975 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11976 SDValue V2, ArrayRef<int> Mask,
11977 const X86Subtarget &Subtarget,
11978 SelectionDAG &DAG) {
11979 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11980
11981 SDValue Lo = V1, Hi = V2;
11982 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11983 if (ByteRotation <= 0)
11984 return SDValue();
11985
11986 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11987 // PSLLDQ/PSRLDQ.
11988 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11989 Lo = DAG.getBitcast(ByteVT, Lo);
11990 Hi = DAG.getBitcast(ByteVT, Hi);
11991
11992 // SSSE3 targets can use the palignr instruction.
11993 if (Subtarget.hasSSSE3()) {
11994 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11995 "512-bit PALIGNR requires BWI instructions");
11996 return DAG.getBitcast(
11997 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11998 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11999 }
12000
12001 assert(VT.is128BitVector() &&
12002 "Rotate-based lowering only supports 128-bit lowering!");
12003 assert(Mask.size() <= 16 &&
12004 "Can shuffle at most 16 bytes in a 128-bit vector!");
12005 assert(ByteVT == MVT::v16i8 &&
12006 "SSE2 rotate lowering only needed for v16i8!");
12007
12008 // Default SSE2 implementation
12009 int LoByteShift = 16 - ByteRotation;
12010 int HiByteShift = ByteRotation;
12011
12012 SDValue LoShift =
12013 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12014 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12015 SDValue HiShift =
12016 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12017 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12018 return DAG.getBitcast(VT,
12019 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12020}
12021
12022/// Try to lower a vector shuffle as a dword/qword rotation.
12023///
12024 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
12025 /// rotation of the concatenation of two vectors; this routine will
12026 /// try to generically lower a vector shuffle through such a pattern.
12027///
12028/// Essentially it concatenates V1 and V2, shifts right by some number of
12029/// elements, and takes the low elements as the result. Note that while this is
12030/// specified as a *right shift* because x86 is little-endian, it is a *left
12031/// rotate* of the vector lanes.
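/// Illustrative example (editor's addition, not part of the original source):
/// a v8i32 mask <3,4,5,6,7,8,9,10> selects 8 consecutive elements of the
/// V1:V2 concatenation starting at element 3, so it can be lowered to a
/// single VALIGND with an immediate of 3.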
12032 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12033 SDValue V2, ArrayRef<int> Mask,
12034 const APInt &Zeroable,
12035 const X86Subtarget &Subtarget,
12036 SelectionDAG &DAG) {
12037 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12038 "Only 32-bit and 64-bit elements are supported!");
12039
12040 // 128/256-bit vectors are only supported with VLX.
12041 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12042 && "VLX required for 128/256-bit vectors");
12043
12044 SDValue Lo = V1, Hi = V2;
12045 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12046 if (0 < Rotation)
12047 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12048 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12049
12050 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
12051 // TODO: Pull this out as a matchShuffleAsElementShift helper?
12052 // TODO: We can probably make this more aggressive and use shift-pairs like
12053 // lowerShuffleAsByteShiftMask.
12054 unsigned NumElts = Mask.size();
12055 unsigned ZeroLo = Zeroable.countr_one();
12056 unsigned ZeroHi = Zeroable.countl_one();
12057 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
12058 if (!ZeroLo && !ZeroHi)
12059 return SDValue();
12060
12061 if (ZeroLo) {
12062 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12063 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
12064 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
12065 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
12066 getZeroVector(VT, Subtarget, DAG, DL),
12067 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
12068 }
12069
12070 if (ZeroHi) {
12071 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
12072 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
12073 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
12074 return DAG.getNode(X86ISD::VALIGN, DL, VT,
12075 getZeroVector(VT, Subtarget, DAG, DL), Src,
12076 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
12077 }
12078
12079 return SDValue();
12080}
12081
12082/// Try to lower a vector shuffle as a byte shift sequence.
12083 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12084 SDValue V2, ArrayRef<int> Mask,
12085 const APInt &Zeroable,
12086 const X86Subtarget &Subtarget,
12087 SelectionDAG &DAG) {
12088 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12089 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12090
12091 // We need a shuffle that has zeros at one/both ends and a sequential
12092 // shuffle from one source within.
12093 unsigned ZeroLo = Zeroable.countr_one();
12094 unsigned ZeroHi = Zeroable.countl_one();
12095 if (!ZeroLo && !ZeroHi)
12096 return SDValue();
12097
12098 unsigned NumElts = Mask.size();
12099 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12100 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12101 return SDValue();
12102
12103 unsigned Scale = VT.getScalarSizeInBits() / 8;
12104 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12105 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12106 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12107 return SDValue();
12108
12109 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12110 Res = DAG.getBitcast(MVT::v16i8, Res);
12111
12112 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12113 // inner sequential set of elements, possibly offset:
12114 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12115 // 01234567 --> 4567zzzz --> zzzzz456
12116 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12117 if (ZeroLo == 0) {
12118 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12119 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12120 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12121 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12122 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12123 } else if (ZeroHi == 0) {
12124 unsigned Shift = Mask[ZeroLo] % NumElts;
12125 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12126 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12127 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12128 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12129 } else if (!Subtarget.hasSSSE3()) {
12130 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12131 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12132 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12133 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12134 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12135 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12136 Shift += Mask[ZeroLo] % NumElts;
12137 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12138 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12139 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12140 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12141 } else
12142 return SDValue();
12143
12144 return DAG.getBitcast(VT, Res);
12145}
12146
12147/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12148///
12149/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12150/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12151/// matches elements from one of the input vectors shuffled to the left or
12152/// right with zeroable elements 'shifted in'. It handles both the strictly
12153/// bit-wise element shifts and the byte shift across an entire 128-bit double
12154/// quad word lane.
12155///
12156/// PSHL : (little-endian) left bit shift.
12157/// [ zz, 0, zz, 2 ]
12158/// [ -1, 4, zz, -1 ]
12159/// PSRL : (little-endian) right bit shift.
12160/// [ 1, zz, 3, zz]
12161/// [ -1, -1, 7, zz]
12162/// PSLLDQ : (little-endian) left byte shift
12163/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12164/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12165/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12166/// PSRLDQ : (little-endian) right byte shift
12167/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12168/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12169/// [ 1, 2, -1, -1, -1, -1, zz, zz]
12170static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12171 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12172 int MaskOffset, const APInt &Zeroable,
12173 const X86Subtarget &Subtarget) {
12174 int Size = Mask.size();
12175 unsigned SizeInBits = Size * ScalarSizeInBits;
12176
12177 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12178 for (int i = 0; i < Size; i += Scale)
12179 for (int j = 0; j < Shift; ++j)
12180 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12181 return false;
12182
12183 return true;
12184 };
12185
12186 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12187 for (int i = 0; i != Size; i += Scale) {
12188 unsigned Pos = Left ? i + Shift : i;
12189 unsigned Low = Left ? i : i + Shift;
12190 unsigned Len = Scale - Shift;
12191 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12192 return -1;
12193 }
12194
12195 int ShiftEltBits = ScalarSizeInBits * Scale;
12196 bool ByteShift = ShiftEltBits > 64;
12197 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12198 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12199 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12200
12201 // Normalize the scale for byte shifts to still produce an i64 element
12202 // type.
12203 Scale = ByteShift ? Scale / 2 : Scale;
12204
12205 // We need to round trip through the appropriate type for the shift.
12206 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12207 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12208 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12209 return (int)ShiftAmt;
12210 };
12211
12212 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12213 // keep doubling the size of the integer elements up to that. We can
12214 // then shift the elements of the integer vector by whole multiples of
12215 // their width within the elements of the larger integer vector. Test each
12216 // multiple to see if we can find a match with the moved element indices
12217 // and that the shifted in elements are all zeroable.
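// Illustrative example (editor's addition, not part of the original source):
// a v8i16 mask [ zz, 0, 1, 2, zz, 4, 5, 6 ] matches at Scale == 4 (i64
// elements) with Shift == 1, i.e. a VSHLI of v2i64 by 16 bits.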
12218 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12219 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12220 for (int Shift = 1; Shift != Scale; ++Shift)
12221 for (bool Left : {true, false})
12222 if (CheckZeros(Shift, Scale, Left)) {
12223 int ShiftAmt = MatchShift(Shift, Scale, Left);
12224 if (0 < ShiftAmt)
12225 return ShiftAmt;
12226 }
12227
12228 // no match
12229 return -1;
12230}
12231
12232 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12233 SDValue V2, ArrayRef<int> Mask,
12234 const APInt &Zeroable,
12235 const X86Subtarget &Subtarget,
12236 SelectionDAG &DAG, bool BitwiseOnly) {
12237 int Size = Mask.size();
12238 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12239
12240 MVT ShiftVT;
12241 SDValue V = V1;
12242 unsigned Opcode;
12243
12244 // Try to match shuffle against V1 shift.
12245 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12246 Mask, 0, Zeroable, Subtarget);
12247
12248 // If V1 failed, try to match shuffle against V2 shift.
12249 if (ShiftAmt < 0) {
12250 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12251 Mask, Size, Zeroable, Subtarget);
12252 V = V2;
12253 }
12254
12255 if (ShiftAmt < 0)
12256 return SDValue();
12257
12258 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
12259 return SDValue();
12260
12261 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12262 "Illegal integer vector type");
12263 V = DAG.getBitcast(ShiftVT, V);
12264 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12265 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12266 return DAG.getBitcast(VT, V);
12267}
12268
12269// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12270// Remainder of lower half result is zero and upper half is all undef.
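// Illustrative example (editor's addition, not part of the original source):
// a v8i16 mask [ 1, 2, 3, zz, u, u, u, u ] matches with Len = 3 and Idx = 1,
// i.e. BitLen = 48 and BitIdx = 16 for the resulting EXTRQI node.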
12271static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12272 ArrayRef<int> Mask, uint64_t &BitLen,
12273 uint64_t &BitIdx, const APInt &Zeroable) {
12274 int Size = Mask.size();
12275 int HalfSize = Size / 2;
12276 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12277 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12278
12279 // Upper half must be undefined.
12280 if (!isUndefUpperHalf(Mask))
12281 return false;
12282
12283 // Determine the extraction length from the part of the
12284 // lower half that isn't zeroable.
12285 int Len = HalfSize;
12286 for (; Len > 0; --Len)
12287 if (!Zeroable[Len - 1])
12288 break;
12289 assert(Len > 0 && "Zeroable shuffle mask");
12290
12291 // Attempt to match first Len sequential elements from the lower half.
12292 SDValue Src;
12293 int Idx = -1;
12294 for (int i = 0; i != Len; ++i) {
12295 int M = Mask[i];
12296 if (M == SM_SentinelUndef)
12297 continue;
12298 SDValue &V = (M < Size ? V1 : V2);
12299 M = M % Size;
12300
12301 // The extracted elements must start at a valid index and all mask
12302 // elements must be in the lower half.
12303 if (i > M || M >= HalfSize)
12304 return false;
12305
12306 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12307 Src = V;
12308 Idx = M - i;
12309 continue;
12310 }
12311 return false;
12312 }
12313
12314 if (!Src || Idx < 0)
12315 return false;
12316
12317 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12318 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12319 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12320 V1 = Src;
12321 return true;
12322}
12323
12324// INSERTQ: Extract lowest Len elements from lower half of second source and
12325// insert over first source, starting at Idx.
12326// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12327static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12328 ArrayRef<int> Mask, uint64_t &BitLen,
12329 uint64_t &BitIdx) {
12330 int Size = Mask.size();
12331 int HalfSize = Size / 2;
12332 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12333
12334 // Upper half must be undefined.
12335 if (!isUndefUpperHalf(Mask))
12336 return false;
12337
12338 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12339 SDValue Base;
12340
12341 // Attempt to match first source from mask before insertion point.
12342 if (isUndefInRange(Mask, 0, Idx)) {
12343 /* EMPTY */
12344 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12345 Base = V1;
12346 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12347 Base = V2;
12348 } else {
12349 continue;
12350 }
12351
12352 // Extend the extraction length looking to match both the insertion of
12353 // the second source and the remaining elements of the first.
12354 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12355 SDValue Insert;
12356 int Len = Hi - Idx;
12357
12358 // Match insertion.
12359 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12360 Insert = V1;
12361 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12362 Insert = V2;
12363 } else {
12364 continue;
12365 }
12366
12367 // Match the remaining elements of the lower half.
12368 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12369 /* EMPTY */
12370 } else if ((!Base || (Base == V1)) &&
12371 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12372 Base = V1;
12373 } else if ((!Base || (Base == V2)) &&
12374 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12375 Size + Hi)) {
12376 Base = V2;
12377 } else {
12378 continue;
12379 }
12380
12381 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12382 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12383 V1 = Base;
12384 V2 = Insert;
12385 return true;
12386 }
12387 }
12388
12389 return false;
12390}
12391
12392/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12393 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12394 SDValue V2, ArrayRef<int> Mask,
12395 const APInt &Zeroable, SelectionDAG &DAG) {
12396 uint64_t BitLen, BitIdx;
12397 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12398 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12399 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12400 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12401
12402 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12403 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12404 V2 ? V2 : DAG.getUNDEF(VT),
12405 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12406 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12407
12408 return SDValue();
12409}
12410
12411/// Lower a vector shuffle as an any/signed/zero extension.
12412///
12413/// Given a specific number of elements, element bit width, and extension
12414 /// stride, produce an extension based on the available
12415 /// features of the subtarget. The extended elements are consecutive and
12416 /// can start from an offset element index in the input; to
12417 /// avoid excess shuffling the offset must either be in the bottom lane
12418/// or at the start of a higher lane. All extended elements must be from
12419/// the same lane.
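/// Illustrative example (editor's addition, not part of the original source):
/// extending v16i8 with Scale == 4 and Offset == 0 turns the low four bytes
/// into the four i32 lanes of the result; on SSE4.1 this maps onto a single
/// ZERO_EXTEND_VECTOR_INREG (PMOVZXBD) or ANY_EXTEND_VECTOR_INREG node.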
12420 static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT,
12421 int Scale, int Offset,
12422 unsigned ExtOpc, SDValue InputV,
12423 ArrayRef<int> Mask,
12424 const X86Subtarget &Subtarget,
12425 SelectionDAG &DAG) {
12426 assert(Scale > 1 && "Need a scale to extend.");
12427 assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
12428 int EltBits = VT.getScalarSizeInBits();
12429 int NumElements = VT.getVectorNumElements();
12430 int NumEltsPerLane = 128 / EltBits;
12431 int OffsetLane = Offset / NumEltsPerLane;
12432 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12433 "Only 8, 16, and 32 bit elements can be extended.");
12434 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12435 assert(0 <= Offset && "Extension offset must be positive.");
12436 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12437 "Extension offset must be in the first lane or start an upper lane.");
12438
12439 // Check that an index is in same lane as the base offset.
12440 auto SafeOffset = [&](int Idx) {
12441 return OffsetLane == (Idx / NumEltsPerLane);
12442 };
12443
12444 // Shift along an input so that the offset base moves to the first element.
12445 auto ShuffleOffset = [&](SDValue V) {
12446 if (!Offset)
12447 return V;
12448
12449 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12450 for (int i = 0; i * Scale < NumElements; ++i) {
12451 int SrcIdx = i + Offset;
12452 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12453 }
12454 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12455 };
12456
12457 // Found a valid a/zext mask! Try various lowering strategies based on the
12458 // input type and available ISA extensions.
12459 if (Subtarget.hasSSE41()) {
12460 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12461 // PUNPCK will catch this in a later shuffle match.
12462 if (Offset && Scale == 2 && VT.is128BitVector())
12463 return SDValue();
12464 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12465 NumElements / Scale);
12466 InputV = DAG.getBitcast(VT, InputV);
12467 InputV = ShuffleOffset(InputV);
12468 InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
12469 return DAG.getBitcast(VT, InputV);
12470 }
12471
12472 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12473 InputV = DAG.getBitcast(VT, InputV);
12474 bool AnyExt = ExtOpc == ISD::ANY_EXTEND;
12475
12476 // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
12477 if (ExtOpc == ISD::SIGN_EXTEND)
12478 return SDValue();
12479
12480 // For any extends we can cheat for larger element sizes and use shuffle
12481 // instructions that can fold with a load and/or copy.
12482 if (AnyExt && EltBits == 32) {
12483 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12484 -1};
12485 return DAG.getBitcast(
12486 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12487 DAG.getBitcast(MVT::v4i32, InputV),
12488 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12489 }
12490 if (AnyExt && EltBits == 16 && Scale > 2) {
12491 int PSHUFDMask[4] = {Offset / 2, -1,
12492 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12493 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12494 DAG.getBitcast(MVT::v4i32, InputV),
12495 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12496 int PSHUFWMask[4] = {1, -1, -1, -1};
12497 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12498 return DAG.getBitcast(
12499 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12500 DAG.getBitcast(MVT::v8i16, InputV),
12501 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12502 }
12503
12504 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12505 // to 64-bits.
12506 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12507 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12508 assert(VT.is128BitVector() && "Unexpected vector width!");
12509
12510 int LoIdx = Offset * EltBits;
12511 SDValue Lo = DAG.getBitcast(
12512 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12513 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12514 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12515
12516 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12517 return DAG.getBitcast(VT, Lo);
12518
12519 int HiIdx = (Offset + 1) * EltBits;
12520 SDValue Hi = DAG.getBitcast(
12521 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12522 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12523 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12524 return DAG.getBitcast(VT,
12525 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12526 }
12527
12528 // If this would require more than 2 unpack instructions to expand, use
12529 // pshufb when available. We can only use more than 2 unpack instructions
12530 // when zero extending i8 elements which also makes it easier to use pshufb.
12531 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12532 assert(NumElements == 16 && "Unexpected byte vector width!");
12533 SDValue PSHUFBMask[16];
12534 for (int i = 0; i < 16; ++i) {
12535 int Idx = Offset + (i / Scale);
12536 if ((i % Scale == 0 && SafeOffset(Idx))) {
12537 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12538 continue;
12539 }
12540 PSHUFBMask[i] =
12541 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12542 }
12543 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12544 return DAG.getBitcast(
12545 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12546 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12547 }
12548
12549 // If we are extending from an offset, ensure we start on a boundary that
12550 // we can unpack from.
12551 int AlignToUnpack = Offset % (NumElements / Scale);
12552 if (AlignToUnpack) {
12553 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12554 for (int i = AlignToUnpack; i < NumElements; ++i)
12555 ShMask[i - AlignToUnpack] = i;
12556 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12557 Offset -= AlignToUnpack;
12558 }
12559
12560 // Otherwise emit a sequence of unpacks.
12561 do {
12562 unsigned UnpackLoHi = X86ISD::UNPCKL;
12563 if (Offset >= (NumElements / 2)) {
12564 UnpackLoHi = X86ISD::UNPCKH;
12565 Offset -= (NumElements / 2);
12566 }
12567
12568 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12569 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12570 : getZeroVector(InputVT, Subtarget, DAG, DL);
12571 InputV = DAG.getBitcast(InputVT, InputV);
12572 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12573 Scale /= 2;
12574 EltBits *= 2;
12575 NumElements /= 2;
12576 } while (Scale > 1);
12577 return DAG.getBitcast(VT, InputV);
12578}
12579
12580/// Try to lower a vector shuffle as a zero extension on any microarch.
12581///
12582/// This routine will try to do everything in its power to cleverly lower
12583/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12584/// check for the profitability of this lowering, it tries to aggressively
12585/// match this pattern. It will use all of the micro-architectural details it
12586/// can to emit an efficient lowering. It handles both blends with all-zero
12587/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12588/// masking out later).
12589///
12590/// The reason we have dedicated lowering for zext-style shuffles is that they
12591/// are both incredibly common and often quite performance sensitive.
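/// Illustrative example (editor's addition, not part of the original source):
/// a v8i16 shuffle whose mask is <0, z, 1, z, 2, z, 3, z> (z meaning a lane
/// known zeroable) matches Scale == 2 and is lowered as a zero extension of
/// the low four i16 elements to i32 (PMOVZXWD on SSE4.1 targets).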
12592 static SDValue lowerShuffleAsZeroOrAnyExtend(
12593 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12594 const APInt &Zeroable, const X86Subtarget &Subtarget,
12595 SelectionDAG &DAG) {
12596 int Bits = VT.getSizeInBits();
12597 int NumLanes = Bits / 128;
12598 int NumElements = VT.getVectorNumElements();
12599 int NumEltsPerLane = NumElements / NumLanes;
12600 assert(VT.getScalarSizeInBits() <= 32 &&
12601 "Exceeds 32-bit integer zero extension limit");
12602 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12603
12604 // Define a helper function to check a particular ext-scale and lower to it if
12605 // valid.
12606 auto Lower = [&](int Scale) -> SDValue {
12607 SDValue InputV;
12608 bool AnyExt = true;
12609 int Offset = 0;
12610 int Matches = 0;
12611 for (int i = 0; i < NumElements; ++i) {
12612 int M = Mask[i];
12613 if (M < 0)
12614 continue; // Valid anywhere but doesn't tell us anything.
12615 if (i % Scale != 0) {
12616 // Each of the extended elements need to be zeroable.
12617 if (!Zeroable[i])
12618 return SDValue();
12619
12620 // We no longer are in the anyext case.
12621 AnyExt = false;
12622 continue;
12623 }
12624
12625 // Each of the base elements needs to be consecutive indices into the
12626 // same input vector.
12627 SDValue V = M < NumElements ? V1 : V2;
12628 M = M % NumElements;
12629 if (!InputV) {
12630 InputV = V;
12631 Offset = M - (i / Scale);
12632 } else if (InputV != V)
12633 return SDValue(); // Flip-flopping inputs.
12634
12635 // Offset must start in the lowest 128-bit lane or at the start of an
12636 // upper lane.
12637 // FIXME: Is it ever worth allowing a negative base offset?
12638 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12639 (Offset % NumEltsPerLane) == 0))
12640 return SDValue();
12641
12642 // If we are offsetting, all referenced entries must come from the same
12643 // lane.
12644 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12645 return SDValue();
12646
12647 if ((M % NumElements) != (Offset + (i / Scale)))
12648 return SDValue(); // Non-consecutive strided elements.
12649 Matches++;
12650 }
12651
12652 // If we fail to find an input, we have a zero-shuffle which should always
12653 // have already been handled.
12654 // FIXME: Maybe handle this here in case during blending we end up with one?
12655 if (!InputV)
12656 return SDValue();
12657
12658 // If we are offsetting, don't extend if we only match a single input; we
12659 // can always do better by using a basic PSHUF or PUNPCK.
12660 if (Offset != 0 && Matches < 2)
12661 return SDValue();
12662
12663 unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
12664 return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
12665 InputV, Mask, Subtarget, DAG);
12666 };
12667
12668 // The widest scale possible for extending is to a 64-bit integer.
12669 assert(Bits % 64 == 0 &&
12670 "The number of bits in a vector must be divisible by 64 on x86!");
12671 int NumExtElements = Bits / 64;
12672
12673 // Each iteration, try extending the elements half as much, but into twice as
12674 // many elements.
12675 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12676 assert(NumElements % NumExtElements == 0 &&
12677 "The input vector size must be divisible by the extended size.");
12678 if (SDValue V = Lower(NumElements / NumExtElements))
12679 return V;
12680 }
12681
12682 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12683 if (Bits != 128)
12684 return SDValue();
12685
12686 // Returns one of the source operands if the shuffle can be reduced to a
12687 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12688 auto CanZExtLowHalf = [&]() {
12689 for (int i = NumElements / 2; i != NumElements; ++i)
12690 if (!Zeroable[i])
12691 return SDValue();
12692 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12693 return V1;
12694 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12695 return V2;
12696 return SDValue();
12697 };
12698
12699 if (SDValue V = CanZExtLowHalf()) {
12700 V = DAG.getBitcast(MVT::v2i64, V);
12701 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12702 return DAG.getBitcast(VT, V);
12703 }
12704
12705 // No viable ext lowering found.
12706 return SDValue();
12707}
12708
12709/// Try to get a scalar value for a specific element of a vector.
12710///
12711/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12712 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12713 SelectionDAG &DAG) {
12714 MVT VT = V.getSimpleValueType();
12715 MVT EltVT = VT.getVectorElementType();
12716 V = peekThroughBitcasts(V);
12717
12718 // If the bitcasts shift the element size, we can't extract an equivalent
12719 // element from it.
12720 MVT NewVT = V.getSimpleValueType();
12721 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12722 return SDValue();
12723
12724 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12725 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12726 // Ensure the scalar operand is the same size as the destination.
12727 // FIXME: Add support for scalar truncation where possible.
12728 SDValue S = V.getOperand(Idx);
12729 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12730 return DAG.getBitcast(EltVT, S);
12731 }
12732
12733 return SDValue();
12734}
12735
12736/// Helper to test for a load that can be folded with x86 shuffles.
12737///
12738/// This is particularly important because the set of instructions varies
12739/// significantly based on whether the operand is a load or not.
12740 static bool isShuffleFoldableLoad(SDValue V) {
12741 return V.hasOneUse() &&
12742 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
12743 }
12744
12745template<typename T>
12746static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12747 T EltVT = VT.getScalarType();
12748 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12749 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12750}
12751
12752/// Try to lower insertion of a single element into a zero vector.
12753///
12754 /// This is a common pattern for which we have especially efficient lowerings
12755 /// across all subtarget feature sets.
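/// Illustrative example (editor's addition, not part of the original source):
/// a v4f32 shuffle <4, 1, 2, 3> that keeps lanes 1-3 of V1 in place and
/// inserts the low element of V2 into lane 0 can be lowered to MOVSS, and
/// the all-zero-V1 variant reduces to a VZEXT_MOVL of V2.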
12756 static SDValue lowerShuffleAsElementInsertion(
12757 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12758 const APInt &Zeroable, const X86Subtarget &Subtarget,
12759 SelectionDAG &DAG) {
12760 MVT ExtVT = VT;
12761 MVT EltVT = VT.getVectorElementType();
12762 unsigned NumElts = VT.getVectorNumElements();
12763 unsigned EltBits = VT.getScalarSizeInBits();
12764
12765 if (isSoftF16(EltVT, Subtarget))
12766 return SDValue();
12767
12768 int V2Index =
12769 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12770 Mask.begin();
12771 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12772 bool IsV1Zeroable = true;
12773 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12774 if (i != V2Index && !Zeroable[i]) {
12775 IsV1Zeroable = false;
12776 break;
12777 }
12778
12779 // Bail if a non-zero V1 isn't used in place.
12780 if (!IsV1Zeroable) {
12781 SmallVector<int, 8> V1Mask(Mask);
12782 V1Mask[V2Index] = -1;
12783 if (!isNoopShuffleMask(V1Mask))
12784 return SDValue();
12785 }
12786
12787 // Check for a single input from a SCALAR_TO_VECTOR node.
12788 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12789 // all the smarts here sunk into that routine. However, the current
12790 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12791 // vector shuffle lowering is dead.
12792 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12793 DAG);
12794 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12795 // We need to zext the scalar if it is smaller than an i32.
12796 V2S = DAG.getBitcast(EltVT, V2S);
12797 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12798 // Using zext to expand a narrow element won't work for non-zero
12799 // insertions. But we can use a masked constant vector if we're
12800 // inserting V2 into the bottom of V1.
12801 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12802 return SDValue();
12803
12804 // Zero-extend directly to i32.
12805 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12806 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12807
12808 // If we're inserting into a constant, mask off the inserted index
12809 // and OR with the zero-extended scalar.
12810 if (!IsV1Zeroable) {
12811 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12812 Bits[V2Index] = APInt::getZero(EltBits);
12813 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12814 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12815 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12816 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12817 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12818 }
12819 }
12820 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12821 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12822 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12823 // Either not inserting from the low element of the input or the input
12824 // element size is too small to use VZEXT_MOVL to clear the high bits.
12825 return SDValue();
12826 }
12827
12828 if (!IsV1Zeroable) {
12829 // If V1 can't be treated as a zero vector we have fewer options to lower
12830 // this. We can't support integer vectors or non-zero targets cheaply.
12831 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12832 if (!VT.isFloatingPoint() || V2Index != 0)
12833 return SDValue();
12834 if (!VT.is128BitVector())
12835 return SDValue();
12836
12837 // Otherwise, use MOVSD, MOVSS or MOVSH.
12838 unsigned MovOpc = 0;
12839 if (EltVT == MVT::f16)
12840 MovOpc = X86ISD::MOVSH;
12841 else if (EltVT == MVT::f32)
12842 MovOpc = X86ISD::MOVSS;
12843 else if (EltVT == MVT::f64)
12844 MovOpc = X86ISD::MOVSD;
12845 else
12846 llvm_unreachable("Unsupported floating point element type to handle!");
12847 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12848 }
12849
12850 // This lowering only works for the low element with floating point vectors.
12851 if (VT.isFloatingPoint() && V2Index != 0)
12852 return SDValue();
12853
12854 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12855 if (ExtVT != VT)
12856 V2 = DAG.getBitcast(VT, V2);
12857
12858 if (V2Index != 0) {
12859 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12860 // the desired position. Otherwise it is more efficient to do a vector
12861 // shift left. We know that we can do a vector shift left because all
12862 // the inputs are zero.
12863 if (VT.isFloatingPoint() || NumElts <= 4) {
12864 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12865 V2Shuffle[V2Index] = 0;
12866 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12867 } else {
12868 V2 = DAG.getBitcast(MVT::v16i8, V2);
12869 V2 = DAG.getNode(
12870 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12871 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12872 V2 = DAG.getBitcast(VT, V2);
12873 }
12874 }
12875 return V2;
12876}
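A standalone sketch (not part of this file) of the setup logic above: V2Index is the unique mask slot that reads from the second input (indices >= Mask.size()), and the simple insert-into-zero path only applies when every other slot is known zeroable. All names here are local to the example.

#include <cassert>
#include <vector>

int main() {
  std::vector<int> Mask = {4, -1, -1, -1};   // v4 shuffle taking lane 0 of V2
  std::vector<bool> Zeroable = {false, true, true, true};
  int Size = static_cast<int>(Mask.size());
  // Find the single slot sourced from V2.
  int V2Index = -1;
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= Size) {
      V2Index = i;
      break;
    }
  // Every other lane must be zeroable for the insert-into-zero lowering.
  bool IsV1Zeroable = true;
  for (int i = 0; i < Size; ++i)
    if (i != V2Index && !Zeroable[i])
      IsV1Zeroable = false;
  assert(V2Index == 0 && IsV1Zeroable);
}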
12877
12878/// Try to lower broadcast of a single - truncated - integer element,
12879/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12880///
12881/// This assumes we have AVX2.
12882static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12883 int BroadcastIdx,
12884 const X86Subtarget &Subtarget,
12885 SelectionDAG &DAG) {
12886 assert(Subtarget.hasAVX2() &&
12887 "We can only lower integer broadcasts with AVX2!");
12888
12889 MVT EltVT = VT.getVectorElementType();
12890 MVT V0VT = V0.getSimpleValueType();
12891
12892 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12893 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12894
12895 MVT V0EltVT = V0VT.getVectorElementType();
12896 if (!V0EltVT.isInteger())
12897 return SDValue();
12898
12899 const unsigned EltSize = EltVT.getSizeInBits();
12900 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12901
12902 // This is only a truncation if the original element type is larger.
12903 if (V0EltSize <= EltSize)
12904 return SDValue();
12905
12906 assert(((V0EltSize % EltSize) == 0) &&
12907 "Scalar type sizes must all be powers of 2 on x86!");
12908
12909 const unsigned V0Opc = V0.getOpcode();
12910 const unsigned Scale = V0EltSize / EltSize;
12911 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12912
12913 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12914 V0Opc != ISD::BUILD_VECTOR)
12915 return SDValue();
12916
12917 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12918
12919 // If we're extracting non-least-significant bits, shift so we can truncate.
12920 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12921 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12922 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12923 if (const int OffsetIdx = BroadcastIdx % Scale)
12924 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12925 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12926
12927 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12928 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12929}
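A standalone sketch (not from this file) of the index arithmetic used above, assuming a v16i8 broadcast taken from a v4i32 source: the wide element holding the value is BroadcastIdx / Scale, and a non-zero BroadcastIdx % Scale means the wide scalar must be shifted right before the truncate.

#include <cassert>

int main() {
  unsigned EltSize = 8, V0EltSize = 32;    // narrow vs. wide element widths
  unsigned BroadcastIdx = 6;               // narrow lane being broadcast
  unsigned Scale = V0EltSize / EltSize;    // narrow elements per wide element
  unsigned V0BroadcastIdx = BroadcastIdx / Scale; // wide lane to read
  unsigned OffsetIdx = BroadcastIdx % Scale;      // position within that lane
  assert(Scale == 4 && V0BroadcastIdx == 1 && OffsetIdx == 2);
  unsigned ShiftAmt = OffsetIdx * EltSize; // SRL amount applied before TRUNCATE
  assert(ShiftAmt == 16);
}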
12930
12931/// Test whether this can be lowered with a single SHUFPS instruction.
12932///
12933/// This is used to disable more specialized lowerings when the shufps lowering
12934/// will happen to be efficient.
12935static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12936 // This routine only handles 128-bit shufps.
12937 assert(Mask.size() == 4 && "Unsupported mask size!");
12938 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12939 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12940 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12941 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12942
12943 // To lower with a single SHUFPS we need to have the low half and high half
12944 // each requiring a single input.
12945 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12946 return false;
12947 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12948 return false;
12949
12950 return true;
12951}
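A minimal standalone sketch (not part of this file) of the predicate above, using plain std::array masks where -1 denotes an undef lane: SHUFPS reads its two low result lanes from one source and its two high result lanes from the other, so each half of the mask may only reference a single input.

#include <array>
#include <cassert>

static bool singleShufpsMask(const std::array<int, 4> &Mask) {
  // Each half may mix undefs freely, but its defined lanes must agree on
  // which of the two inputs (0..3 vs. 4..7) they come from.
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;
  return true;
}

int main() {
  assert(singleShufpsMask({0, 3, 5, 6}));   // low half from V1, high from V2
  assert(!singleShufpsMask({0, 4, 1, 5}));  // low half mixes V1 and V2
  assert(singleShufpsMask({-1, 2, 7, -1})); // undef lanes relax the check
}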
12952
12953/// Test whether the specified input (0 or 1) is in-place blended by the
12954/// given mask.
12955///
12956/// This returns true if the elements from a particular input are already in the
12957/// slot required by the given mask and require no permutation.
12958static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12959 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12960 int Size = Mask.size();
12961 for (int i = 0; i < Size; ++i)
12962 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12963 return false;
12964
12965 return true;
12966}
12967
12968/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
12969/// the given mask.
12970///
12971static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef<int> Mask,
12972 int BroadcastableElement = 0) {
12973 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12974 int Size = Mask.size();
12975 for (int i = 0; i < Size; ++i)
12976 if (Mask[i] >= 0 && Mask[i] / Size == Input &&
12977 Mask[i] % Size != BroadcastableElement)
12978 return false;
12979 return true;
12980}
12981
12982/// If we are extracting two 128-bit halves of a vector and shuffling the
12983/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12984/// multi-shuffle lowering.
12985static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12986 SDValue N1, ArrayRef<int> Mask,
12987 SelectionDAG &DAG) {
12988 MVT VT = N0.getSimpleValueType();
12989 assert((VT.is128BitVector() &&
12990 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12991 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12992
12993 // Check that both sources are extracts of the same source vector.
12994 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
 12995 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12996 N0.getOperand(0) != N1.getOperand(0) ||
12997 !N0.hasOneUse() || !N1.hasOneUse())
12998 return SDValue();
12999
13000 SDValue WideVec = N0.getOperand(0);
13001 MVT WideVT = WideVec.getSimpleValueType();
13002 if (!WideVT.is256BitVector())
13003 return SDValue();
13004
13005 // Match extracts of each half of the wide source vector. Commute the shuffle
13006 // if the extract of the low half is N1.
13007 unsigned NumElts = VT.getVectorNumElements();
13008 SmallVector<int, 4> NewMask(Mask);
13009 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13010 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13011 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
 13012 ShuffleVectorSDNode::commuteMask(NewMask);
13013 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13014 return SDValue();
13015
13016 // Final bailout: if the mask is simple, we are better off using an extract
13017 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13018 // because that avoids a constant load from memory.
13019 if (NumElts == 4 &&
13020 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
13021 return SDValue();
13022
13023 // Extend the shuffle mask with undef elements.
13024 NewMask.append(NumElts, -1);
13025
13026 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13027 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13028 NewMask);
13029 // This is free: ymm -> xmm.
13030 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13031 DAG.getVectorIdxConstant(0, DL));
13032}
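A standalone sketch (not from this file) of the mask rewrite performed above: a shuffle of the low and high 128-bit halves of a 256-bit vector X becomes a single wide shuffle of X whose appended lanes are undef, with the low half of the wide result extracted afterwards.

#include <cassert>
#include <vector>

int main() {
  std::vector<int> Mask = {0, 4, 1, 5};   // interleave the two v4f32 halves
  unsigned NumElts = 4;                   // narrow vector width
  // The narrow mask already indexes X directly (V2 lanes 4..7 are the high
  // half), so widening just appends undef lanes for the unused result half.
  std::vector<int> WideMask = Mask;
  WideMask.insert(WideMask.end(), NumElts, -1);
  assert((WideMask == std::vector<int>{0, 4, 1, 5, -1, -1, -1, -1}));
}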
13033
13034/// Try to lower broadcast of a single element.
13035///
13036/// For convenience, this code also bundles all of the subtarget feature set
13037/// filtering. While a little annoying to re-dispatch on type here, there isn't
13038/// a convenient way to factor it out.
13039static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13040 SDValue V2, ArrayRef<int> Mask,
13041 const X86Subtarget &Subtarget,
13042 SelectionDAG &DAG) {
13043 MVT EltVT = VT.getVectorElementType();
13044 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13045 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13046 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
13047 return SDValue();
13048
13049 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13050 // we can only broadcast from a register with AVX2.
13051 unsigned NumEltBits = VT.getScalarSizeInBits();
13052 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
 13053 ? X86ISD::MOVDDUP
 13054 : X86ISD::VBROADCAST;
13055 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13056
13057 // Check that the mask is a broadcast.
13058 int BroadcastIdx = getSplatIndex(Mask);
13059 if (BroadcastIdx < 0) {
13060 // Check for hidden broadcast.
13061 SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
13062 if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
13063 return SDValue();
13064 BroadcastIdx = 0;
13065 }
13066 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13067 "a sorted mask where the broadcast "
13068 "comes from V1.");
13069 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
13070
13071 // Go up the chain of (vector) values to find a scalar load that we can
13072 // combine with the broadcast.
13073 // TODO: Combine this logic with findEltLoadSrc() used by
13074 // EltsFromConsecutiveLoads().
13075 int BitOffset = BroadcastIdx * NumEltBits;
13076 SDValue V = V1;
13077 for (;;) {
13078 switch (V.getOpcode()) {
13079 case ISD::BITCAST: {
13080 V = V.getOperand(0);
13081 continue;
13082 }
13083 case ISD::CONCAT_VECTORS: {
13084 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13085 int OpIdx = BitOffset / OpBitWidth;
13086 V = V.getOperand(OpIdx);
13087 BitOffset %= OpBitWidth;
13088 continue;
13089 }
 13090 case ISD::EXTRACT_SUBVECTOR: {
13091 // The extraction index adds to the existing offset.
13092 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13093 unsigned Idx = V.getConstantOperandVal(1);
13094 unsigned BeginOffset = Idx * EltBitWidth;
13095 BitOffset += BeginOffset;
13096 V = V.getOperand(0);
13097 continue;
13098 }
13099 case ISD::INSERT_SUBVECTOR: {
13100 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13101 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13102 int Idx = (int)V.getConstantOperandVal(2);
13103 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13104 int BeginOffset = Idx * EltBitWidth;
13105 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13106 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13107 BitOffset -= BeginOffset;
13108 V = VInner;
13109 } else {
13110 V = VOuter;
13111 }
13112 continue;
13113 }
13114 }
13115 break;
13116 }
13117 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13118 BroadcastIdx = BitOffset / NumEltBits;
13119
13120 // Do we need to bitcast the source to retrieve the original broadcast index?
13121 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13122
13123 // Check if this is a broadcast of a scalar. We special case lowering
13124 // for scalars so that we can more effectively fold with loads.
13125 // If the original value has a larger element type than the shuffle, the
13126 // broadcast element is in essence truncated. Make that explicit to ease
13127 // folding.
13128 if (BitCastSrc && VT.isInteger())
13129 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13130 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13131 return TruncBroadcast;
13132
13133 // Also check the simpler case, where we can directly reuse the scalar.
13134 if (!BitCastSrc &&
13135 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13136 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13137 V = V.getOperand(BroadcastIdx);
13138
13139 // If we can't broadcast from a register, check that the input is a load.
13140 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13141 return SDValue();
13142 } else if (ISD::isNormalLoad(V.getNode()) &&
13143 cast<LoadSDNode>(V)->isSimple()) {
13144 // We do not check for one-use of the vector load because a broadcast load
13145 // is expected to be a win for code size, register pressure, and possibly
13146 // uops even if the original vector load is not eliminated.
13147
13148 // Reduce the vector load and shuffle to a broadcasted scalar load.
13149 auto *Ld = cast<LoadSDNode>(V);
13150 SDValue BaseAddr = Ld->getBasePtr();
13151 MVT SVT = VT.getScalarType();
13152 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13153 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13154 SDValue NewAddr =
 13155 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
13156
13157 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13158 // than MOVDDUP.
13159 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13160 if (Opcode == X86ISD::VBROADCAST) {
13161 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13162 SDValue Ops[] = {Ld->getChain(), NewAddr};
13163 V = DAG.getMemIntrinsicNode(
13164 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
 13165 DAG.getMachineFunction().getMachineMemOperand(
13166 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
 13167 DAG.makeEquivalentMemoryOrdering(Ld, V);
13168 return DAG.getBitcast(VT, V);
13169 }
13170 assert(SVT == MVT::f64 && "Unexpected VT!");
13171 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
 13172 DAG.getMachineFunction().getMachineMemOperand(
13173 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
 13174 DAG.makeEquivalentMemoryOrdering(Ld, V);
13175 } else if (!BroadcastFromReg) {
13176 // We can't broadcast from a vector register.
13177 return SDValue();
13178 } else if (BitOffset != 0) {
13179 // We can only broadcast from the zero-element of a vector register,
13180 // but it can be advantageous to broadcast from the zero-element of a
13181 // subvector.
13182 if (!VT.is256BitVector() && !VT.is512BitVector())
13183 return SDValue();
13184
13185 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13186 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13187 return SDValue();
13188
13189 // If we are broadcasting an element from the lowest 128-bit subvector, try
13190 // to move the element in position.
13191 if (BitOffset < 128 && NumActiveElts > 1 &&
13192 V.getScalarValueSizeInBits() == NumEltBits) {
13193 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13194 "Unexpected bit-offset");
13195 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
13196 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13197 V = extractSubVector(V, 0, DAG, DL, 128);
13198 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
13199 } else {
13200 // Only broadcast the zero-element of a 128-bit subvector.
13201 if ((BitOffset % 128) != 0)
13202 return SDValue();
13203
13204 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13205 "Unexpected bit-offset");
13206 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13207 "Unexpected vector size");
13208 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13209 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13210 }
13211 }
13212
13213 // On AVX we can use VBROADCAST directly for scalar sources.
13214 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13215 V = DAG.getBitcast(MVT::f64, V);
13216 if (Subtarget.hasAVX()) {
13217 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13218 return DAG.getBitcast(VT, V);
13219 }
13220 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13221 }
13222
13223 // If this is a scalar, do the broadcast on this type and bitcast.
13224 if (!V.getValueType().isVector()) {
13225 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13226 "Unexpected scalar size");
13227 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
 13228 VT.getSizeInBits() / NumEltBits);
13229 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13230 }
13231
13232 // We only support broadcasting from 128-bit vectors to minimize the
13233 // number of patterns we need to deal with in isel. So extract down to
13234 // 128-bits, removing as many bitcasts as possible.
13235 if (V.getValueSizeInBits() > 128)
 13236 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
13237
13238 // Otherwise cast V to a vector with the same element type as VT, but
13239 // possibly narrower than VT. Then perform the broadcast.
13240 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13241 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13242 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13243}
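A simplified standalone sketch (not from this file) of the broadcast-index bookkeeping above: determine the single source lane that a splat mask references, then convert it to the bit offset that the loop above chases through bitcasts, concats and subvector nodes.

#include <cassert>
#include <vector>

// Return the lane a splat mask references, or -1 if it is not a splat.
// Undef lanes (-1 entries) are ignored.
static int splatIndex(const std::vector<int> &Mask) {
  int Idx = -1;
  for (int M : Mask) {
    if (M < 0)
      continue;
    if (Idx >= 0 && M != Idx)
      return -1; // more than one distinct source lane
    Idx = M;
  }
  return Idx;
}

int main() {
  std::vector<int> Mask = {2, -1, 2, 2};
  int Idx = splatIndex(Mask);
  assert(Idx == 2);
  unsigned EltBits = 32;               // e.g. a v4i32 shuffle
  unsigned BitOffset = Idx * EltBits;  // where the broadcast source starts
  assert(BitOffset == 64);
}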
13244
13245// Check for whether we can use INSERTPS to perform the shuffle. We only use
13246// INSERTPS when the V1 elements are already in the correct locations
13247// because otherwise we can just always use two SHUFPS instructions which
13248// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13249// perform INSERTPS if a single V1 element is out of place and all V2
13250// elements are zeroable.
13251static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13252 unsigned &InsertPSMask,
13253 const APInt &Zeroable,
13254 ArrayRef<int> Mask, SelectionDAG &DAG) {
13255 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13256 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13257 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13258
13259 // Attempt to match INSERTPS with one element from VA or VB being
13260 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13261 // are updated.
13262 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13263 ArrayRef<int> CandidateMask) {
13264 unsigned ZMask = 0;
13265 int VADstIndex = -1;
13266 int VBDstIndex = -1;
13267 bool VAUsedInPlace = false;
13268
13269 for (int i = 0; i < 4; ++i) {
13270 // Synthesize a zero mask from the zeroable elements (includes undefs).
13271 if (Zeroable[i]) {
13272 ZMask |= 1 << i;
13273 continue;
13274 }
13275
13276 // Flag if we use any VA inputs in place.
13277 if (i == CandidateMask[i]) {
13278 VAUsedInPlace = true;
13279 continue;
13280 }
13281
13282 // We can only insert a single non-zeroable element.
13283 if (VADstIndex >= 0 || VBDstIndex >= 0)
13284 return false;
13285
13286 if (CandidateMask[i] < 4) {
13287 // VA input out of place for insertion.
13288 VADstIndex = i;
13289 } else {
13290 // VB input for insertion.
13291 VBDstIndex = i;
13292 }
13293 }
13294
13295 // Don't bother if we have no (non-zeroable) element for insertion.
13296 if (VADstIndex < 0 && VBDstIndex < 0)
13297 return false;
13298
13299 // Determine element insertion src/dst indices. The src index is from the
13300 // start of the inserted vector, not the start of the concatenated vector.
13301 unsigned VBSrcIndex = 0;
13302 if (VADstIndex >= 0) {
13303 // If we have a VA input out of place, we use VA as the V2 element
13304 // insertion and don't use the original V2 at all.
13305 VBSrcIndex = CandidateMask[VADstIndex];
13306 VBDstIndex = VADstIndex;
13307 VB = VA;
13308 } else {
13309 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13310 }
13311
13312 // If no V1 inputs are used in place, then the result is created only from
13313 // the zero mask and the V2 insertion - so remove V1 dependency.
13314 if (!VAUsedInPlace)
13315 VA = DAG.getUNDEF(MVT::v4f32);
13316
13317 // Update V1, V2 and InsertPSMask accordingly.
13318 V1 = VA;
13319 V2 = VB;
13320
13321 // Insert the V2 element into the desired position.
13322 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13323 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13324 return true;
13325 };
13326
13327 if (matchAsInsertPS(V1, V2, Mask))
13328 return true;
13329
13330 // Commute and try again.
13331 SmallVector<int, 4> CommutedMask(Mask);
 13332 ShuffleVectorSDNode::commuteMask(CommutedMask);
13333 if (matchAsInsertPS(V2, V1, CommutedMask))
13334 return true;
13335
13336 return false;
13337}
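A standalone sketch (not from this file) of the 8-bit INSERTPS control assembled above: bits [7:6] select the source lane of the second operand, bits [5:4] select the destination lane written in the result, and bits [3:0] force result lanes to zero.

#include <cassert>
#include <cstdint>

static uint8_t insertpsImm(unsigned SrcLane, unsigned DstLane, unsigned ZeroMask) {
  assert(SrcLane < 4 && DstLane < 4 && ZeroMask < 16 && "fields out of range");
  return static_cast<uint8_t>((SrcLane << 6) | (DstLane << 4) | ZeroMask);
}

int main() {
  // Insert lane 2 of the source into lane 1 of the destination, zero lane 3.
  assert(insertpsImm(2, 1, 0b1000) == 0x98);
}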
13338
13339static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13340 ArrayRef<int> Mask, const APInt &Zeroable,
13341 SelectionDAG &DAG) {
13342 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13343 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13344
13345 // Attempt to match the insertps pattern.
13346 unsigned InsertPSMask = 0;
13347 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13348 return SDValue();
13349
13350 // Insert the V2 element into the desired position.
13351 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13352 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13353}
13354
13355/// Handle lowering of 2-lane 64-bit floating point shuffles.
13356///
13357/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13358/// support for floating point shuffles but not integer shuffles. These
13359/// instructions will incur a domain crossing penalty on some chips though so
13360/// it is better to avoid lowering through this for integer vectors where
13361/// possible.
13362static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13363 const APInt &Zeroable, SDValue V1, SDValue V2,
13364 const X86Subtarget &Subtarget,
13365 SelectionDAG &DAG) {
13366 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13367 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13368 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13369
13370 if (V2.isUndef()) {
13371 // Check for being able to broadcast a single element.
13372 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13373 Mask, Subtarget, DAG))
13374 return Broadcast;
13375
13376 // Straight shuffle of a single input vector. Simulate this by using the
 13377 // single input as both of the "inputs" to this instruction.
13378 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13379
13380 if (Subtarget.hasAVX()) {
13381 // If we have AVX, we can use VPERMILPS which will allow folding a load
13382 // into the shuffle.
13383 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13384 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13385 }
13386
13387 return DAG.getNode(
13388 X86ISD::SHUFP, DL, MVT::v2f64,
13389 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13390 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13391 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13392 }
13393 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13394 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13395 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13396 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13397
13398 if (Subtarget.hasAVX2())
13399 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13400 return Extract;
13401
13402 // When loading a scalar and then shuffling it into a vector we can often do
13403 // the insertion cheaply.
 13404 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13405 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13406 return Insertion;
13407 // Try inverting the insertion since for v2 masks it is easy to do and we
13408 // can't reliably sort the mask one way or the other.
13409 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13410 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
 13411 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13412 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13413 return Insertion;
13414
13415 // Try to use one of the special instruction patterns to handle two common
13416 // blend patterns if a zero-blend above didn't work.
13417 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13418 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13419 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13420 // We can either use a special instruction to load over the low double or
13421 // to move just the low double.
13422 return DAG.getNode(
13423 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13424 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13425
13426 if (Subtarget.hasSSE41())
13427 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13428 Zeroable, Subtarget, DAG))
13429 return Blend;
13430
13431 // Use dedicated unpack instructions for masks that match their pattern.
13432 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13433 return V;
13434
13435 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13436 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13437 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13438}
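A standalone sketch (not from this file) of the 2-bit SHUFPD immediate computed at the end of the routine above: after canonicalization Mask[0] selects a lane of V1 (0 or 1) and Mask[1] a lane of V2 (2 or 3); bit 0 of the immediate picks V1's lane and bit 1 picks V2's lane.

#include <cassert>
#include <cstdint>

static uint8_t shufpdImm(int Mask0, int Mask1) {
  assert(Mask0 >= 0 && Mask0 < 2 && Mask1 >= 2 && Mask1 < 4 && "bad v2f64 mask");
  return static_cast<uint8_t>((Mask0 == 1) | (((Mask1 - 2) == 1) << 1));
}

int main() {
  assert(shufpdImm(0, 2) == 0); // low lane of V1, low lane of V2
  assert(shufpdImm(1, 3) == 3); // high lane of each input
}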
13439
13440/// Handle lowering of 2-lane 64-bit integer shuffles.
13441///
13442/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13443/// the integer unit to minimize domain crossing penalties. However, for blends
13444/// it falls back to the floating point shuffle operation with appropriate bit
13445/// casting.
13446static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13447 const APInt &Zeroable, SDValue V1, SDValue V2,
13448 const X86Subtarget &Subtarget,
13449 SelectionDAG &DAG) {
13450 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13451 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13452 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13453
13454 if (V2.isUndef()) {
13455 // Check for being able to broadcast a single element.
13456 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13457 Mask, Subtarget, DAG))
13458 return Broadcast;
13459
13460 // Straight shuffle of a single input vector. For everything from SSE2
13461 // onward this has a single fast instruction with no scary immediates.
13462 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13463 V1 = DAG.getBitcast(MVT::v4i32, V1);
13464 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13465 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13466 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13467 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13468 return DAG.getBitcast(
13469 MVT::v2i64,
13470 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13471 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13472 }
13473 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13474 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13475 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13476 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13477
13478 if (Subtarget.hasAVX2())
13479 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13480 return Extract;
13481
13482 // Try to use shift instructions.
13483 if (SDValue Shift =
13484 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13485 DAG, /*BitwiseOnly*/ false))
13486 return Shift;
13487
13488 // When loading a scalar and then shuffling it into a vector we can often do
13489 // the insertion cheaply.
 13490 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13491 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13492 return Insertion;
13493 // Try inverting the insertion since for v2 masks it is easy to do and we
13494 // can't reliably sort the mask one way or the other.
13495 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
 13496 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13497 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13498 return Insertion;
13499
13500 // We have different paths for blend lowering, but they all must use the
13501 // *exact* same predicate.
13502 bool IsBlendSupported = Subtarget.hasSSE41();
13503 if (IsBlendSupported)
13504 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13505 Zeroable, Subtarget, DAG))
13506 return Blend;
13507
13508 // Use dedicated unpack instructions for masks that match their pattern.
13509 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13510 return V;
13511
13512 // Try to use byte rotation instructions.
 13513 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13514 if (Subtarget.hasSSSE3()) {
13515 if (Subtarget.hasVLX())
13516 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13517 Zeroable, Subtarget, DAG))
13518 return Rotate;
13519
13520 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13521 Subtarget, DAG))
13522 return Rotate;
13523 }
13524
13525 // If we have direct support for blends, we should lower by decomposing into
13526 // a permute. That will be faster than the domain cross.
13527 if (IsBlendSupported)
13528 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13529 Zeroable, Subtarget, DAG);
13530
13531 // We implement this with SHUFPD which is pretty lame because it will likely
13532 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13533 // However, all the alternatives are still more cycles and newer chips don't
13534 // have this problem. It would be really nice if x86 had better shuffles here.
13535 V1 = DAG.getBitcast(MVT::v2f64, V1);
13536 V2 = DAG.getBitcast(MVT::v2f64, V2);
13537 return DAG.getBitcast(MVT::v2i64,
13538 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13539}
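A standalone sketch (not from this file) of the mask widening used above when a single-input v2i64 shuffle is rewritten as a v4i32 PSHUFD: each 64-bit lane index expands to the pair of 32-bit lane indices it covers, and undef (-1) stays undef.

#include <array>
#include <cassert>

static std::array<int, 4> widenV2ToV4(int M0, int M1) {
  return {M0 < 0 ? -1 : M0 * 2, M0 < 0 ? -1 : M0 * 2 + 1,
          M1 < 0 ? -1 : M1 * 2, M1 < 0 ? -1 : M1 * 2 + 1};
}

int main() {
  assert((widenV2ToV4(1, 0) == std::array<int, 4>{2, 3, 0, 1}));
  assert((widenV2ToV4(-1, 1) == std::array<int, 4>{-1, -1, 2, 3}));
}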
13540
13541/// Lower a vector shuffle using the SHUFPS instruction.
13542///
13543/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13544/// It makes no assumptions about whether this is the *best* lowering, it simply
13545/// uses it.
13546static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13547 ArrayRef<int> Mask, SDValue V1,
13548 SDValue V2, SelectionDAG &DAG) {
13549 SDValue LowV = V1, HighV = V2;
13550 SmallVector<int, 4> NewMask(Mask);
13551 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13552
13553 if (NumV2Elements == 1) {
13554 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13555
13556 // Compute the index adjacent to V2Index and in the same half by toggling
13557 // the low bit.
13558 int V2AdjIndex = V2Index ^ 1;
13559
13560 if (Mask[V2AdjIndex] < 0) {
13561 // Handles all the cases where we have a single V2 element and an undef.
13562 // This will only ever happen in the high lanes because we commute the
13563 // vector otherwise.
13564 if (V2Index < 2)
13565 std::swap(LowV, HighV);
13566 NewMask[V2Index] -= 4;
13567 } else {
13568 // Handle the case where the V2 element ends up adjacent to a V1 element.
13569 // To make this work, blend them together as the first step.
13570 int V1Index = V2AdjIndex;
13571 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13572 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13573 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13574
13575 // Now proceed to reconstruct the final blend as we have the necessary
13576 // high or low half formed.
13577 if (V2Index < 2) {
13578 LowV = V2;
13579 HighV = V1;
13580 } else {
13581 HighV = V2;
13582 }
13583 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13584 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13585 }
13586 } else if (NumV2Elements == 2) {
13587 if (Mask[0] < 4 && Mask[1] < 4) {
13588 // Handle the easy case where we have V1 in the low lanes and V2 in the
13589 // high lanes.
13590 NewMask[2] -= 4;
13591 NewMask[3] -= 4;
13592 } else if (Mask[2] < 4 && Mask[3] < 4) {
13593 // We also handle the reversed case because this utility may get called
13594 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13595 // arrange things in the right direction.
13596 NewMask[0] -= 4;
13597 NewMask[1] -= 4;
13598 HighV = V1;
13599 LowV = V2;
13600 } else {
13601 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13602 // trying to place elements directly, just blend them and set up the final
13603 // shuffle to place them.
13604
13605 // The first two blend mask elements are for V1, the second two are for
13606 // V2.
13607 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13608 Mask[2] < 4 ? Mask[2] : Mask[3],
13609 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13610 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13611 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13612 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13613
13614 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13615 // a blend.
13616 LowV = HighV = V1;
13617 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13618 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13619 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13620 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13621 }
13622 } else if (NumV2Elements == 3) {
13623 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
 13624 // we can get here due to other paths (e.g. repeated mask matching) that we
13625 // don't want to do another round of lowerVECTOR_SHUFFLE.
 13626 ShuffleVectorSDNode::commuteMask(NewMask);
13627 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13628 }
13629 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13630 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13631}
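A simplified standalone sketch (not from this file) of how a canonical 4-lane mask (values 0..3, -1 for undef) is packed into the 8-bit immediate consumed by SHUFPS/PSHUFD/PSHUFLW/PSHUFHW: lane i of the result takes source lane (imm >> (2*i)) & 3. Undef lanes are simply encoded as 0 here; the in-tree helper also splats single-element masks to aid broadcast matching.

#include <array>
#include <cassert>
#include <cstdint>

static uint8_t v4ShuffleImm8(const std::array<int, 4> &Mask) {
  uint8_t Imm = 0;
  for (int i = 0; i != 4; ++i) {
    assert(Mask[i] >= -1 && Mask[i] < 4 && "expected an in-range or undef lane");
    // Two bits per result lane, lane 0 in the low bits.
    Imm |= static_cast<uint8_t>((Mask[i] < 0 ? 0 : Mask[i]) << (2 * i));
  }
  return Imm;
}

int main() {
  assert(v4ShuffleImm8({0, 1, 2, 3}) == 0xE4); // identity shuffle
  assert(v4ShuffleImm8({3, 2, 1, 0}) == 0x1B); // reverse the lanes
  assert(v4ShuffleImm8({0, 0, 0, 0}) == 0x00); // broadcast lane 0
}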
13632
13633/// Lower 4-lane 32-bit floating point shuffles.
13634///
13635/// Uses instructions exclusively from the floating point unit to minimize
13636/// domain crossing penalties, as these are sufficient to implement all v4f32
13637/// shuffles.
13638static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13639 const APInt &Zeroable, SDValue V1, SDValue V2,
13640 const X86Subtarget &Subtarget,
13641 SelectionDAG &DAG) {
13642 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13643 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13644 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13645
13646 if (Subtarget.hasSSE41())
13647 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13648 Zeroable, Subtarget, DAG))
13649 return Blend;
13650
13651 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13652
13653 if (NumV2Elements == 0) {
13654 // Check for being able to broadcast a single element.
13655 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13656 Mask, Subtarget, DAG))
13657 return Broadcast;
13658
13659 // Use even/odd duplicate instructions for masks that match their pattern.
13660 if (Subtarget.hasSSE3()) {
13661 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13662 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13663 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13664 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13665 }
13666
13667 if (Subtarget.hasAVX()) {
13668 // If we have AVX, we can use VPERMILPS which will allow folding a load
13669 // into the shuffle.
13670 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13671 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13672 }
13673
13674 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13675 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13676 if (!Subtarget.hasSSE2()) {
13677 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13678 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13679 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13680 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13681 }
13682
13683 // Otherwise, use a straight shuffle of a single input vector. We pass the
13684 // input vector to both operands to simulate this with a SHUFPS.
13685 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13686 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13687 }
13688
13689 if (Subtarget.hasSSE2())
 13690 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13691 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13692 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13693 return ZExt;
13694 }
13695
13696 if (Subtarget.hasAVX2())
13697 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13698 return Extract;
13699
13700 // There are special ways we can lower some single-element blends. However, we
13701 // have custom ways we can lower more complex single-element blends below that
13702 // we defer to if both this and BLENDPS fail to match, so restrict this to
13703 // when the V2 input is targeting element 0 of the mask -- that is the fast
13704 // case here.
13705 if (NumV2Elements == 1 && Mask[0] >= 4)
 13706 if (SDValue V = lowerShuffleAsElementInsertion(
13707 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13708 return V;
13709
13710 if (Subtarget.hasSSE41()) {
13711 // Use INSERTPS if we can complete the shuffle efficiently.
13712 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13713 return V;
13714
13715 if (!isSingleSHUFPSMask(Mask))
13716 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13717 V2, Mask, DAG))
13718 return BlendPerm;
13719 }
13720
13721 // Use low/high mov instructions. These are only valid in SSE1 because
13722 // otherwise they are widened to v2f64 and never get here.
13723 if (!Subtarget.hasSSE2()) {
13724 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13725 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13726 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13727 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13728 }
13729
13730 // Use dedicated unpack instructions for masks that match their pattern.
13731 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13732 return V;
13733
13734 // Otherwise fall back to a SHUFPS lowering strategy.
13735 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13736}
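A simplified standalone sketch (not from this file) of the shuffle-equivalence test behind the MOVSLDUP/MOVSHDUP special cases above: a mask matches a reference pattern if every defined lane agrees with it, while undef (-1) lanes match anything. The in-tree isShuffleEquivalent is more general (it also accounts for identical operands).

#include <array>
#include <cassert>

static bool matchesPattern(const std::array<int, 4> &Mask,
                           const std::array<int, 4> &Pattern) {
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0 && Mask[i] != Pattern[i])
      return false;
  return true;
}

int main() {
  assert(matchesPattern({0, -1, 2, 2}, {0, 0, 2, 2}));  // lowers to MOVSLDUP
  assert(!matchesPattern({1, 1, 3, 2}, {1, 1, 3, 3}));  // not MOVSHDUP
}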
13737
13738/// Lower 4-lane i32 vector shuffles.
13739///
13740/// We try to handle these with integer-domain shuffles where we can, but for
13741/// blends we use the floating point domain blend instructions.
13742static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13743 const APInt &Zeroable, SDValue V1, SDValue V2,
13744 const X86Subtarget &Subtarget,
13745 SelectionDAG &DAG) {
13746 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13747 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13748 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13749
13750 // Whenever we can lower this as a zext, that instruction is strictly faster
13751 // than any alternative. It also allows us to fold memory operands into the
13752 // shuffle in many cases.
13753 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13754 Zeroable, Subtarget, DAG))
13755 return ZExt;
13756
13757 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13758
13759 // Try to use shift instructions if fast.
13760 if (Subtarget.preferLowerShuffleAsShift()) {
13761 if (SDValue Shift =
13762 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13763 Subtarget, DAG, /*BitwiseOnly*/ true))
13764 return Shift;
13765 if (NumV2Elements == 0)
13766 if (SDValue Rotate =
13767 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13768 return Rotate;
13769 }
13770
13771 if (NumV2Elements == 0) {
13772 // Try to use broadcast unless the mask only has one non-undef element.
13773 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13774 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13775 Mask, Subtarget, DAG))
13776 return Broadcast;
13777 }
13778
13779 // Straight shuffle of a single input vector. For everything from SSE2
13780 // onward this has a single fast instruction with no scary immediates.
13781 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13782 // but we aren't actually going to use the UNPCK instruction because doing
13783 // so prevents folding a load into this instruction or making a copy.
13784 const int UnpackLoMask[] = {0, 0, 1, 1};
13785 const int UnpackHiMask[] = {2, 2, 3, 3};
13786 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13787 Mask = UnpackLoMask;
13788 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13789 Mask = UnpackHiMask;
13790
13791 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13792 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13793 }
13794
13795 if (Subtarget.hasAVX2())
13796 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13797 return Extract;
13798
13799 // Try to use shift instructions.
13800 if (SDValue Shift =
13801 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13802 DAG, /*BitwiseOnly*/ false))
13803 return Shift;
13804
13805 // There are special ways we can lower some single-element blends.
13806 if (NumV2Elements == 1)
 13807 if (SDValue V = lowerShuffleAsElementInsertion(
13808 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13809 return V;
13810
13811 // We have different paths for blend lowering, but they all must use the
13812 // *exact* same predicate.
13813 bool IsBlendSupported = Subtarget.hasSSE41();
13814 if (IsBlendSupported)
13815 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13816 Zeroable, Subtarget, DAG))
13817 return Blend;
13818
13819 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13820 Zeroable, Subtarget, DAG))
13821 return Masked;
13822
13823 // Use dedicated unpack instructions for masks that match their pattern.
13824 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
13825 return V;
13826
13827 // Try to use byte rotation instructions.
 13828 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13829 if (Subtarget.hasSSSE3()) {
13830 if (Subtarget.hasVLX())
13831 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13832 Zeroable, Subtarget, DAG))
13833 return Rotate;
13834
13835 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13836 Subtarget, DAG))
13837 return Rotate;
13838 }
13839
13840 // Assume that a single SHUFPS is faster than an alternative sequence of
13841 // multiple instructions (even if the CPU has a domain penalty).
13842 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13843 if (!isSingleSHUFPSMask(Mask)) {
13844 // If we have direct support for blends, we should lower by decomposing into
13845 // a permute. That will be faster than the domain cross.
13846 if (IsBlendSupported)
13847 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13848 Zeroable, Subtarget, DAG);
13849
13850 // Try to lower by permuting the inputs into an unpack instruction.
13851 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13852 Mask, Subtarget, DAG))
13853 return Unpack;
13854 }
13855
13856 // We implement this with SHUFPS because it can blend from two vectors.
13857 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13858 // up the inputs, bypassing domain shift penalties that we would incur if we
13859 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13860 // relevant.
13861 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13862 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13863 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13864 return DAG.getBitcast(MVT::v4i32, ShufPS);
13865}
13866
13867/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13868/// shuffle lowering, and the most complex part.
13869///
13870/// The lowering strategy is to try to form pairs of input lanes which are
13871/// targeted at the same half of the final vector, and then use a dword shuffle
13872/// to place them onto the right half, and finally unpack the paired lanes into
13873/// their final position.
13874///
13875/// The exact breakdown of how to form these dword pairs and align them on the
13876/// correct sides is really tricky. See the comments within the function for
13877/// more of the details.
13878///
13879/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13880/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13881/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13882/// vector, form the analogous 128-bit 8-element Mask.
13883static SDValue lowerV8I16GeneralSingleInputShuffle(
13884 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13885 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13886 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13887 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13888
13889 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13890 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13891 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13892
13893 // Attempt to directly match PSHUFLW or PSHUFHW.
13894 if (isUndefOrInRange(LoMask, 0, 4) &&
13895 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13896 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13897 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13898 }
13899 if (isUndefOrInRange(HiMask, 4, 8) &&
13900 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13901 for (int i = 0; i != 4; ++i)
13902 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13903 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13904 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13905 }
13906
13907 SmallVector<int, 4> LoInputs;
13908 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13909 array_pod_sort(LoInputs.begin(), LoInputs.end());
13910 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13911 SmallVector<int, 4> HiInputs;
13912 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13913 array_pod_sort(HiInputs.begin(), HiInputs.end());
13914 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13915 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13916 int NumHToL = LoInputs.size() - NumLToL;
13917 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13918 int NumHToH = HiInputs.size() - NumLToH;
13919 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13920 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13921 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13922 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13923
13924 // If we are shuffling values from one half - check how many different DWORD
13925 // pairs we need to create. If only 1 or 2 then we can perform this as a
13926 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13927 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13928 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13929 V = DAG.getNode(ShufWOp, DL, VT, V,
13930 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13931 V = DAG.getBitcast(PSHUFDVT, V);
13932 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13933 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13934 return DAG.getBitcast(VT, V);
13935 };
13936
13937 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13938 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13939 SmallVector<std::pair<int, int>, 4> DWordPairs;
13940 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13941
13942 // Collect the different DWORD pairs.
13943 for (int DWord = 0; DWord != 4; ++DWord) {
13944 int M0 = Mask[2 * DWord + 0];
13945 int M1 = Mask[2 * DWord + 1];
13946 M0 = (M0 >= 0 ? M0 % 4 : M0);
13947 M1 = (M1 >= 0 ? M1 % 4 : M1);
13948 if (M0 < 0 && M1 < 0)
13949 continue;
13950
13951 bool Match = false;
13952 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13953 auto &DWordPair = DWordPairs[j];
13954 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13955 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13956 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13957 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13958 PSHUFDMask[DWord] = DOffset + j;
13959 Match = true;
13960 break;
13961 }
13962 }
13963 if (!Match) {
13964 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13965 DWordPairs.push_back(std::make_pair(M0, M1));
13966 }
13967 }
13968
13969 if (DWordPairs.size() <= 2) {
13970 DWordPairs.resize(2, std::make_pair(-1, -1));
13971 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13972 DWordPairs[1].first, DWordPairs[1].second};
13973 // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
13974 if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
13975 ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
13976 int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
13977 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
13978 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
13979 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
13980 }
13981 if ((NumHToL + NumHToH) == 0)
13982 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13983 if ((NumLToL + NumLToH) == 0)
13984 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13985 }
13986 }
13987
13988 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13989 // such inputs we can swap two of the dwords across the half mark and end up
13990 // with <=2 inputs to each half in each half. Once there, we can fall through
13991 // to the generic code below. For example:
13992 //
13993 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13994 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13995 //
13996 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13997 // and an existing 2-into-2 on the other half. In this case we may have to
13998 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13999 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14000 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14001 // because any other situation (including a 3-into-1 or 1-into-3 in the other
14002 // half than the one we target for fixing) will be fixed when we re-enter this
14003 // path. We will also combine away any sequence of PSHUFD instructions that
14004 // result into a single instruction. Here is an example of the tricky case:
14005 //
14006 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14007 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14008 //
14009 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14010 //
14011 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14012 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14013 //
14014 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14015 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14016 //
14017 // The result is fine to be handled by the generic logic.
14018 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14019 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14020 int AOffset, int BOffset) {
14021 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14022 "Must call this with A having 3 or 1 inputs from the A half.");
14023 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14024 "Must call this with B having 1 or 3 inputs from the B half.");
14025 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14026 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14027
14028 bool ThreeAInputs = AToAInputs.size() == 3;
14029
14030 // Compute the index of dword with only one word among the three inputs in
14031 // a half by taking the sum of the half with three inputs and subtracting
14032 // the sum of the actual three inputs. The difference is the remaining
14033 // slot.
14034 int ADWord = 0, BDWord = 0;
14035 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14036 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14037 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14038 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14039 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14040 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14041 int TripleNonInputIdx =
14042 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14043 TripleDWord = TripleNonInputIdx / 2;
14044
14045 // We use xor with one to compute the adjacent DWord to whichever one the
14046 // OneInput is in.
14047 OneInputDWord = (OneInput / 2) ^ 1;
14048
14049 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14050 // and BToA inputs. If there is also such a problem with the BToB and AToB
14051 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14052 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14053 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14054 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14055 // Compute how many inputs will be flipped by swapping these DWords. We
14056 // need
14057 // to balance this to ensure we don't form a 3-1 shuffle in the other
14058 // half.
14059 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14060 llvm::count(AToBInputs, 2 * ADWord + 1);
14061 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14062 llvm::count(BToBInputs, 2 * BDWord + 1);
14063 if ((NumFlippedAToBInputs == 1 &&
14064 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14065 (NumFlippedBToBInputs == 1 &&
14066 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14067 // We choose whether to fix the A half or B half based on whether that
14068 // half has zero flipped inputs. At zero, we may not be able to fix it
14069 // with that half. We also bias towards fixing the B half because that
14070 // will more commonly be the high half, and we have to bias one way.
14071 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14072 ArrayRef<int> Inputs) {
14073 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14074 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14075 // Determine whether the free index is in the flipped dword or the
14076 // unflipped dword based on where the pinned index is. We use this bit
14077 // in an xor to conditionally select the adjacent dword.
14078 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14079 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14080 if (IsFixIdxInput == IsFixFreeIdxInput)
14081 FixFreeIdx += 1;
14082 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14083 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14084 "We need to be changing the number of flipped inputs!");
14085 int PSHUFHalfMask[] = {0, 1, 2, 3};
14086 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14087 V = DAG.getNode(
14088 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14089 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14090 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14091
14092 for (int &M : Mask)
14093 if (M >= 0 && M == FixIdx)
14094 M = FixFreeIdx;
14095 else if (M >= 0 && M == FixFreeIdx)
14096 M = FixIdx;
14097 };
14098 if (NumFlippedBToBInputs != 0) {
14099 int BPinnedIdx =
14100 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14101 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14102 } else {
14103 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14104 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14105 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14106 }
14107 }
14108 }
14109
14110 int PSHUFDMask[] = {0, 1, 2, 3};
14111 PSHUFDMask[ADWord] = BDWord;
14112 PSHUFDMask[BDWord] = ADWord;
14113 V = DAG.getBitcast(
14114 VT,
14115 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14116 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14117
14118 // Adjust the mask to match the new locations of A and B.
14119 for (int &M : Mask)
14120 if (M >= 0 && M/2 == ADWord)
14121 M = 2 * BDWord + M % 2;
14122 else if (M >= 0 && M/2 == BDWord)
14123 M = 2 * ADWord + M % 2;
14124
14125 // Recurse back into this routine to re-compute state now that this isn't
14126 // a 3 and 1 problem.
14127 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14128 };
14129 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14130 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14131 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14132 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14133
14134 // At this point there are at most two inputs to the low and high halves from
14135 // each half. That means the inputs can always be grouped into dwords and
14136 // those dwords can then be moved to the correct half with a dword shuffle.
14137 // We use at most one low and one high word shuffle to collect these paired
14138 // inputs into dwords, and finally a dword shuffle to place them.
14139 int PSHUFLMask[4] = {-1, -1, -1, -1};
14140 int PSHUFHMask[4] = {-1, -1, -1, -1};
14141 int PSHUFDMask[4] = {-1, -1, -1, -1};
14142
14143 // First fix the masks for all the inputs that are staying in their
14144 // original halves. This will then dictate the targets of the cross-half
14145 // shuffles.
14146 auto fixInPlaceInputs =
14147 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14148 MutableArrayRef<int> SourceHalfMask,
14149 MutableArrayRef<int> HalfMask, int HalfOffset) {
14150 if (InPlaceInputs.empty())
14151 return;
14152 if (InPlaceInputs.size() == 1) {
14153 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14154 InPlaceInputs[0] - HalfOffset;
14155 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14156 return;
14157 }
14158 if (IncomingInputs.empty()) {
14159 // Just fix all of the in place inputs.
14160 for (int Input : InPlaceInputs) {
14161 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14162 PSHUFDMask[Input / 2] = Input / 2;
14163 }
14164 return;
14165 }
14166
14167 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14168 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14169 InPlaceInputs[0] - HalfOffset;
14170 // Put the second input next to the first so that they are packed into
14171 // a dword. We find the adjacent index by toggling the low bit.
14172 int AdjIndex = InPlaceInputs[0] ^ 1;
14173 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14174 llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
14175 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14176 };
14177 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14178 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14179
14180 // Now gather the cross-half inputs and place them into a free dword of
14181 // their target half.
14182 // FIXME: This operation could almost certainly be simplified dramatically to
14183 // look more like the 3-1 fixing operation.
14184 auto moveInputsToRightHalf = [&PSHUFDMask](
14185 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14186 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14187 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14188 int DestOffset) {
14189 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14190 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14191 };
14192 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14193 int Word) {
14194 int LowWord = Word & ~1;
14195 int HighWord = Word | 1;
14196 return isWordClobbered(SourceHalfMask, LowWord) ||
14197 isWordClobbered(SourceHalfMask, HighWord);
14198 };
14199
14200 if (IncomingInputs.empty())
14201 return;
14202
14203 if (ExistingInputs.empty()) {
14204 // Map any dwords with inputs from them into the right half.
14205 for (int Input : IncomingInputs) {
14206 // If the source half mask maps over the inputs, turn those into
14207 // swaps and use the swapped lane.
14208 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14209 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14210 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14211 Input - SourceOffset;
14212 // We have to swap the uses in our half mask in one sweep.
14213 for (int &M : HalfMask)
14214 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14215 M = Input;
14216 else if (M == Input)
14217 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14218 } else {
14219 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14220 Input - SourceOffset &&
14221 "Previous placement doesn't match!");
14222 }
14223 // Note that this correctly re-maps both when we do a swap and when
14224 // we observe the other side of the swap above. We rely on that to
14225 // avoid swapping the members of the input list directly.
14226 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14227 }
14228
14229 // Map the input's dword into the correct half.
14230 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14231 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14232 else
14233 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14234 Input / 2 &&
14235 "Previous placement doesn't match!");
14236 }
14237
14238 // And just directly shift any other-half mask elements to be same-half
14239 // as we will have mirrored the dword containing the element into the
14240 // same position within that half.
14241 for (int &M : HalfMask)
14242 if (M >= SourceOffset && M < SourceOffset + 4) {
14243 M = M - SourceOffset + DestOffset;
14244 assert(M >= 0 && "This should never wrap below zero!");
14245 }
14246 return;
14247 }
14248
14249 // Ensure we have the input in a viable dword of its current half. This
14250 // is particularly tricky because the original position may be clobbered
14251 // by inputs being moved and *staying* in that half.
14252 if (IncomingInputs.size() == 1) {
14253 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14254 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14255 SourceOffset;
14256 SourceHalfMask[InputFixed - SourceOffset] =
14257 IncomingInputs[0] - SourceOffset;
14258 llvm::replace(HalfMask, IncomingInputs[0], InputFixed);
14259 IncomingInputs[0] = InputFixed;
14260 }
14261 } else if (IncomingInputs.size() == 2) {
14262 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14263 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14264 // We have two non-adjacent or clobbered inputs we need to extract from
14265 // the source half. To do this, we need to map them into some adjacent
14266 // dword slot in the source mask.
14267 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14268 IncomingInputs[1] - SourceOffset};
14269
14270 // If there is a free slot in the source half mask adjacent to one of
14271 // the inputs, place the other input in it. We use (Index XOR 1) to
14272 // compute an adjacent index.
14273 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14274 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14275 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14276 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14277 InputsFixed[1] = InputsFixed[0] ^ 1;
14278 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14279 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14280 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14281 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14282 InputsFixed[0] = InputsFixed[1] ^ 1;
14283 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14284 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14285 // The two inputs are in the same DWord but it is clobbered and the
14286 // adjacent DWord isn't used at all. Move both inputs to the free
14287 // slot.
14288 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14289 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14290 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14291 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14292 } else {
14293 // The only way we hit this point is if there is no clobbering
14294 // (because there are no off-half inputs to this half) and there is no
14295 // free slot adjacent to one of the inputs. In this case, we have to
14296 // swap an input with a non-input.
14297 for (int i = 0; i < 4; ++i)
14298 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14299 "We can't handle any clobbers here!");
14300 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14301 "Cannot have adjacent inputs here!");
14302
14303 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14304 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14305
14306 // We also have to update the final source mask in this case because
14307 // it may need to undo the above swap.
14308 for (int &M : FinalSourceHalfMask)
14309 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14310 M = InputsFixed[1] + SourceOffset;
14311 else if (M == InputsFixed[1] + SourceOffset)
14312 M = (InputsFixed[0] ^ 1) + SourceOffset;
14313
14314 InputsFixed[1] = InputsFixed[0] ^ 1;
14315 }
14316
14317 // Point everything at the fixed inputs.
14318 for (int &M : HalfMask)
14319 if (M == IncomingInputs[0])
14320 M = InputsFixed[0] + SourceOffset;
14321 else if (M == IncomingInputs[1])
14322 M = InputsFixed[1] + SourceOffset;
14323
14324 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14325 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14326 }
14327 } else {
14328 llvm_unreachable("Unhandled input size!");
14329 }
14330
14331 // Now hoist the DWord down to the right half.
14332 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14333 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14334 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14335 for (int &M : HalfMask)
14336 for (int Input : IncomingInputs)
14337 if (M == Input)
14338 M = FreeDWord * 2 + Input % 2;
14339 };
14340 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14341 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14342 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14343 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14344
14345 // Now enact all the shuffles we've computed to move the inputs into their
14346 // target half.
14347 if (!isNoopShuffleMask(PSHUFLMask))
14348 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14349 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14350 if (!isNoopShuffleMask(PSHUFHMask))
14351 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14352 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14353 if (!isNoopShuffleMask(PSHUFDMask))
14354 V = DAG.getBitcast(
14355 VT,
14356 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14357 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14358
14359 // At this point, each half should contain all its inputs, and we can then
14360 // just shuffle them into their final position.
14361 assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
14362 "Failed to lift all the high half inputs to the low mask!");
14363 assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14364 "Failed to lift all the low half inputs to the high mask!");
14365
14366 // Do a half shuffle for the low mask.
14367 if (!isNoopShuffleMask(LoMask))
14368 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14369 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14370
14371 // Do a half shuffle with the high mask after shifting its values down.
14372 for (int &M : HiMask)
14373 if (M >= 0)
14374 M -= 4;
14375 if (!isNoopShuffleMask(HiMask))
14376 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14377 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14378
14379 return V;
14380}
14381
14382/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14383/// blend if only one input is used.
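/// As an illustrative sketch: for VT = MVT::v8i16 and a blend mask such as
/// <0,9,2,11,4,13,6,15>, the scale is 2 bytes per mask element and the
/// per-byte PSHUFB controls become roughly
///   V1Mask = {0,1, 0x80,0x80, 4,5, 0x80,0x80, 8,9, 0x80,0x80, 12,13, 0x80,0x80}
///   V2Mask = {0x80,0x80, 2,3, 0x80,0x80, 6,7, 0x80,0x80, 10,11, 0x80,0x80, 14,15}
/// where 0x80 zeroes the byte. Each used input is shuffled with PSHUFB and the
/// two results are ORed together to form the blend.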
14384static SDValue lowerShuffleAsBlendOfPSHUFBs(
14385 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14386 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14388 "Lane crossing shuffle masks not supported");
14389
14390 int NumBytes = VT.getSizeInBits() / 8;
14391 int Size = Mask.size();
14392 int Scale = NumBytes / Size;
14393
14394 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14395 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14396 V1InUse = false;
14397 V2InUse = false;
14398
14399 for (int i = 0; i < NumBytes; ++i) {
14400 int M = Mask[i / Scale];
14401 if (M < 0)
14402 continue;
14403
14404 const int ZeroMask = 0x80;
14405 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14406 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14407 if (Zeroable[i / Scale])
14408 V1Idx = V2Idx = ZeroMask;
14409
14410 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14411 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14412 V1InUse |= (ZeroMask != V1Idx);
14413 V2InUse |= (ZeroMask != V2Idx);
14414 }
14415
14416 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14417 if (V1InUse)
14418 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14419 DAG.getBuildVector(ShufVT, DL, V1Mask));
14420 if (V2InUse)
14421 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14422 DAG.getBuildVector(ShufVT, DL, V2Mask));
14423
14424 // If we need shuffled inputs from both, blend the two.
14425 SDValue V;
14426 if (V1InUse && V2InUse)
14427 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14428 else
14429 V = V1InUse ? V1 : V2;
14430
14431 // Cast the result back to the correct type.
14432 return DAG.getBitcast(VT, V);
14433}
14434
14435/// Generic lowering of 8-lane i16 shuffles.
14436///
14437/// This handles both single-input shuffles and combined shuffle/blends with
14438/// two inputs. The single input shuffles are immediately delegated to
14439/// a dedicated lowering routine.
14440///
14441/// The blends are lowered in one of three fundamental ways. If there are few
14442/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14443/// of the input is significantly cheaper when lowered as an interleaving of
14444/// the two inputs, try to interleave them. Otherwise, blend the low and high
14445/// halves of the inputs separately (making them have relatively few inputs)
14446/// and then concatenate them.
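/// As an illustrative sketch: a single-input mask such as <0,0,1,1,2,2,3,3> is
/// delegated to the single-input routine above, while a two-input mask such as
/// <0,8,1,9,2,10,3,11> matches the interleave pattern and lowers directly to a
/// PUNPCKLWD of the two inputs.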
14447static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14448 const APInt &Zeroable, SDValue V1, SDValue V2,
14449 const X86Subtarget &Subtarget,
14450 SelectionDAG &DAG) {
14451 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14452 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14453 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14454
14455 // Whenever we can lower this as a zext, that instruction is strictly faster
14456 // than any alternative.
14457 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14458 Zeroable, Subtarget, DAG))
14459 return ZExt;
14460
14461 // Try to lower using a truncation.
14462 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14463 Subtarget, DAG))
14464 return V;
14465
14466 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14467
14468 if (NumV2Inputs == 0) {
14469 // Try to use shift instructions.
14470 if (SDValue Shift =
14471 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14472 Subtarget, DAG, /*BitwiseOnly*/ false))
14473 return Shift;
14474
14475 // Check for being able to broadcast a single element.
14476 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14477 Mask, Subtarget, DAG))
14478 return Broadcast;
14479
14480 // Try to use bit rotation instructions.
14481 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14482 Subtarget, DAG))
14483 return Rotate;
14484
14485 // Use dedicated unpack instructions for masks that match their pattern.
14486 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14487 return V;
14488
14489 // Use dedicated pack instructions for masks that match their pattern.
14490 if (SDValue V =
14491 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14492 return V;
14493
14494 // Try to use byte rotation instructions.
14495 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14496 Subtarget, DAG))
14497 return Rotate;
14498
14499 // Make a copy of the mask so it can be modified.
14500 SmallVector<int, 8> MutableMask(Mask);
14501 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14502 Subtarget, DAG);
14503 }
14504
14505 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14506 "All single-input shuffles should be canonicalized to be V1-input "
14507 "shuffles.");
14508
14509 // Try to use shift instructions.
14510 if (SDValue Shift =
14511 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14512 DAG, /*BitwiseOnly*/ false))
14513 return Shift;
14514
14515 // See if we can use SSE4A Extraction / Insertion.
14516 if (Subtarget.hasSSE4A())
14517 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14518 Zeroable, DAG))
14519 return V;
14520
14521 // There are special ways we can lower some single-element blends.
14522 if (NumV2Inputs == 1)
14523 if (SDValue V = lowerShuffleAsElementInsertion(
14524 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14525 return V;
14526
14527 // We have different paths for blend lowering, but they all must use the
14528 // *exact* same predicate.
14529 bool IsBlendSupported = Subtarget.hasSSE41();
14530 if (IsBlendSupported)
14531 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14532 Zeroable, Subtarget, DAG))
14533 return Blend;
14534
14535 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14536 Zeroable, Subtarget, DAG))
14537 return Masked;
14538
14539 // Use dedicated unpack instructions for masks that match their pattern.
14540 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14541 return V;
14542
14543 // Use dedicated pack instructions for masks that match their pattern.
14544 if (SDValue V =
14545 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14546 return V;
14547
14548 // Try to lower using a truncation.
14549 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14550 Subtarget, DAG))
14551 return V;
14552
14553 // Try to use byte rotation instructions.
14554 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14555 Subtarget, DAG))
14556 return Rotate;
14557
14558 if (SDValue BitBlend =
14559 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14560 return BitBlend;
14561
14562 // Try to use byte shift instructions to mask.
14563 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14564 Zeroable, Subtarget, DAG))
14565 return V;
14566
14567 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
14568 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
14569 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14570 !Subtarget.hasVLX()) {
14571 // Check if this is part of a 256-bit vector truncation.
14572 unsigned PackOpc = 0;
14573 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14574 V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14575 V2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
14576 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14577 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14578 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14579 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14580 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14581 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14582 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14583 PackOpc = X86ISD::PACKUS;
14584 } else if (Subtarget.hasSSE41()) {
14585 SmallVector<SDValue, 4> DWordClearOps(4,
14586 DAG.getConstant(0, DL, MVT::i32));
14587 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14588 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14589 SDValue DWordClearMask =
14590 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14591 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14592 DWordClearMask);
14593 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14594 DWordClearMask);
14595 PackOpc = X86ISD::PACKUS;
14596 } else if (!Subtarget.hasSSSE3()) {
14597 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14598 V1 = DAG.getBitcast(MVT::v4i32, V1);
14599 V2 = DAG.getBitcast(MVT::v4i32, V2);
14600 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14601 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14602 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14603 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14604 PackOpc = X86ISD::PACKSS;
14605 }
14606 if (PackOpc) {
14607 // Now pack things back together.
14608 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14609 if (NumEvenDrops == 2) {
14610 Result = DAG.getBitcast(MVT::v4i32, Result);
14611 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14612 }
14613 return Result;
14614 }
14615 }
14616
14617 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14618 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14619 if (NumOddDrops == 1) {
14620 bool HasSSE41 = Subtarget.hasSSE41();
14621 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14622 DAG.getBitcast(MVT::v4i32, V1),
14623 DAG.getTargetConstant(16, DL, MVT::i8));
14624 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14625 DAG.getBitcast(MVT::v4i32, V2),
14626 DAG.getTargetConstant(16, DL, MVT::i8));
14627 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14628 MVT::v8i16, V1, V2);
14629 }
14630
14631 // Try to lower by permuting the inputs into an unpack instruction.
14632 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14633 Mask, Subtarget, DAG))
14634 return Unpack;
14635
14636 // If we can't directly blend but can use PSHUFB, that will be better as it
14637 // can both shuffle and set up the inefficient blend.
14638 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14639 bool V1InUse, V2InUse;
14640 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14641 Zeroable, DAG, V1InUse, V2InUse);
14642 }
14643
14644 // We can always bit-blend if we have to, so the fallback strategy is to
14645 // decompose into single-input permutes and blends/unpacks.
14646 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14647 Zeroable, Subtarget, DAG);
14648}
14649
14650/// Lower 8-lane 16-bit floating point shuffles.
14651static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14652 const APInt &Zeroable, SDValue V1, SDValue V2,
14653 const X86Subtarget &Subtarget,
14654 SelectionDAG &DAG) {
14655 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14656 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14657 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14658 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14659
14660 if (Subtarget.hasFP16()) {
14661 if (NumV2Elements == 0) {
14662 // Check for being able to broadcast a single element.
14663 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14664 Mask, Subtarget, DAG))
14665 return Broadcast;
14666 }
14667 if (NumV2Elements == 1 && Mask[0] >= 8)
14668 if (SDValue V = lowerShuffleAsElementInsertion(
14669 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14670 return V;
14671 }
14672
14673 V1 = DAG.getBitcast(MVT::v8i16, V1);
14674 V2 = DAG.getBitcast(MVT::v8i16, V2);
14675 return DAG.getBitcast(MVT::v8f16,
14676 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14677}
14678
14679// Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
14680// sub-512-bit shuffles are padded to 512 bits for the shuffle and then
14681// the active subvector is extracted.
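// As an illustrative sketch: a v16i8 shuffle on an AVX512VBMI target without
// VLX is widened to v64i8, mask indices that referred to the second input are
// rebased by the widening scale, a single VPERMV3 (or VPERMV for a unary
// shuffle) is emitted on the 512-bit type, and the low 128 bits of the result
// are extracted.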
14682static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14683 ArrayRef<int> OriginalMask, SDValue V1,
14684 SDValue V2, const X86Subtarget &Subtarget,
14685 SelectionDAG &DAG) {
14686 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14687 SmallVector<int, 32> Mask(OriginalMask);
14688 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14689 !isShuffleFoldableLoad(V2)) {
14690 ShuffleVectorSDNode::commuteMask(Mask);
14691 std::swap(V1, V2);
14692 }
14693
14694 MVT MaskVT = VT.changeTypeToInteger();
14695 SDValue MaskNode;
14696 MVT ShuffleVT = VT;
14697 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14698 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14699 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14700 ShuffleVT = V1.getSimpleValueType();
14701
14702 // Adjust mask to correct indices for the second input.
14703 int NumElts = VT.getVectorNumElements();
14704 unsigned Scale = 512 / VT.getSizeInBits();
14705 SmallVector<int, 32> AdjustedMask(Mask);
14706 for (int &M : AdjustedMask)
14707 if (NumElts <= M)
14708 M += (Scale - 1) * NumElts;
14709 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14710 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14711 } else {
14712 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14713 }
14714
14715 SDValue Result;
14716 if (V2.isUndef())
14717 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14718 else
14719 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14720
14721 if (VT != ShuffleVT)
14722 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14723
14724 return Result;
14725}
14726
14727/// Generic lowering of v16i8 shuffles.
14728///
14729/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14730/// detect any complexity reducing interleaving. If that doesn't help, it uses
14731/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14732/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14733/// back together.
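/// As an illustrative sketch: a duplication-friendly mask such as
/// <0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7> can be handled by widening via
/// duplication below, where it reduces to little more than an UNPCKLBW of the
/// input with itself rather than a PSHUFB.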
14734static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14735 const APInt &Zeroable, SDValue V1, SDValue V2,
14736 const X86Subtarget &Subtarget,
14737 SelectionDAG &DAG) {
14738 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14739 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14740 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14741
14742 // Try to use shift instructions.
14743 if (SDValue Shift =
14744 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14745 DAG, /*BitwiseOnly*/ false))
14746 return Shift;
14747
14748 // Try to use byte rotation instructions.
14749 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14750 Subtarget, DAG))
14751 return Rotate;
14752
14753 // Use dedicated pack instructions for masks that match their pattern.
14754 if (SDValue V =
14755 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14756 return V;
14757
14758 // Try to use a zext lowering.
14759 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14760 Zeroable, Subtarget, DAG))
14761 return ZExt;
14762
14763 // Try to lower using a truncation.
14764 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14765 Subtarget, DAG))
14766 return V;
14767
14768 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14769 Subtarget, DAG))
14770 return V;
14771
14772 // See if we can use SSE4A Extraction / Insertion.
14773 if (Subtarget.hasSSE4A())
14774 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14775 Zeroable, DAG))
14776 return V;
14777
14778 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14779
14780 // For single-input shuffles, there are some nicer lowering tricks we can use.
14781 if (NumV2Elements == 0) {
14782 // Check for being able to broadcast a single element.
14783 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14784 Mask, Subtarget, DAG))
14785 return Broadcast;
14786
14787 // Try to use bit rotation instructions.
14788 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14789 Subtarget, DAG))
14790 return Rotate;
14791
14792 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14793 return V;
14794
14795 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14796 // Notably, this handles splat and partial-splat shuffles more efficiently.
14797 // However, it only makes sense if the pre-duplication shuffle simplifies
14798 // things significantly. Currently, this means we need to be able to
14799 // express the pre-duplication shuffle as an i16 shuffle.
14800 //
14801 // FIXME: We should check for other patterns which can be widened into an
14802 // i16 shuffle as well.
14803 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14804 for (int i = 0; i < 16; i += 2)
14805 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14806 return false;
14807
14808 return true;
14809 };
14810 auto tryToWidenViaDuplication = [&]() -> SDValue {
14811 if (!canWidenViaDuplication(Mask))
14812 return SDValue();
14813 SmallVector<int, 4> LoInputs;
14814 copy_if(Mask, std::back_inserter(LoInputs),
14815 [](int M) { return M >= 0 && M < 8; });
14816 array_pod_sort(LoInputs.begin(), LoInputs.end());
14817 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14818 SmallVector<int, 4> HiInputs;
14819 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14820 array_pod_sort(HiInputs.begin(), HiInputs.end());
14821 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14822
14823 bool TargetLo = LoInputs.size() >= HiInputs.size();
14824 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14825 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14826
14827 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14828 SmallDenseMap<int, int, 8> LaneMap;
14829 for (int I : InPlaceInputs) {
14830 PreDupI16Shuffle[I/2] = I/2;
14831 LaneMap[I] = I;
14832 }
14833 int j = TargetLo ? 0 : 4, je = j + 4;
14834 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14835 // Check if j is already a shuffle of this input. This happens when
14836 // there are two adjacent bytes after we move the low one.
14837 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14838 // If we haven't yet mapped the input, search for a slot into which
14839 // we can map it.
14840 while (j < je && PreDupI16Shuffle[j] >= 0)
14841 ++j;
14842
14843 if (j == je)
14844 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14845 return SDValue();
14846
14847 // Map this input with the i16 shuffle.
14848 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14849 }
14850
14851 // Update the lane map based on the mapping we ended up with.
14852 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14853 }
14854 V1 = DAG.getBitcast(
14855 MVT::v16i8,
14856 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14857 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14858
14859 // Unpack the bytes to form the i16s that will be shuffled into place.
14860 bool EvenInUse = false, OddInUse = false;
14861 for (int i = 0; i < 16; i += 2) {
14862 EvenInUse |= (Mask[i + 0] >= 0);
14863 OddInUse |= (Mask[i + 1] >= 0);
14864 if (EvenInUse && OddInUse)
14865 break;
14866 }
14867 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14868 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14869 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14870
14871 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14872 for (int i = 0; i < 16; ++i)
14873 if (Mask[i] >= 0) {
14874 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14875 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14876 if (PostDupI16Shuffle[i / 2] < 0)
14877 PostDupI16Shuffle[i / 2] = MappedMask;
14878 else
14879 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14880 "Conflicting entries in the original shuffle!");
14881 }
14882 return DAG.getBitcast(
14883 MVT::v16i8,
14884 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14885 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14886 };
14887 if (SDValue V = tryToWidenViaDuplication())
14888 return V;
14889 }
14890
14891 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14892 Zeroable, Subtarget, DAG))
14893 return Masked;
14894
14895 // Use dedicated unpack instructions for masks that match their pattern.
14896 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14897 return V;
14898
14899 // Try to use byte shift instructions to mask.
14900 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14901 Zeroable, Subtarget, DAG))
14902 return V;
14903
14904 // Check for compaction patterns.
14905 bool IsSingleInput = V2.isUndef();
14906 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14907
14908 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14909 // with PSHUFB. It is important to do this before we attempt to generate any
14910 // blends but after all of the single-input lowerings. If the single input
14911 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14912 // want to preserve that and we can DAG combine any longer sequences into
14913 // a PSHUFB in the end. But once we start blending from multiple inputs,
14914 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14915 // and there are *very* few patterns that would actually be faster than the
14916 // PSHUFB approach because of its ability to zero lanes.
14917 //
14918 // If the mask is a binary compaction, we can more efficiently perform this
14919 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14920 //
14921 // FIXME: The only exceptions to the above are blends which are exact
14922 // interleavings with direct instructions supporting them. We currently don't
14923 // handle those well here.
14924 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14925 bool V1InUse = false;
14926 bool V2InUse = false;
14927
14928 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14929 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14930
14931 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14932 // do so. This avoids using them to handle blends-with-zero which is
14933 // important as a single pshufb is significantly faster for that.
14934 if (V1InUse && V2InUse) {
14935 if (Subtarget.hasSSE41())
14936 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14937 Zeroable, Subtarget, DAG))
14938 return Blend;
14939
14940 // We can use an unpack to do the blending rather than an or in some
14941 // cases. Even though the or may be (very slightly) more efficient, we
14942 // prefer this lowering because there are common cases where part of
14943 // the complexity of the shuffles goes away when we do the final blend as
14944 // an unpack.
14945 // FIXME: It might be worth trying to detect if the unpack-feeding
14946 // shuffles will both be pshufb, in which case we shouldn't bother with
14947 // this.
14948 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14949 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14950 return Unpack;
14951
14952 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14953 if (Subtarget.hasVBMI())
14954 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14955 DAG);
14956
14957 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14958 if (Subtarget.hasXOP()) {
14959 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14960 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14961 }
14962
14963 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14964 // PALIGNR will be cheaper than the second PSHUFB+OR.
14965 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14966 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14967 return V;
14968 }
14969
14970 return PSHUFB;
14971 }
14972
14973 // There are special ways we can lower some single-element blends.
14974 if (NumV2Elements == 1)
14975 if (SDValue V = lowerShuffleAsElementInsertion(
14976 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14977 return V;
14978
14979 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14980 return Blend;
14981
14982 // Check whether a compaction lowering can be done. This handles shuffles
14983 // which take every Nth element for some even N. See the helper function for
14984 // details.
14985 //
14986 // We special case these as they can be particularly efficiently handled with
14987 // the PACKUSWB instruction on x86 and they show up in common patterns of
14988 // rearranging bytes to truncate wide elements.
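  // As an illustrative sketch: for the binary compaction mask
  // <0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30>, NumEvenDrops is 1, each
  // input is ANDed with a 0x00FF word mask to clear the odd bytes, and a
  // single PACKUSWB then packs the remaining low bytes of V1 and V2 into the
  // result.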
14989 if (NumEvenDrops) {
14990 // NumEvenDrops is the log2 of the power-of-two stride of the elements.
14991 // Another way of thinking about it is that we need to drop the even
14992 // elements this many times to get the original input.
14993
14994 // First we need to zero all the dropped bytes.
14995 assert(NumEvenDrops <= 3 &&
14996 "No support for dropping even elements more than 3 times.");
14997 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14998 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14999 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15000 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15001 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15002 WordClearMask);
15003 if (!IsSingleInput)
15004 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15005 WordClearMask);
15006
15007 // Now pack things back together.
15008 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15009 IsSingleInput ? V1 : V2);
15010 for (int i = 1; i < NumEvenDrops; ++i) {
15011 Result = DAG.getBitcast(MVT::v8i16, Result);
15012 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15013 }
15014 return Result;
15015 }
15016
15017 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
15018 if (NumOddDrops == 1) {
15019 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15020 DAG.getBitcast(MVT::v8i16, V1),
15021 DAG.getTargetConstant(8, DL, MVT::i8));
15022 if (!IsSingleInput)
15023 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15024 DAG.getBitcast(MVT::v8i16, V2),
15025 DAG.getTargetConstant(8, DL, MVT::i8));
15026 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15027 IsSingleInput ? V1 : V2);
15028 }
15029
15030 // Handle multi-input cases by blending/unpacking single-input shuffles.
15031 if (NumV2Elements > 0)
15032 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15033 Zeroable, Subtarget, DAG);
15034
15035 // The fallback path for single-input shuffles widens this into two v8i16
15036 // vectors with unpacks, shuffles those, and then pulls them back together
15037 // with a pack.
15038 SDValue V = V1;
15039
15040 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15041 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15042 for (int i = 0; i < 16; ++i)
15043 if (Mask[i] >= 0)
15044 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15045
15046 SDValue VLoHalf, VHiHalf;
15047 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15048 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15049 // i16s.
15050 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15051 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15052 // Use a mask to drop the high bytes.
15053 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15054 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15055 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15056
15057 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15058 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15059
15060 // Squash the masks to point directly into VLoHalf.
15061 for (int &M : LoBlendMask)
15062 if (M >= 0)
15063 M /= 2;
15064 for (int &M : HiBlendMask)
15065 if (M >= 0)
15066 M /= 2;
15067 } else {
15068 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15069 // VHiHalf so that we can blend them as i16s.
15070 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15071
15072 VLoHalf = DAG.getBitcast(
15073 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15074 VHiHalf = DAG.getBitcast(
15075 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15076 }
15077
15078 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15079 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15080
15081 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15082}
15083
15084/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15085///
15086/// This routine breaks down the specific type of 128-bit shuffle and
15087/// dispatches to the lowering routines accordingly.
15088static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15089 MVT VT, SDValue V1, SDValue V2,
15090 const APInt &Zeroable,
15091 const X86Subtarget &Subtarget,
15092 SelectionDAG &DAG) {
15093 if (VT == MVT::v8bf16) {
15094 V1 = DAG.getBitcast(MVT::v8i16, V1);
15095 V2 = DAG.getBitcast(MVT::v8i16, V2);
15096 return DAG.getBitcast(VT,
15097 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15098 }
15099
15100 switch (VT.SimpleTy) {
15101 case MVT::v2i64:
15102 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15103 case MVT::v2f64:
15104 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15105 case MVT::v4i32:
15106 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15107 case MVT::v4f32:
15108 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15109 case MVT::v8i16:
15110 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15111 case MVT::v8f16:
15112 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15113 case MVT::v16i8:
15114 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15115
15116 default:
15117 llvm_unreachable("Unimplemented!");
15118 }
15119}
15120
15121/// Generic routine to split vector shuffle into half-sized shuffles.
15122///
15123/// This routine just extracts two subvectors, shuffles them independently, and
15124/// then concatenates them back together. This should work effectively with all
15125/// AVX vector shuffle types.
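/// As an illustrative sketch: a v8i32 mask such as <1,0,3,2,13,12,15,14>
/// splits into a low-half v4i32 shuffle <1,0,3,2> of the low half of V1 and a
/// high-half v4i32 shuffle <1,0,3,2> of the high half of V2, which are then
/// rejoined with CONCAT_VECTORS.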
15126static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15127 SDValue V2, ArrayRef<int> Mask,
15128 SelectionDAG &DAG, bool SimpleOnly) {
15129 assert(VT.getSizeInBits() >= 256 &&
15130 "Only for 256-bit or wider vector shuffles!");
15131 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15132 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15133
15134 // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
15135 if (VT == MVT::v8f32) {
15136 SDValue BC1 = peekThroughBitcasts(V1);
15137 SDValue BC2 = peekThroughBitcasts(V2);
15138 if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
15139 if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
15140 DAG, SimpleOnly))
15141 return DAG.getBitcast(VT, Split);
15142 }
15143 }
15144
15145 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15146 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15147
15148 int NumElements = VT.getVectorNumElements();
15149 int SplitNumElements = NumElements / 2;
15150 MVT ScalarVT = VT.getVectorElementType();
15151 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15152
15153 // Use splitVector/extractSubVector so that split build-vectors just build two
15154 // narrower build vectors. This helps shuffling with splats and zeros.
15155 auto SplitVector = [&](SDValue V) {
15156 SDValue LoV, HiV;
15157 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15158 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15159 DAG.getBitcast(SplitVT, HiV));
15160 };
15161
15162 SDValue LoV1, HiV1, LoV2, HiV2;
15163 std::tie(LoV1, HiV1) = SplitVector(V1);
15164 std::tie(LoV2, HiV2) = SplitVector(V2);
15165
15166 // Now create two 4-way blends of these half-width vectors.
15167 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
15168 bool &UseHiV1, bool &UseLoV2,
15169 bool &UseHiV2) {
15170 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
15171 for (int i = 0; i < SplitNumElements; ++i) {
15172 int M = HalfMask[i];
15173 if (M >= NumElements) {
15174 if (M >= NumElements + SplitNumElements)
15175 UseHiV2 = true;
15176 else
15177 UseLoV2 = true;
15178 } else if (M >= 0) {
15179 if (M >= SplitNumElements)
15180 UseHiV1 = true;
15181 else
15182 UseLoV1 = true;
15183 }
15184 }
15185 };
15186
15187 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
15188 if (!SimpleOnly)
15189 return true;
15190
15191 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15192 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15193
15194 return !(UseHiV1 || UseHiV2);
15195 };
15196
15197 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15198 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15199 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15200 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15201 for (int i = 0; i < SplitNumElements; ++i) {
15202 int M = HalfMask[i];
15203 if (M >= NumElements) {
15204 V2BlendMask[i] = M - NumElements;
15205 BlendMask[i] = SplitNumElements + i;
15206 } else if (M >= 0) {
15207 V1BlendMask[i] = M;
15208 BlendMask[i] = i;
15209 }
15210 }
15211
15212 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15213 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15214
15215 // Because the lowering happens after all combining takes place, we need to
15216 // manually combine these blend masks as much as possible so that we create
15217 // a minimal number of high-level vector shuffle nodes.
15218 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
15219
15220 // First try just blending the halves of V1 or V2.
15221 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15222 return DAG.getUNDEF(SplitVT);
15223 if (!UseLoV2 && !UseHiV2)
15224 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15225 if (!UseLoV1 && !UseHiV1)
15226 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15227
15228 SDValue V1Blend, V2Blend;
15229 if (UseLoV1 && UseHiV1) {
15230 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15231 } else {
15232 // We only use half of V1 so map the usage down into the final blend mask.
15233 V1Blend = UseLoV1 ? LoV1 : HiV1;
15234 for (int i = 0; i < SplitNumElements; ++i)
15235 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15236 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15237 }
15238 if (UseLoV2 && UseHiV2) {
15239 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15240 } else {
15241 // We only use half of V2 so map the usage down into the final blend mask.
15242 V2Blend = UseLoV2 ? LoV2 : HiV2;
15243 for (int i = 0; i < SplitNumElements; ++i)
15244 if (BlendMask[i] >= SplitNumElements)
15245 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15246 }
15247 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15248 };
15249
15250 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
15251 return SDValue();
15252
15253 SDValue Lo = HalfBlend(LoMask);
15254 SDValue Hi = HalfBlend(HiMask);
15255 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15256}
15257
15258/// Either split a vector in halves or decompose the shuffles and the
15259/// blend/unpack.
15260///
15261/// This is provided as a good fallback for many lowerings of non-single-input
15262/// shuffles with more than one 128-bit lane. In those cases, we want to select
15263/// between splitting the shuffle into 128-bit components and stitching those
15264/// back together vs. extracting the single-input shuffles and blending those
15265/// results.
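/// As an illustrative sketch: for a v8f32 mask such as <0,8,0,8,0,8,0,8>,
/// every V1 element and every V2 element come from a single index, so the
/// shuffle is decomposed into two broadcasts followed by a blend instead of
/// being split into 128-bit halves.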
15266static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15267 SDValue V2, ArrayRef<int> Mask,
15268 const APInt &Zeroable,
15269 const X86Subtarget &Subtarget,
15270 SelectionDAG &DAG) {
15271 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15272 "shuffles as it could then recurse on itself.");
15273 int Size = Mask.size();
15274
15275 // If this can be modeled as a broadcast of two elements followed by a blend,
15276 // prefer that lowering. This is especially important because broadcasts can
15277 // often fold with memory operands.
15278 auto DoBothBroadcast = [&] {
15279 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15280 for (int M : Mask)
15281 if (M >= Size) {
15282 if (V2BroadcastIdx < 0)
15283 V2BroadcastIdx = M - Size;
15284 else if ((M - Size) != V2BroadcastIdx &&
15285 !IsElementEquivalent(Size, V2, V2, M - Size, V2BroadcastIdx))
15286 return false;
15287 } else if (M >= 0) {
15288 if (V1BroadcastIdx < 0)
15289 V1BroadcastIdx = M;
15290 else if (M != V1BroadcastIdx &&
15291 !IsElementEquivalent(Size, V1, V1, M, V1BroadcastIdx))
15292 return false;
15293 }
15294 return true;
15295 };
15296 if (DoBothBroadcast())
15297 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15298 Subtarget, DAG);
15299
15300 // If the inputs all stem from a single 128-bit lane of each input, then we
15301 // split them rather than blending because the split will decompose to
15302 // unusually few instructions.
15303 int LaneCount = VT.getSizeInBits() / 128;
15304 int LaneSize = Size / LaneCount;
15305 SmallBitVector LaneInputs[2];
15306 LaneInputs[0].resize(LaneCount, false);
15307 LaneInputs[1].resize(LaneCount, false);
15308 for (int i = 0; i < Size; ++i)
15309 if (Mask[i] >= 0)
15310 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15311 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15312 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15313 /*SimpleOnly*/ false);
15314
15315 // Without AVX2, if we can freely split the subvectors then we're better off
15316 // performing half width shuffles.
15317 if (!Subtarget.hasAVX2()) {
15318 SDValue BC1 = peekThroughBitcasts(V1);
15319 SDValue BC2 = peekThroughBitcasts(V2);
15320 bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
15321 DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
15322 bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
15323 DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
15324 if (SplatOrSplitV1 && SplatOrSplitV2)
15325 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15326 /*SimpleOnly*/ false);
15327 }
15328
15329 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15330 // requires that the decomposed single-input shuffles don't end up here.
15331 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15332 Subtarget, DAG);
15333}
15334
15335// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15336// TODO: Extend to support v8f32 (+ 512-bit shuffles).
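// As an illustrative sketch: for a v4f64 mask such as <2,7,1,4>, the
// even-position elements are gathered into one lane-shuffled LHS vector and
// the odd-position elements into an RHS vector, and a final SHUFPD with
// immediate mask {0,1,1,0} selects the required element from each lane.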
15337static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15338 SDValue V1, SDValue V2,
15339 ArrayRef<int> Mask,
15340 SelectionDAG &DAG) {
15341 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15342
15343 int LHSMask[4] = {-1, -1, -1, -1};
15344 int RHSMask[4] = {-1, -1, -1, -1};
15345 int SHUFPDMask[4] = {-1, -1, -1, -1};
15346
15347 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15348 // perform the shuffle once the lanes have been shuffled in place.
15349 for (int i = 0; i != 4; ++i) {
15350 int M = Mask[i];
15351 if (M < 0)
15352 continue;
15353 int LaneBase = i & ~1;
15354 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15355 LaneMask[LaneBase + (M & 1)] = M;
15356 SHUFPDMask[i] = M & 1;
15357 }
15358
15359 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15360 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15361 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15362 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15363}
15364
15365/// Lower a vector shuffle crossing multiple 128-bit lanes as
15366/// a lane permutation followed by a per-lane permutation.
15367///
15368/// This is mainly for cases where we can have non-repeating permutes
15369/// in each lane.
15370///
15371/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15372/// we should investigate merging them.
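/// As an illustrative sketch: a unary v8f32 reverse mask <7,6,5,4,3,2,1,0> can
/// be lowered here as a cross-lane permute that swaps the two 128-bit lanes
/// (<4,5,6,7,0,1,2,3>) followed by the in-lane permute <3,2,1,0,7,6,5,4>.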
15373static SDValue lowerShuffleAsLanePermuteAndPermute(
15374 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15375 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15376 int NumElts = VT.getVectorNumElements();
15377 int NumLanes = VT.getSizeInBits() / 128;
15378 int NumEltsPerLane = NumElts / NumLanes;
15379 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15380
15381 /// Attempts to find a sublane permute with the given size
15382 /// that gets all elements into their target lanes.
15383 ///
15384 /// If successful, fills CrossLaneMask and InLaneMask and returns the result.
15385 /// If unsuccessful, returns SDValue() and may overwrite InLaneMask.
15386 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15387 int NumSublanesPerLane = NumSublanes / NumLanes;
15388 int NumEltsPerSublane = NumElts / NumSublanes;
15389
15390 SmallVector<int, 16> CrossLaneMask;
15391 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15392 // CrossLaneMask but one entry == one sublane.
15393 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15394 APInt DemandedCrossLane = APInt::getZero(NumElts);
15395
15396 for (int i = 0; i != NumElts; ++i) {
15397 int M = Mask[i];
15398 if (M < 0)
15399 continue;
15400
15401 int SrcSublane = M / NumEltsPerSublane;
15402 int DstLane = i / NumEltsPerLane;
15403
15404 // We only need to get the elements into the right lane, not sublane.
15405 // So search all sublanes that make up the destination lane.
15406 bool Found = false;
15407 int DstSubStart = DstLane * NumSublanesPerLane;
15408 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15409 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15410 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15411 continue;
15412
15413 Found = true;
15414 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15415 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15416 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15417 DemandedCrossLane.setBit(InLaneMask[i]);
15418 break;
15419 }
15420 if (!Found)
15421 return SDValue();
15422 }
15423
15424 // Fill CrossLaneMask using CrossLaneMaskLarge.
15425 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15426
15427 if (!CanUseSublanes) {
15428 // If we're only shuffling a single lowest lane and the rest are identity
15429 // then don't bother.
15430 // TODO - isShuffleMaskInputInPlace could be extended to something like
15431 // this.
15432 int NumIdentityLanes = 0;
15433 bool OnlyShuffleLowestLane = true;
15434 for (int i = 0; i != NumLanes; ++i) {
15435 int LaneOffset = i * NumEltsPerLane;
15436 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15437 i * NumEltsPerLane))
15438 NumIdentityLanes++;
15439 else if (CrossLaneMask[LaneOffset] != 0)
15440 OnlyShuffleLowestLane = false;
15441 }
15442 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15443 return SDValue();
15444 }
15445
15446 // Simplify CrossLaneMask based on the actual demanded elements.
15447 if (V1.hasOneUse())
15448 for (int i = 0; i != NumElts; ++i)
15449 if (!DemandedCrossLane[i])
15450 CrossLaneMask[i] = SM_SentinelUndef;
15451
15452 // Avoid returning the same shuffle operation. For example,
15453 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15454 // undef:v16i16
15455 if (CrossLaneMask == Mask || InLaneMask == Mask)
15456 return SDValue();
15457
15458 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15459 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15460 InLaneMask);
15461 };
15462
15463 // First attempt a solution with full lanes.
15464 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15465 return V;
15466
15467 // The rest of the solutions use sublanes.
15468 if (!CanUseSublanes)
15469 return SDValue();
15470
15471 // Then attempt a solution with 64-bit sublanes (vpermq).
15472 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15473 return V;
15474
15475 // If that doesn't work and we have fast variable cross-lane shuffle,
15476 // attempt 32-bit sublanes (vpermd).
15477 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15478 return SDValue();
15479
15480 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15481}
15482
15483/// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
15484static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15485 SmallVector<int> &InLaneMask) {
15486 int Size = Mask.size();
15487 InLaneMask.assign(Mask.begin(), Mask.end());
15488 for (int i = 0; i < Size; ++i) {
15489 int &M = InLaneMask[i];
15490 if (M < 0)
15491 continue;
15492 if (((M % Size) / LaneSize) != (i / LaneSize))
15493 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15494 }
15495}
15496
15497/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15498/// source with a lane permutation.
15499///
15500/// This lowering strategy results in four instructions in the worst case for a
15501/// single-input cross lane shuffle which is lower than any other fully general
15502/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15503/// shuffle pattern should be handled prior to trying this lowering.
15504 static SDValue lowerShuffleAsLanePermuteAndShuffle(
15505 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15506 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15507 // FIXME: This should probably be generalized for 512-bit vectors as well.
15508 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15509 int Size = Mask.size();
15510 int LaneSize = Size / 2;
15511
15512 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15513 // Only do this if the elements aren't all from the lower lane,
15514 // otherwise we're (probably) better off doing a split.
15515 if (VT == MVT::v4f64 &&
15516 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15517 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
15518
15519 // If there are only inputs from one 128-bit lane, splitting will in fact be
15520 // less expensive. The flags track whether the given lane contains an element
15521 // that crosses to another lane.
15522 bool AllLanes;
15523 if (!Subtarget.hasAVX2()) {
15524 bool LaneCrossing[2] = {false, false};
15525 for (int i = 0; i < Size; ++i)
15526 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15527 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15528 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15529 } else {
15530 bool LaneUsed[2] = {false, false};
15531 for (int i = 0; i < Size; ++i)
15532 if (Mask[i] >= 0)
15533 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15534 AllLanes = LaneUsed[0] && LaneUsed[1];
15535 }
15536
15537 // TODO - we could support shuffling V2 in the Flipped input.
15538 assert(V2.isUndef() &&
15539 "This last part of this routine only works on single input shuffles");
15540
15541 SmallVector<int> InLaneMask;
15542 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15543
15544 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15545 "In-lane shuffle mask expected");
15546
15547 // If we're not using both lanes in each lane and the inlane mask is not
15548 // repeating, then we're better off splitting.
15549 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
15550 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15551 /*SimpleOnly*/ false);
15552
15553 // Flip the lanes, and shuffle the results which should now be in-lane.
15554 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15555 SDValue Flipped = DAG.getBitcast(PVT, V1);
15556 Flipped =
15557 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15558 Flipped = DAG.getBitcast(VT, Flipped);
15559 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15560}
15561
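// Illustrative sketch (not part of the original source): a scalar simulation
// of the flip-then-shuffle sequence above for a v4f64-style single-input
// shuffle. The 128-bit lanes are modelled as index pairs {0,1} and {2,3};
// the function name and the zero used for undef lanes are assumptions.
static void exampleLanePermuteAndShuffle(const double V1[4], const int Mask[4],
                                         double Out[4]) {
  // Step 1: flip the two 128-bit lanes, i.e. the fixed <2,3,0,1> shuffle.
  const double Flipped[4] = {V1[2], V1[3], V1[0], V1[1]};
  // Step 2: apply the in-lane mask, reading cross-lane elements from Flipped.
  for (int i = 0; i < 4; ++i) {
    int M = Mask[i];
    int Lane = i / 2;
    if (M < 0)
      Out[i] = 0.0; // undef lane, value does not matter
    else if ((M / 2) != Lane)
      Out[i] = Flipped[(M % 2) + Lane * 2];
    else
      Out[i] = V1[M];
  }
}
// e.g. Mask <2,1,0,3> reads element 2 via Flipped in lane 0 and element 0 via
// Flipped in lane 1, so no access crosses a lane boundary after the flip.
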
15562/// Handle lowering 2-lane 128-bit shuffles.
15563 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15564 SDValue V2, ArrayRef<int> Mask,
15565 const APInt &Zeroable,
15566 const X86Subtarget &Subtarget,
15567 SelectionDAG &DAG) {
15568 if (V2.isUndef()) {
15569 // Attempt to match VBROADCAST*128 subvector broadcast load.
15570 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
15571 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
15572 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
15573 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
15574 MVT MemVT = VT.getHalfNumVectorElementsVT();
15575 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
15576 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
15577 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
15578 VT, MemVT, Ld, Ofs, DAG))
15579 return BcstLd;
15580 }
15581
15582 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15583 if (Subtarget.hasAVX2())
15584 return SDValue();
15585 }
15586
15587 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15588
15589 SmallVector<int, 4> WidenedMask;
15590 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15591 return SDValue();
15592
15593 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15594 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15595
15596 // Try to use an insert into a zero vector.
15597 if (WidenedMask[0] == 0 && IsHighZero) {
15598 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15599 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15600 DAG.getVectorIdxConstant(0, DL));
15601 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15602 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15603 DAG.getVectorIdxConstant(0, DL));
15604 }
15605
15606 // TODO: If minimizing size and one of the inputs is a zero vector and
15607 // the zero vector has only one use, we could use a VPERM2X128 to save the
15608 // instruction bytes needed to explicitly generate the zero vector.
15609
15610 // Blends are faster and handle all the non-lane-crossing cases.
15611 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15612 Subtarget, DAG))
15613 return Blend;
15614
15615 // If either input operand is a zero vector, use VPERM2X128 because its mask
15616 // allows us to replace the zero input with an implicit zero.
15617 if (!IsLowZero && !IsHighZero) {
15618 // Check for patterns which can be matched with a single insert of a 128-bit
15619 // subvector.
15620 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15621 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15622
15623 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15624 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15625 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
15626 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15627 SDValue SubVec =
15628 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
15629 DAG.getVectorIdxConstant(0, DL));
15630 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15631 DAG.getVectorIdxConstant(2, DL));
15632 }
15633 }
15634
15635 // Try to use SHUF128 if possible.
15636 if (Subtarget.hasVLX()) {
15637 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15638 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15639 ((WidenedMask[1] % 2) << 1);
15640 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15641 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15642 }
15643 }
15644 }
15645
15646 // Otherwise form a 128-bit permutation. After accounting for undefs,
15647 // convert the 64-bit shuffle mask selection values into 128-bit
15648 // selection bits by dividing the indexes by 2 and shifting into positions
15649 // defined by a vperm2*128 instruction's immediate control byte.
15650
15651 // The immediate permute control byte looks like this:
15652 // [1:0] - select 128 bits from sources for low half of destination
15653 // [2] - ignore
15654 // [3] - zero low half of destination
15655 // [5:4] - select 128 bits from sources for high half of destination
15656 // [6] - ignore
15657 // [7] - zero high half of destination
15658
15659 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15660 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15661
15662 unsigned PermMask = 0;
15663 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15664 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15665
15666 // Check the immediate mask and replace unused sources with undef.
15667 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15668 V1 = DAG.getUNDEF(VT);
15669 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15670 V2 = DAG.getUNDEF(VT);
15671
15672 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15673 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15674}
15675
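// Illustrative sketch (not part of the original source): how the VPERM2X128
// immediate used above is assembled from the widened 2-element mask and the
// zeroable flags. Each widened value selects one 128-bit half: 0 = lo(V1),
// 1 = hi(V1), 2 = lo(V2), 3 = hi(V2). The helper name is hypothetical.
static unsigned exampleVPerm2X128Imm(int WidenedLo, int WidenedHi,
                                     bool IsLowZero, bool IsHighZero) {
  unsigned PermMask = 0;
  PermMask |= IsLowZero ? 0x08 : unsigned(WidenedLo) << 0;
  PermMask |= IsHighZero ? 0x80 : unsigned(WidenedHi) << 4;
  return PermMask;
}
// e.g. the v4x64 mask <2,3,4,5> widens to {1,2} and encodes as 0x21, i.e.
// "high half of V1 into the low half, low half of V2 into the high half".
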
15676/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15677/// shuffling each lane.
15678///
15679/// This attempts to create a repeated lane shuffle where each lane uses one
15680/// or two of the lanes of the inputs. The lanes of the input vectors are
15681/// shuffled in one or two independent shuffles to get the lanes into the
15682/// position needed by the final shuffle.
15683 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15684 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15685 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15686 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15687
15688 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15689 return SDValue();
15690
15691 int NumElts = Mask.size();
15692 int NumLanes = VT.getSizeInBits() / 128;
15693 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15694 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15695 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15696
15697 // First pass will try to fill in the RepeatMask from lanes that need two
15698 // sources.
15699 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15700 int Srcs[2] = {-1, -1};
15701 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15702 for (int i = 0; i != NumLaneElts; ++i) {
15703 int M = Mask[(Lane * NumLaneElts) + i];
15704 if (M < 0)
15705 continue;
15706 // Determine which of the possible input lanes (NumLanes from each source)
15707 // this element comes from. Assign that as one of the sources for this
15708 // lane. We can assign up to 2 sources for this lane. If we run out of
15709 // sources we can't do anything.
15710 int LaneSrc = M / NumLaneElts;
15711 int Src;
15712 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15713 Src = 0;
15714 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15715 Src = 1;
15716 else
15717 return SDValue();
15718
15719 Srcs[Src] = LaneSrc;
15720 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15721 }
15722
15723 // If this lane has two sources, see if it fits with the repeat mask so far.
15724 if (Srcs[1] < 0)
15725 continue;
15726
15727 LaneSrcs[Lane][0] = Srcs[0];
15728 LaneSrcs[Lane][1] = Srcs[1];
15729
15730 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15731 assert(M1.size() == M2.size() && "Unexpected mask size");
15732 for (int i = 0, e = M1.size(); i != e; ++i)
15733 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15734 return false;
15735 return true;
15736 };
15737
15738 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15739 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15740 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15741 int M = Mask[i];
15742 if (M < 0)
15743 continue;
15744 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15745 "Unexpected mask element");
15746 MergedMask[i] = M;
15747 }
15748 };
15749
15750 if (MatchMasks(InLaneMask, RepeatMask)) {
15751 // Merge this lane mask into the final repeat mask.
15752 MergeMasks(InLaneMask, RepeatMask);
15753 continue;
15754 }
15755
15756 // Didn't find a match. Swap the operands and try again.
15757 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15758 ShuffleVectorSDNode::commuteShuffleMask(InLaneMask, NumElts);
15759
15760 if (MatchMasks(InLaneMask, RepeatMask)) {
15761 // Merge this lane mask into the final repeat mask.
15762 MergeMasks(InLaneMask, RepeatMask);
15763 continue;
15764 }
15765
15766 // Couldn't find a match with the operands in either order.
15767 return SDValue();
15768 }
15769
15770 // Now handle any lanes with only one source.
15771 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15772 // If this lane has already been processed, skip it.
15773 if (LaneSrcs[Lane][0] >= 0)
15774 continue;
15775
15776 for (int i = 0; i != NumLaneElts; ++i) {
15777 int M = Mask[(Lane * NumLaneElts) + i];
15778 if (M < 0)
15779 continue;
15780
15781 // If RepeatMask isn't defined yet we can define it ourselves.
15782 if (RepeatMask[i] < 0)
15783 RepeatMask[i] = M % NumLaneElts;
15784
15785 if (RepeatMask[i] < NumElts) {
15786 if (RepeatMask[i] != M % NumLaneElts)
15787 return SDValue();
15788 LaneSrcs[Lane][0] = M / NumLaneElts;
15789 } else {
15790 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15791 return SDValue();
15792 LaneSrcs[Lane][1] = M / NumLaneElts;
15793 }
15794 }
15795
15796 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15797 return SDValue();
15798 }
15799
15800 SmallVector<int, 16> NewMask(NumElts, -1);
15801 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15802 int Src = LaneSrcs[Lane][0];
15803 for (int i = 0; i != NumLaneElts; ++i) {
15804 int M = -1;
15805 if (Src >= 0)
15806 M = Src * NumLaneElts + i;
15807 NewMask[Lane * NumLaneElts + i] = M;
15808 }
15809 }
15810 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15811 // Ensure we didn't get back the shuffle we started with.
15812 // FIXME: This is a hack to make up for some splat handling code in
15813 // getVectorShuffle.
15814 if (isa<ShuffleVectorSDNode>(NewV1) &&
15815 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15816 return SDValue();
15817
15818 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15819 int Src = LaneSrcs[Lane][1];
15820 for (int i = 0; i != NumLaneElts; ++i) {
15821 int M = -1;
15822 if (Src >= 0)
15823 M = Src * NumLaneElts + i;
15824 NewMask[Lane * NumLaneElts + i] = M;
15825 }
15826 }
15827 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15828 // Ensure we didn't get back the shuffle we started with.
15829 // FIXME: This is a hack to make up for some splat handling code in
15830 // getVectorShuffle.
15831 if (isa<ShuffleVectorSDNode>(NewV2) &&
15832 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15833 return SDValue();
15834
15835 for (int i = 0; i != NumElts; ++i) {
15836 if (Mask[i] < 0) {
15837 NewMask[i] = -1;
15838 continue;
15839 }
15840 NewMask[i] = RepeatMask[i % NumLaneElts];
15841 if (NewMask[i] < 0)
15842 continue;
15843
15844 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15845 }
15846 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15847}
15848
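// Worked example (illustrative, not part of the original source): the v8f32
// two-input mask <4,12,5,13, 0,8,1,9> is not 128-bit lane repeated, but each
// lane interleaves one half of V1 with the matching half of V2, so the loop
// above finds the repeat mask <0,8,1,9> with lane sources {hiV1,hiV2} and
// {loV1,loV2}. The emitted sequence is then:
//   NewV1 = shuffle(V1, V2, <4,5,6,7, 0,1,2,3>)         ; swap V1's lanes
//   NewV2 = shuffle(V1, V2, <12,13,14,15, 8,9,10,11>)   ; swap V2's lanes
//   Res   = shuffle(NewV1, NewV2, <0,8,1,9, 4,12,5,13>) ; lane-repeated mask
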
15849/// If the input shuffle mask results in a vector that is undefined in all upper
15850/// or lower half elements and that mask accesses only 2 halves of the
15851/// shuffle's operands, return true. A mask of half the width with mask indexes
15852/// adjusted to access the extracted halves of the original shuffle operands is
15853 /// returned in HalfMask. HalfIdx1 and HalfIdx2 identify which half (lower or
15854 /// upper) of each input operand is accessed.
15855static bool
15856 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15857 int &HalfIdx1, int &HalfIdx2) {
15858 assert((Mask.size() == HalfMask.size() * 2) &&
15859 "Expected input mask to be twice as long as output");
15860
15861 // Exactly one half of the result must be undef to allow narrowing.
15862 bool UndefLower = isUndefLowerHalf(Mask);
15863 bool UndefUpper = isUndefUpperHalf(Mask);
15864 if (UndefLower == UndefUpper)
15865 return false;
15866
15867 unsigned HalfNumElts = HalfMask.size();
15868 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15869 HalfIdx1 = -1;
15870 HalfIdx2 = -1;
15871 for (unsigned i = 0; i != HalfNumElts; ++i) {
15872 int M = Mask[i + MaskIndexOffset];
15873 if (M < 0) {
15874 HalfMask[i] = M;
15875 continue;
15876 }
15877
15878 // Determine which of the 4 half vectors this element is from.
15879 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15880 int HalfIdx = M / HalfNumElts;
15881
15882 // Determine the element index into its half vector source.
15883 int HalfElt = M % HalfNumElts;
15884
15885 // We can shuffle with up to 2 half vectors, set the new 'half'
15886 // shuffle mask accordingly.
15887 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15888 HalfMask[i] = HalfElt;
15889 HalfIdx1 = HalfIdx;
15890 continue;
15891 }
15892 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15893 HalfMask[i] = HalfElt + HalfNumElts;
15894 HalfIdx2 = HalfIdx;
15895 continue;
15896 }
15897
15898 // Too many half vectors referenced.
15899 return false;
15900 }
15901
15902 return true;
15903}
15904
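// Illustrative sketch (not part of the original source): the half-index
// bookkeeping performed by getHalfShuffleMask above, assuming the lower half
// of the result is undef. The helper name and raw-pointer interface are
// simplifications for illustration only.
static bool exampleHalfShuffleMask(const int *Mask, int NumElts, int *HalfMask,
                                   int &HalfIdx1, int &HalfIdx2) {
  int HalfNumElts = NumElts / 2;
  HalfIdx1 = HalfIdx2 = -1;
  for (int i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i + HalfNumElts]; // lower half of the mask is assumed undef
    if (M < 0) {
      HalfMask[i] = M;
      continue;
    }
    int HalfIdx = M / HalfNumElts; // 0 = loV1, 1 = hiV1, 2 = loV2, 3 = hiV2
    int HalfElt = M % HalfNumElts;
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
    } else if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
    } else {
      return false; // more than two half-vectors referenced
    }
  }
  return true;
}
// e.g. the v8 mask <u,u,u,u, 6,7,8,9> gives HalfIdx1 = 1 (upper V1),
// HalfIdx2 = 2 (lower V2) and HalfMask = <2,3,4,5>.
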
15905/// Given the output values from getHalfShuffleMask(), create a half width
15906/// shuffle of extracted vectors followed by an insert back to full width.
15907 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15908 ArrayRef<int> HalfMask, int HalfIdx1,
15909 int HalfIdx2, bool UndefLower,
15910 SelectionDAG &DAG, bool UseConcat = false) {
15911 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15912 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15913
15914 MVT VT = V1.getSimpleValueType();
15915 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15916 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15917
15918 auto getHalfVector = [&](int HalfIdx) {
15919 if (HalfIdx < 0)
15920 return DAG.getUNDEF(HalfVT);
15921 SDValue V = (HalfIdx < 2 ? V1 : V2);
15922 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15923 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15924 DAG.getVectorIdxConstant(HalfIdx, DL));
15925 };
15926
15927 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15928 SDValue Half1 = getHalfVector(HalfIdx1);
15929 SDValue Half2 = getHalfVector(HalfIdx2);
15930 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15931 if (UseConcat) {
15932 SDValue Op0 = V;
15933 SDValue Op1 = DAG.getUNDEF(HalfVT);
15934 if (UndefLower)
15935 std::swap(Op0, Op1);
15936 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15937 }
15938
15939 unsigned Offset = UndefLower ? HalfNumElts : 0;
15940 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15941 DAG.getVectorIdxConstant(Offset, DL));
15942}
15943
15944/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15945/// This allows for fast cases such as subvector extraction/insertion
15946/// or shuffling smaller vector types which can lower more efficiently.
15947 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15948 SDValue V2, ArrayRef<int> Mask,
15949 const X86Subtarget &Subtarget,
15950 SelectionDAG &DAG) {
15951 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15952 "Expected 256-bit or 512-bit vector");
15953
15954 bool UndefLower = isUndefLowerHalf(Mask);
15955 if (!UndefLower && !isUndefUpperHalf(Mask))
15956 return SDValue();
15957
15958 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15959 "Completely undef shuffle mask should have been simplified already");
15960
15961 // Upper half is undef and lower half is whole upper subvector.
15962 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15963 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15964 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15965 if (!UndefLower &&
15966 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15967 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15968 DAG.getVectorIdxConstant(HalfNumElts, DL));
15969 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15970 DAG.getVectorIdxConstant(0, DL));
15971 }
15972
15973 // Lower half is undef and upper half is whole lower subvector.
15974 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15975 if (UndefLower &&
15976 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15977 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15978 DAG.getVectorIdxConstant(0, DL));
15979 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15980 DAG.getVectorIdxConstant(HalfNumElts, DL));
15981 }
15982
15983 int HalfIdx1, HalfIdx2;
15984 SmallVector<int, 8> HalfMask(HalfNumElts);
15985 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15986 return SDValue();
15987
15988 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15989
15990 // Only shuffle the halves of the inputs when useful.
15991 unsigned NumLowerHalves =
15992 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15993 unsigned NumUpperHalves =
15994 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15995 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15996
15997 // Determine the larger pattern of undef/halves, then decide if it's worth
15998 // splitting the shuffle based on subtarget capabilities and types.
15999 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
16000 if (!UndefLower) {
16001 // XXXXuuuu: no insert is needed.
16002 // Always extract lowers when setting lower - these are all free subreg ops.
16003 if (NumUpperHalves == 0)
16004 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16005 UndefLower, DAG);
16006
16007 if (NumUpperHalves == 1) {
16008 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16009 if (Subtarget.hasAVX2()) {
16010 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
16011 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16012 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
16013 (!isSingleSHUFPSMask(HalfMask) ||
16014 Subtarget.hasFastVariableCrossLaneShuffle()))
16015 return SDValue();
16016 // If this is a unary shuffle (assume that the 2nd operand is
16017 // canonicalized to undef), then we can use vpermpd. Otherwise, we
16018 // are better off extracting the upper half of 1 operand and using a
16019 // narrow shuffle.
16020 if (EltWidth == 64 && V2.isUndef())
16021 return SDValue();
16022 // If this is a unary vXi8 shuffle with inplace halves, then perform as a
16023 // full width pshufb, and then merge.
16024 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
16025 return SDValue();
16026 }
16027 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16028 if (Subtarget.hasAVX512() && VT.is512BitVector())
16029 return SDValue();
16030 // Extract + narrow shuffle is better than the wide alternative.
16031 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16032 UndefLower, DAG);
16033 }
16034
16035 // Don't extract both uppers, instead shuffle and then extract.
16036 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16037 return SDValue();
16038 }
16039
16040 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16041 if (NumUpperHalves == 0) {
16042 // AVX2 has efficient 64-bit element cross-lane shuffles.
16043 // TODO: Refine to account for unary shuffle, splat, and other masks?
16044 if (Subtarget.hasAVX2() && EltWidth == 64)
16045 return SDValue();
16046 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16047 if (Subtarget.hasAVX512() && VT.is512BitVector())
16048 return SDValue();
16049 // Narrow shuffle + insert is better than the wide alternative.
16050 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16051 UndefLower, DAG);
16052 }
16053
16054 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16055 return SDValue();
16056}
16057
16058/// Handle case where shuffle sources are coming from the same 128-bit lane and
16059/// every lane can be represented as the same repeating mask - allowing us to
16060/// shuffle the sources with the repeating shuffle and then permute the result
16061/// to the destination lanes.
16062 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16063 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16064 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16065 int NumElts = VT.getVectorNumElements();
16066 int NumLanes = VT.getSizeInBits() / 128;
16067 int NumLaneElts = NumElts / NumLanes;
16068
16069 // On AVX2 we may be able to just shuffle the lowest elements and then
16070 // broadcast the result.
16071 if (Subtarget.hasAVX2()) {
16072 for (unsigned BroadcastSize : {16, 32, 64}) {
16073 if (BroadcastSize <= VT.getScalarSizeInBits())
16074 continue;
16075 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16076
16077 // Attempt to match a repeating pattern every NumBroadcastElts,
16078 // accounting for UNDEFs, that only references the lowest 128-bit
16079 // lane of the inputs.
16080 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16081 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16082 for (int j = 0; j != NumBroadcastElts; ++j) {
16083 int M = Mask[i + j];
16084 if (M < 0)
16085 continue;
16086 int &R = RepeatMask[j];
16087 if (0 != ((M % NumElts) / NumLaneElts))
16088 return false;
16089 if (0 <= R && R != M)
16090 return false;
16091 R = M;
16092 }
16093 return true;
16094 };
16095
16096 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16097 if (!FindRepeatingBroadcastMask(RepeatMask))
16098 continue;
16099
16100 // Shuffle the (lowest) repeated elements in place for broadcast.
16101 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16102
16103 // Shuffle the actual broadcast.
16104 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16105 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16106 for (int j = 0; j != NumBroadcastElts; ++j)
16107 BroadcastMask[i + j] = j;
16108
16109 // Avoid returning the same shuffle operation. For example,
16110 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
16111 if (BroadcastMask == Mask)
16112 return SDValue();
16113
16114 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16115 BroadcastMask);
16116 }
16117 }
16118
16119 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16120 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16121 return SDValue();
16122
16123 // Bail if we already have a repeated lane shuffle mask.
16124 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16125 return SDValue();
16126
16127 // Helper to look for a repeated mask in each split sub-lane, and to check
16128 // that those sub-lanes can then be permuted into place.
16129 auto ShuffleSubLanes = [&](int SubLaneScale) {
16130 int NumSubLanes = NumLanes * SubLaneScale;
16131 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16132
16133 // Check that all the sources are coming from the same lane and see if we
16134 // can form a repeating shuffle mask (local to each sub-lane). At the same
16135 // time, determine the source sub-lane for each destination sub-lane.
16136 int TopSrcSubLane = -1;
16137 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16138 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
16139 SubLaneScale,
16140 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
16141
16142 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16143 // Extract the sub-lane mask, check that it all comes from the same lane
16144 // and normalize the mask entries to come from the first lane.
16145 int SrcLane = -1;
16146 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16147 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16148 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16149 if (M < 0)
16150 continue;
16151 int Lane = (M % NumElts) / NumLaneElts;
16152 if ((0 <= SrcLane) && (SrcLane != Lane))
16153 return SDValue();
16154 SrcLane = Lane;
16155 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16156 SubLaneMask[Elt] = LocalM;
16157 }
16158
16159 // Whole sub-lane is UNDEF.
16160 if (SrcLane < 0)
16161 continue;
16162
16163 // Attempt to match against the candidate repeated sub-lane masks.
16164 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16165 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16166 for (int i = 0; i != NumSubLaneElts; ++i) {
16167 if (M1[i] < 0 || M2[i] < 0)
16168 continue;
16169 if (M1[i] != M2[i])
16170 return false;
16171 }
16172 return true;
16173 };
16174
16175 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16176 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16177 continue;
16178
16179 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16180 for (int i = 0; i != NumSubLaneElts; ++i) {
16181 int M = SubLaneMask[i];
16182 if (M < 0)
16183 continue;
16184 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16185 "Unexpected mask element");
16186 RepeatedSubLaneMask[i] = M;
16187 }
16188
16189 // Track the topmost source sub-lane - by setting the remaining to
16190 // UNDEF we can greatly simplify shuffle matching.
16191 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16192 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16193 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16194 break;
16195 }
16196
16197 // Bail if we failed to find a matching repeated sub-lane mask.
16198 if (Dst2SrcSubLanes[DstSubLane] < 0)
16199 return SDValue();
16200 }
16201 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16202 "Unexpected source lane");
16203
16204 // Create a repeating shuffle mask for the entire vector.
16205 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16206 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16207 int Lane = SubLane / SubLaneScale;
16208 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16209 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16210 int M = RepeatedSubLaneMask[Elt];
16211 if (M < 0)
16212 continue;
16213 int Idx = (SubLane * NumSubLaneElts) + Elt;
16214 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16215 }
16216 }
16217
16218 // Shuffle each source sub-lane to its destination.
16219 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16220 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16221 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16222 if (SrcSubLane < 0)
16223 continue;
16224 for (int j = 0; j != NumSubLaneElts; ++j)
16225 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16226 }
16227
16228 // Avoid returning the same shuffle operation.
16229 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
16230 if (RepeatedMask == Mask || SubLaneMask == Mask)
16231 return SDValue();
16232
16233 SDValue RepeatedShuffle =
16234 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16235
16236 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16237 SubLaneMask);
16238 };
16239
16240 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16241 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
16242 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
16243 // Otherwise we can only permute whole 128-bit lanes.
16244 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16245 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
16246 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
16247 MinSubLaneScale = 2;
16248 MaxSubLaneScale =
16249 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
16250 }
16251 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16252 MinSubLaneScale = MaxSubLaneScale = 4;
16253
16254 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16255 if (SDValue Shuffle = ShuffleSubLanes(Scale))
16256 return Shuffle;
16257
16258 return SDValue();
16259}
16260
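// Worked example (illustrative, not part of the original source): the v8i32
// single-input mask <6,7,4,5, 2,3,0,1> is not 128-bit lane repeated, but with
// SubLaneScale = 2 (64-bit sub-lanes) the loop above decomposes it into
//   RepeatedMask = <2,3,0,1, 6,7,4,5>  ; in-lane, same pattern in each lane
//   SubLaneMask  = <4,5,6,7, 0,1,2,3>  ; 64-bit sub-lane (VPERMQ-style) move
// and shuffle(shuffle(V1, RepeatedMask), SubLaneMask) reproduces the mask.
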
16261 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16262 bool &ForceV1Zero, bool &ForceV2Zero,
16263 unsigned &ShuffleImm, ArrayRef<int> Mask,
16264 const APInt &Zeroable) {
16265 int NumElts = VT.getVectorNumElements();
16266 assert(VT.getScalarSizeInBits() == 64 &&
16267 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16268 "Unexpected data type for VSHUFPD");
16269 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16270 "Illegal shuffle mask");
16271
16272 bool ZeroLane[2] = { true, true };
16273 for (int i = 0; i < NumElts; ++i)
16274 ZeroLane[i & 1] &= Zeroable[i];
16275
16276 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
16277 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
16278 bool IsSHUFPD = true;
16279 bool IsCommutable = true;
16280 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
16281 for (int i = 0; i < NumElts; ++i) {
16282 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16283 continue;
16284 if (Mask[i] < 0)
16285 return false;
16286 int Val = (i & 6) + NumElts * (i & 1);
16287 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16288 if (Mask[i] < Val || Mask[i] > Val + 1)
16289 IsSHUFPD = false;
16290 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16291 IsCommutable = false;
16292 SHUFPDMask[i] = Mask[i] % 2;
16293 }
16294
16295 if (!IsSHUFPD && !IsCommutable)
16296 return false;
16297
16298 if (!IsSHUFPD && IsCommutable)
16299 std::swap(V1, V2);
16300
16301 ForceV1Zero = ZeroLane[0];
16302 ForceV2Zero = ZeroLane[1];
16303 ShuffleImm = getSHUFPDImm(SHUFPDMask);
16304 return true;
16305}
16306
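// Illustrative sketch (not part of the original source): the immediate that
// the matcher above hands to X86ISD::SHUFP. Bit i picks the high (odd) rather
// than the low element of the relevant 128-bit lane; result elements
// alternate between V1 (even positions) and V2 (odd positions). The helper
// name is hypothetical.
static unsigned exampleSHUFPDImm(const int *Mask, int NumElts) {
  unsigned Imm = 0;
  for (int i = 0; i != NumElts; ++i)
    if (Mask[i] >= 0)
      Imm |= unsigned(Mask[i] % 2) << i;
  return Imm;
}
// e.g. the v4f64 mask <1,5,2,6> yields SHUFPDMask {1,1,0,0} and immediate 0x3.
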
16307 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16308 SDValue V2, ArrayRef<int> Mask,
16309 const APInt &Zeroable,
16310 const X86Subtarget &Subtarget,
16311 SelectionDAG &DAG) {
16312 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16313 "Unexpected data type for VSHUFPD");
16314
16315 unsigned Immediate = 0;
16316 bool ForceV1Zero = false, ForceV2Zero = false;
16317 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16318 Mask, Zeroable))
16319 return SDValue();
16320
16321 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16322 if (ForceV1Zero)
16323 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16324 if (ForceV2Zero)
16325 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16326
16327 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16328 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16329}
16330
16331 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16332 // by zeroable elements in the remaining 24 elements. Turn this into two
16333// vmovqb instructions shuffled together.
16334 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16335 SDValue V1, SDValue V2,
16336 ArrayRef<int> Mask,
16337 const APInt &Zeroable,
16338 SelectionDAG &DAG) {
16339 assert(VT == MVT::v32i8 && "Unexpected type!");
16340
16341 // The first 8 indices should be every 8th element.
16342 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16343 return SDValue();
16344
16345 // Remaining elements need to be zeroable.
16346 if (Zeroable.countl_one() < (Mask.size() - 8))
16347 return SDValue();
16348
16349 V1 = DAG.getBitcast(MVT::v4i64, V1);
16350 V2 = DAG.getBitcast(MVT::v4i64, V2);
16351
16352 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16353 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16354
16355 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16356 // the upper bits of the result using an unpckldq.
16357 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16358 { 0, 1, 2, 3, 16, 17, 18, 19,
16359 4, 5, 6, 7, 20, 21, 22, 23 });
16360 // Insert the unpckldq into a zero vector to widen to v32i8.
16361 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16362 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16363 DAG.getVectorIdxConstant(0, DL));
16364}
16365
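// Illustrative sketch (not part of the original source): the shape of masks
// accepted above - the first 8 lanes pick every 8th byte of the concatenated
// inputs (byte 0 of each i64) and every remaining lane must be zeroable.
// Undef mask entries (-1) are allowed, mirroring isSequentialOrUndefInRange.
static bool exampleIsVTruncAndUnpackMask(const int *Mask, int NumElts,
                                         const bool *Zeroable) {
  for (int i = 0; i != 8; ++i)
    if (Mask[i] >= 0 && Mask[i] != i * 8)
      return false;
  for (int i = 8; i != NumElts; ++i)
    if (!Zeroable[i])
      return false;
  return true;
}
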
16366// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16367// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16368// =>
16369// ul = unpckl v1, v2
16370// uh = unpckh v1, v2
16371// a = vperm ul, uh
16372// b = vperm ul, uh
16373//
16374// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16375// and permute. We cannot directly match v3 because it is split into two
16376// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16377// pair of 256-bit shuffles and makes sure the masks are consecutive.
16378//
16379// Once unpck and permute nodes are created, the permute corresponding to this
16380// shuffle is returned, while the other permute replaces the other half of the
16381// shuffle in the selection dag.
16382 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
16383 SDValue V1, SDValue V2,
16384 ArrayRef<int> Mask,
16385 SelectionDAG &DAG) {
16386 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16387 VT != MVT::v32i8)
16388 return SDValue();
16389 // <B0, B1, B0+1, B1+1, ..., >
16390 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16391 unsigned Begin1) {
16392 size_t Size = Mask.size();
16393 assert(Size % 2 == 0 && "Expected even mask size");
16394 for (unsigned I = 0; I < Size; I += 2) {
16395 if (Mask[I] != (int)(Begin0 + I / 2) ||
16396 Mask[I + 1] != (int)(Begin1 + I / 2))
16397 return false;
16398 }
16399 return true;
16400 };
16401 // Check which half of the interleave this shuffle node is.
16402 int NumElts = VT.getVectorNumElements();
16403 size_t FirstQtr = NumElts / 2;
16404 size_t ThirdQtr = NumElts + NumElts / 2;
16405 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16406 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16407 if (!IsFirstHalf && !IsSecondHalf)
16408 return SDValue();
16409
16410 // Find the intersection between shuffle users of V1 and V2.
16411 SmallVector<SDNode *, 2> Shuffles;
16412 for (SDNode *User : V1->users())
16413 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16414 User->getOperand(1) == V2)
16415 Shuffles.push_back(User);
16416 // Limit user size to two for now.
16417 if (Shuffles.size() != 2)
16418 return SDValue();
16419 // Find out which half of the 512-bit shuffle each smaller shuffle is.
16420 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16421 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16422 SDNode *FirstHalf;
16423 SDNode *SecondHalf;
16424 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16425 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16426 FirstHalf = Shuffles[0];
16427 SecondHalf = Shuffles[1];
16428 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16429 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16430 FirstHalf = Shuffles[1];
16431 SecondHalf = Shuffles[0];
16432 } else {
16433 return SDValue();
16434 }
16435 // Lower into unpck and perm. Return the perm of this shuffle and replace
16436 // the other.
16437 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16438 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16439 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16440 DAG.getTargetConstant(0x20, DL, MVT::i8));
16441 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16442 DAG.getTargetConstant(0x31, DL, MVT::i8));
16443 if (IsFirstHalf) {
16444 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16445 return Perm1;
16446 }
16447 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16448 return Perm2;
16449}
16450
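// Illustrative sketch (not part of the original source): the per-half
// interleaving test used above. The first-half shuffle of interleave(v1, v2)
// has the mask <0, N, 1, N+1, ...> and the second half starts at N/2 and
// N + N/2, where N is the element count of one source.
static bool exampleIsInterleavingPattern(const int *Mask, int Size, int Begin0,
                                         int Begin1) {
  for (int I = 0; I + 1 < Size; I += 2)
    if (Mask[I] != Begin0 + I / 2 || Mask[I + 1] != Begin1 + I / 2)
      return false;
  return true;
}
// e.g. for v8i32, <0,8,1,9,2,10,3,11> matches (Begin0=0, Begin1=8) and
// <4,12,5,13,6,14,7,15> matches (Begin0=4, Begin1=12).
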
16451/// Handle lowering of 4-lane 64-bit floating point shuffles.
16452///
16453/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16454/// isn't available.
16455 static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16456 const APInt &Zeroable, SDValue V1, SDValue V2,
16457 const X86Subtarget &Subtarget,
16458 SelectionDAG &DAG) {
16459 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16460 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16461 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16462
16463 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16464 Subtarget, DAG))
16465 return V;
16466
16467 if (V2.isUndef()) {
16468 // Check for being able to broadcast a single element.
16469 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16470 Mask, Subtarget, DAG))
16471 return Broadcast;
16472
16473 // Use low duplicate instructions for masks that match their pattern.
16474 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16475 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16476
16477 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16478 // Non-half-crossing single input shuffles can be lowered with an
16479 // interleaved permutation.
16480 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16481 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16482 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16483 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16484 }
16485
16486 // With AVX2 we have direct support for this permutation.
16487 if (Subtarget.hasAVX2())
16488 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16489 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16490
16491 // Try to create an in-lane repeating shuffle mask and then shuffle the
16492 // results into the target lanes.
16493 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16494 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16495 return V;
16496
16497 // Try to permute the lanes and then use a per-lane permute.
16498 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16499 Mask, DAG, Subtarget))
16500 return V;
16501
16502 // Otherwise, fall back.
16503 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16504 DAG, Subtarget);
16505 }
16506
16507 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16508 Zeroable, Subtarget, DAG))
16509 return Blend;
16510
16511 // Use dedicated unpack instructions for masks that match their pattern.
16512 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16513 return V;
16514
16515 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16516 Zeroable, Subtarget, DAG))
16517 return Op;
16518
16519 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16520 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16521 bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
16522 bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
16523
16524 // If we have lane crossing shuffles AND they don't all come from the lower
16525 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16526 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16527 // canonicalize to a blend of splat which isn't necessary for this combine.
16528 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16529 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16530 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16531 (V2.getOpcode() != ISD::BUILD_VECTOR) &&
16532 (!Subtarget.hasAVX2() ||
16533 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
16534 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16535
16536 // If we have one input in place, then we can permute the other input and
16537 // blend the result.
16538 if (V1IsInPlace || V2IsInPlace)
16539 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16540 Zeroable, Subtarget, DAG);
16541
16542 // Try to create an in-lane repeating shuffle mask and then shuffle the
16543 // results into the target lanes.
16544 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16545 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16546 return V;
16547
16548 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16549 // shuffle. However, if we have AVX2 and either input is already in place,
16550 // we will be able to shuffle the other input even across lanes in a single
16551 // instruction, so skip this pattern.
16552 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16553 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16554 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16555 return V;
16556
16557 // If we have VLX support, we can use VEXPAND.
16558 if (Subtarget.hasVLX())
16559 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16560 Zeroable, Subtarget, DAG))
16561 return V;
16562
16563 // If we have AVX2 then we always want to lower with a blend because at v4 we
16564 // can fully permute the elements.
16565 if (Subtarget.hasAVX2())
16566 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16567 Zeroable, Subtarget, DAG);
16568
16569 // Otherwise fall back on generic lowering.
16570 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16571 Subtarget, DAG);
16572}
16573
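// Illustrative sketch (not part of the original source): the VPERMILPD
// immediate built below for non-lane-crossing v4f64 masks. Each bit selects
// the high element of its own 128-bit lane. The helper name is hypothetical.
static unsigned exampleVPermilPDImm(const int Mask[4]) {
  return unsigned(Mask[0] == 1) | (unsigned(Mask[1] == 1) << 1) |
         (unsigned(Mask[2] == 3) << 2) | (unsigned(Mask[3] == 3) << 3);
}
// e.g. the mask <1,0,3,2> (swap within each lane) encodes as 0b0101 = 0x5.
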
16574/// Handle lowering of 4-lane 64-bit integer shuffles.
16575///
16576/// This routine is only called when we have AVX2 and thus a reasonable
16577 /// instruction set for v4i64 shuffling.
16578 static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16579 const APInt &Zeroable, SDValue V1, SDValue V2,
16580 const X86Subtarget &Subtarget,
16581 SelectionDAG &DAG) {
16582 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16583 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16584 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16585 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16586
16587 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16588 Subtarget, DAG))
16589 return V;
16590
16591 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16592 Zeroable, Subtarget, DAG))
16593 return Blend;
16594
16595 // Check for being able to broadcast a single element.
16596 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16597 Subtarget, DAG))
16598 return Broadcast;
16599
16600 // Try to use shift instructions if fast.
16601 if (Subtarget.preferLowerShuffleAsShift())
16602 if (SDValue Shift =
16603 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16604 Subtarget, DAG, /*BitwiseOnly*/ true))
16605 return Shift;
16606
16607 if (V2.isUndef()) {
16608 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16609 // can use lower latency instructions that will operate on both lanes.
16610 SmallVector<int, 2> RepeatedMask;
16611 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16612 SmallVector<int, 4> PSHUFDMask;
16613 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16614 return DAG.getBitcast(
16615 MVT::v4i64,
16616 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16617 DAG.getBitcast(MVT::v8i32, V1),
16618 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16619 }
16620
16621 // AVX2 provides a direct instruction for permuting a single input across
16622 // lanes.
16623 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16624 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16625 }
16626
16627 // Try to use shift instructions.
16628 if (SDValue Shift =
16629 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16630 DAG, /*BitwiseOnly*/ false))
16631 return Shift;
16632
16633 // If we have VLX support, we can use VALIGN or VEXPAND.
16634 if (Subtarget.hasVLX()) {
16635 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16636 Zeroable, Subtarget, DAG))
16637 return Rotate;
16638
16639 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16640 Zeroable, Subtarget, DAG))
16641 return V;
16642 }
16643
16644 // Try to use PALIGNR.
16645 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16646 Subtarget, DAG))
16647 return Rotate;
16648
16649 // Use dedicated unpack instructions for masks that match their pattern.
16650 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16651 return V;
16652
16653 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16654 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16655
16656 // If we have one input in place, then we can permute the other input and
16657 // blend the result.
16658 if (V1IsInPlace || V2IsInPlace)
16659 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16660 Zeroable, Subtarget, DAG);
16661
16662 // Try to create an in-lane repeating shuffle mask and then shuffle the
16663 // results into the target lanes.
16664 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16665 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16666 return V;
16667
16668 // Try to lower to PERMQ(BLENDD(V1,V2)).
16669 if (SDValue V =
16670 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16671 return V;
16672
16673 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16674 // shuffle. However, if we have AVX2 and either input is already in place,
16675 // we will be able to shuffle the other input even across lanes in a single
16676 // instruction, so skip this pattern.
16677 if (!V1IsInPlace && !V2IsInPlace)
16678 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16679 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16680 return Result;
16681
16682 // Otherwise fall back on generic blend lowering.
16683 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16684 Zeroable, Subtarget, DAG);
16685}
16686
16687/// Handle lowering of 8-lane 32-bit floating point shuffles.
16688///
16689/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16690/// isn't available.
16691 static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16692 const APInt &Zeroable, SDValue V1, SDValue V2,
16693 const X86Subtarget &Subtarget,
16694 SelectionDAG &DAG) {
16695 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16696 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16697 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16698
16699 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16700 Zeroable, Subtarget, DAG))
16701 return Blend;
16702
16703 // Check for being able to broadcast a single element.
16704 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16705 Subtarget, DAG))
16706 return Broadcast;
16707
16708 if (!Subtarget.hasAVX2()) {
16709 SmallVector<int> InLaneMask;
16710 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16711
16712 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16713 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16714 /*SimpleOnly*/ true))
16715 return R;
16716 }
16717 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16718 Zeroable, Subtarget, DAG))
16719 return DAG.getBitcast(MVT::v8f32, ZExt);
16720
16721 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16722 // options to efficiently lower the shuffle.
16723 SmallVector<int, 4> RepeatedMask;
16724 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16725 assert(RepeatedMask.size() == 4 &&
16726 "Repeated masks must be half the mask width!");
16727
16728 // Use even/odd duplicate instructions for masks that match their pattern.
16729 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16730 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16731 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16732 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16733
16734 if (V2.isUndef())
16735 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16736 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16737
16738 // Use dedicated unpack instructions for masks that match their pattern.
16739 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16740 return V;
16741
16742 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16743 // have already handled any direct blends.
16744 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16745 }
16746
16747 // Try to create an in-lane repeating shuffle mask and then shuffle the
16748 // results into the target lanes.
16749 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16750 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16751 return V;
16752
16753 // If we have a single input shuffle with different shuffle patterns in the
16754 // two 128-bit lanes use the variable mask to VPERMILPS.
16755 if (V2.isUndef()) {
16756 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16757 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16758 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16759 }
16760 if (Subtarget.hasAVX2()) {
16761 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16762 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16763 }
16764 // Otherwise, fall back.
16765 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16766 DAG, Subtarget);
16767 }
16768
16769 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16770 // shuffle.
16771 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16772 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16773 return Result;
16774
16775 // If we have VLX support, we can use VEXPAND.
16776 if (Subtarget.hasVLX())
16777 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
16778 Zeroable, Subtarget, DAG))
16779 return V;
16780
16781 // Try to match an interleave of two v8f32s and lower them as unpck and
16782 // permutes using ymms. This needs to go before we try to split the vectors.
16783 // Don't attempt on AVX1 if we're likely to split vectors anyway.
16784 if ((Subtarget.hasAVX2() ||
16787 !Subtarget.hasAVX512())
16788 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16789 Mask, DAG))
16790 return V;
16791
16792 // For non-AVX512, if the mask is of 16-bit elements in each lane then try to
16793 // split, since after the split we get more efficient code using the
16794 // vpunpcklwd and vpunpckhwd instructions than with vblend.
16795 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16796 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16797 Subtarget, DAG);
16798
16799 // If we have AVX2 then we always want to lower with a blend because at v8 we
16800 // can fully permute the elements.
16801 if (Subtarget.hasAVX2())
16802 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16803 Zeroable, Subtarget, DAG);
16804
16805 // Otherwise fall back on generic lowering.
16806 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16807 Subtarget, DAG);
16808}
16809
16810/// Handle lowering of 8-lane 32-bit integer shuffles.
16811///
16812/// This routine is only called when we have AVX2 and thus a reasonable
16813 /// instruction set for v8i32 shuffling.
16814 static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16815 const APInt &Zeroable, SDValue V1, SDValue V2,
16816 const X86Subtarget &Subtarget,
16817 SelectionDAG &DAG) {
16818 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16819 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16820 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16821 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16822
16823 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16824
16825 // Whenever we can lower this as a zext, that instruction is strictly faster
16826 // than any alternative. It also allows us to fold memory operands into the
16827 // shuffle in many cases.
16828 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16829 Zeroable, Subtarget, DAG))
16830 return ZExt;
16831
16832 // Try to match an interleave of two v8i32s and lower them as unpck and
16833 // permutes using ymms. This needs to go before we try to split the vectors.
16834 if (!Subtarget.hasAVX512())
16835 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16836 Mask, DAG))
16837 return V;
16838
16839 // For non-AVX512, if the mask is of 16-bit elements in each lane then try to
16840 // split, since after the split we get more efficient code than vblend by
16841 // using the vpunpcklwd and vpunpckhwd instructions.
16842 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16843 !Subtarget.hasAVX512())
16844 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16845 Subtarget, DAG);
16846
16847 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16848 Zeroable, Subtarget, DAG))
16849 return Blend;
16850
16851 // Check for being able to broadcast a single element.
16852 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16853 Subtarget, DAG))
16854 return Broadcast;
16855
16856 // Try to use shift instructions if fast.
16857 if (Subtarget.preferLowerShuffleAsShift()) {
16858 if (SDValue Shift =
16859 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16860 Subtarget, DAG, /*BitwiseOnly*/ true))
16861 return Shift;
16862 if (NumV2Elements == 0)
16863 if (SDValue Rotate =
16864 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16865 return Rotate;
16866 }
16867
16868 // If the shuffle mask is repeated in each 128-bit lane we can use more
16869 // efficient instructions that mirror the shuffles across the two 128-bit
16870 // lanes.
16871 SmallVector<int, 4> RepeatedMask;
16872 bool Is128BitLaneRepeatedShuffle =
16873 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16874 if (Is128BitLaneRepeatedShuffle) {
16875 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16876 if (V2.isUndef())
16877 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16878 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16879
16880 // Use dedicated unpack instructions for masks that match their pattern.
16881 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
16882 return V;
16883 }
16884
16885 // Try to use shift instructions.
16886 if (SDValue Shift =
16887 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16888 DAG, /*BitwiseOnly*/ false))
16889 return Shift;
16890
16891 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16892 if (SDValue Rotate =
16893 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16894 return Rotate;
16895
16896 // If we have VLX support, we can use VALIGN or EXPAND.
16897 if (Subtarget.hasVLX()) {
16898 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16899 Zeroable, Subtarget, DAG))
16900 return Rotate;
16901
16902 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
16903 Zeroable, Subtarget, DAG))
16904 return V;
16905 }
16906
16907 // Try to use byte rotation instructions.
16908 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16909 Subtarget, DAG))
16910 return Rotate;
16911
16912 // Try to create an in-lane repeating shuffle mask and then shuffle the
16913 // results into the target lanes.
16914 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16915 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16916 return V;
16917
16918 if (V2.isUndef()) {
16919 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16920 // because that should be faster than the variable permute alternatives.
16921 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
16922 return V;
16923
16924 // If the shuffle patterns aren't repeated but it's a single input, directly
16925 // generate a cross-lane VPERMD instruction.
16926 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16927 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16928 }
16929
16930 // Assume that a single SHUFPS is faster than an alternative sequence of
16931 // multiple instructions (even if the CPU has a domain penalty).
16932 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16933 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16934 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16935 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16936 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16937 CastV1, CastV2, DAG);
16938 return DAG.getBitcast(MVT::v8i32, ShufPS);
16939 }
16940
16941 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16942 // shuffle.
16943 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16944 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16945 return Result;
16946
16947 // Otherwise fall back on generic blend lowering.
16948 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16949 Zeroable, Subtarget, DAG);
16950}
16951
16952/// Handle lowering of 16-lane 16-bit integer shuffles.
16953///
16954/// This routine is only called when we have AVX2 and thus a reasonable
16955 /// instruction set for v16i16 shuffling.
16956 static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16957 const APInt &Zeroable, SDValue V1, SDValue V2,
16958 const X86Subtarget &Subtarget,
16959 SelectionDAG &DAG) {
16960 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16961 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16962 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16963 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16964
16965 // Whenever we can lower this as a zext, that instruction is strictly faster
16966 // than any alternative. It also allows us to fold memory operands into the
16967 // shuffle in many cases.
16968 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16969 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16970 return ZExt;
16971
16972 // Check for being able to broadcast a single element.
16973 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16974 Subtarget, DAG))
16975 return Broadcast;
16976
16977 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16978 Zeroable, Subtarget, DAG))
16979 return Blend;
16980
16981 // Use dedicated unpack instructions for masks that match their pattern.
16982 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
16983 return V;
16984
16985 // Use dedicated pack instructions for masks that match their pattern.
16986 if (SDValue V =
16987 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16988 return V;
16989
16990 // Try to lower using a truncation.
16991 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16992 Subtarget, DAG))
16993 return V;
16994
16995 // Try to use shift instructions.
16996 if (SDValue Shift =
16997 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16998 Subtarget, DAG, /*BitwiseOnly*/ false))
16999 return Shift;
17000
17001 // Try to use byte rotation instructions.
17002 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17003 Subtarget, DAG))
17004 return Rotate;
17005
17006 // Try to create an in-lane repeating shuffle mask and then shuffle the
17007 // results into the target lanes.
17008 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17009 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17010 return V;
17011
17012 if (V2.isUndef()) {
17013 // Try to use bit rotation instructions.
17014 if (SDValue Rotate =
17015 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17016 return Rotate;
17017
17018 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17019 // because that should be faster than the variable permute alternatives.
17020 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
17021 return V;
17022
17023 // There are no generalized cross-lane shuffle operations available on i16
17024 // element types.
17025 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17026 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17027 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17028 return V;
17029
17030 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17031 DAG, Subtarget);
17032 }
17033
17034 SmallVector<int, 8> RepeatedMask;
17035 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17036 // As this is a single-input shuffle, the repeated mask should be
17037 // a strictly valid v8i16 mask that we can pass through to the v8i16
17038 // lowering to handle even the v16 case.
17039 return lowerV8I16GeneralSingleInputShuffle(
17040 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17041 }
17042 }
17043
17044 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17045 Zeroable, Subtarget, DAG))
17046 return PSHUFB;
17047
17048 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17049 if (Subtarget.hasBWI())
17050 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17051
17052 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17053 // shuffle.
17054 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17055 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17056 return Result;
17057
17058 // Try to permute the lanes and then use a per-lane permute.
17059 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17060 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17061 return V;
17062
17063 // Try to match an interleave of two v16i16s and lower them as unpck and
17064 // permutes using ymms.
17065 if (!Subtarget.hasAVX512())
17066 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
17067 Mask, DAG))
17068 return V;
17069
17070 // Otherwise fall back on generic lowering.
17071 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17072 Subtarget, DAG);
17073}
17074
17075/// Handle lowering of 32-lane 8-bit integer shuffles.
17076///
17077/// This routine is only called when we have AVX2 and thus a reasonable
17078 /// instruction set for v32i8 shuffling.
17079 static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17080 const APInt &Zeroable, SDValue V1, SDValue V2,
17081 const X86Subtarget &Subtarget,
17082 SelectionDAG &DAG) {
17083 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17084 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17085 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17086 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17087
17088 // Whenever we can lower this as a zext, that instruction is strictly faster
17089 // than any alternative. It also allows us to fold memory operands into the
17090 // shuffle in many cases.
17091 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17092 Zeroable, Subtarget, DAG))
17093 return ZExt;
17094
17095 // Check for being able to broadcast a single element.
17096 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17097 Subtarget, DAG))
17098 return Broadcast;
17099
17100 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17101 Zeroable, Subtarget, DAG))
17102 return Blend;
17103
17104 // Use dedicated unpack instructions for masks that match their pattern.
17105 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
17106 return V;
17107
17108 // Use dedicated pack instructions for masks that match their pattern.
17109 if (SDValue V =
17110 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17111 return V;
17112
17113 // Try to lower using a truncation.
17114 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17115 Subtarget, DAG))
17116 return V;
17117
17118 // Try to use shift instructions.
17119 if (SDValue Shift =
17120 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
17121 DAG, /*BitwiseOnly*/ false))
17122 return Shift;
17123
17124 // Try to use byte rotation instructions.
17125 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17126 Subtarget, DAG))
17127 return Rotate;
17128
17129 // Try to use bit rotation instructions.
17130 if (V2.isUndef())
17131 if (SDValue Rotate =
17132 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17133 return Rotate;
17134
17135 // Try to create an in-lane repeating shuffle mask and then shuffle the
17136 // results into the target lanes.
17137 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17138 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17139 return V;
17140
17141 // There are no generalized cross-lane shuffle operations available on i8
17142 // element types.
17143 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17144 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17145 // because that should be faster than the variable permute alternatives.
17146 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
17147 return V;
17148
17149 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17150 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17151 return V;
17152
17153 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17154 DAG, Subtarget);
17155 }
17156
17157 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17158 Zeroable, Subtarget, DAG))
17159 return PSHUFB;
17160
17161 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17162 if (Subtarget.hasVBMI())
17163 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17164
17165 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17166 // shuffle.
17167 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17168 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17169 return Result;
17170
17171 // Try to permute the lanes and then use a per-lane permute.
17172 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17173 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17174 return V;
17175
17176 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17177 // by zeroable elements in the remaining 24 elements. Turn this into two
17178 // vmovqb instructions shuffled together.
17179 if (Subtarget.hasVLX())
17180 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17181 Mask, Zeroable, DAG))
17182 return V;
17183
17184 // Try to match an interleave of two v32i8s and lower them as unpck and
17185 // permutes using ymms.
17186 if (!Subtarget.hasAVX512())
17187 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
17188 Mask, DAG))
17189 return V;
17190
17191 // Otherwise fall back on generic lowering.
17192 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17193 Subtarget, DAG);
17194}
17195
17196/// High-level routine to lower various 256-bit x86 vector shuffles.
17197///
17198/// This routine either breaks down the specific type of a 256-bit x86 vector
17199/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17200/// together based on the available instructions.
17201 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17202 SDValue V1, SDValue V2, const APInt &Zeroable,
17203 const X86Subtarget &Subtarget,
17204 SelectionDAG &DAG) {
17205 // If we have a single input to the zero element, insert that into V1 if we
17206 // can do so cheaply.
17207 int NumElts = VT.getVectorNumElements();
17208 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17209
17210 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17211 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17212 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17213 return Insertion;
17214
17215 // Handle special cases where the lower or upper half is UNDEF.
17216 if (SDValue V =
17217 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17218 return V;
17219
17220 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17221 // can check for those subtargets here and avoid much of the subtarget
17222 // querying in the per-vector-type lowering routines. With AVX1 we have
17223 // essentially *zero* ability to manipulate a 256-bit vector with integer
17224 // types. Since we'll use floating point types there eventually, just
17225 // immediately cast everything to a float and operate entirely in that domain.
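// e.g. a v8i32 shuffle on AVX1 is bitcast to v8f32 and lowered in the FP
// domain below, while v16i16/v32i8 fall back to bit masks, bit blends or a
// 128-bit split.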
17226 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17227 int ElementBits = VT.getScalarSizeInBits();
17228 if (ElementBits < 32) {
17229 // No floating point type available, if we can't use the bit operations
17230 // for masking/blending then decompose into 128-bit vectors.
17231 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17232 Subtarget, DAG))
17233 return V;
17234 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17235 return V;
17236 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17237 }
17238
17239 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17240 VT.getVectorNumElements());
17241 V1 = DAG.getBitcast(FpVT, V1);
17242 V2 = DAG.getBitcast(FpVT, V2);
17243 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17244 }
17245
17246 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17247 V1 = DAG.getBitcast(MVT::v16i16, V1);
17248 V2 = DAG.getBitcast(MVT::v16i16, V2);
17249 return DAG.getBitcast(VT,
17250 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17251 }
17252
17253 switch (VT.SimpleTy) {
17254 case MVT::v4f64:
17255 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17256 case MVT::v4i64:
17257 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17258 case MVT::v8f32:
17259 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17260 case MVT::v8i32:
17261 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17262 case MVT::v16i16:
17263 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17264 case MVT::v32i8:
17265 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17266
17267 default:
17268 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17269 }
17270}
17271
17272/// Try to lower a vector shuffle as a 128-bit shuffles.
17273 static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17274 const APInt &Zeroable, SDValue V1, SDValue V2,
17275 const X86Subtarget &Subtarget,
17276 SelectionDAG &DAG) {
17277 assert(VT.getScalarSizeInBits() == 64 &&
17278 "Unexpected element type size for 128bit shuffle.");
17279
17280 // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle() is
17281 // most probably the better solution for that case.
17282 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17283
17284 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17285 SmallVector<int, 4> Widened128Mask;
17286 if (!canWidenShuffleElements(Mask, Widened128Mask))
17287 return SDValue();
17288 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17289
17290 // Try to use an insert into a zero vector.
17291 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17292 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17293 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17294 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17295 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17296 DAG.getVectorIdxConstant(0, DL));
17297 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17298 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17299 DAG.getVectorIdxConstant(0, DL));
17300 }
17301
17302 // Check for patterns which can be matched with a single insert of a 256-bit
17303 // subvector.
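// e.g. mask <0,1,2,3,0,1,2,3> repeats the low 256 bits of V1 in both halves,
// while <0,1,2,3,8,9,10,11> inserts the low 256 bits of V2 into the upper
// half of V1.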
17304 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17305 if (OnlyUsesV1 ||
17306 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17307 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17308 SDValue SubVec =
17309 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17310 DAG.getVectorIdxConstant(0, DL));
17311 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17312 DAG.getVectorIdxConstant(4, DL));
17313 }
17314
17315 // See if this is an insertion of the lower 128-bits of V2 into V1.
17316 bool IsInsert = true;
17317 int V2Index = -1;
17318 for (int i = 0; i < 4; ++i) {
17319 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17320 if (Widened128Mask[i] < 0)
17321 continue;
17322
17323 // Make sure all V1 subvectors are in place.
17324 if (Widened128Mask[i] < 4) {
17325 if (Widened128Mask[i] != i) {
17326 IsInsert = false;
17327 break;
17328 }
17329 } else {
17330 // Make sure we only have a single V2 index and it's the lowest 128 bits.
17331 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17332 IsInsert = false;
17333 break;
17334 }
17335 V2Index = i;
17336 }
17337 }
17338 if (IsInsert && V2Index >= 0) {
17339 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17340 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17341 DAG.getVectorIdxConstant(0, DL));
17342 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17343 }
17344
17345 // See if we can widen to a 256-bit lane shuffle; we're going to lose the
17346 // 128-bit lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17347 // possible we at least ensure the lanes stay sequential to help later
17348 // combines.
17349 SmallVector<int, 2> Widened256Mask;
17350 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17351 Widened128Mask.clear();
17352 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17353 }
17354
17355 // Try to lower to vshuf64x2/vshuf32x4.
17356 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17357 int PermMask[4] = {-1, -1, -1, -1};
17358 // Ensure elements came from the same Op.
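// X86ISD::SHUF128 selects the low two 128-bit lanes of the result from Ops[0]
// and the high two from Ops[1], so each half of the widened mask must draw
// from a single source.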
17359 for (int i = 0; i < 4; ++i) {
17360 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17361 if (Widened128Mask[i] < 0)
17362 continue;
17363
17364 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17365 unsigned OpIndex = i / 2;
17366 if (Ops[OpIndex].isUndef())
17367 Ops[OpIndex] = Op;
17368 else if (Ops[OpIndex] != Op)
17369 return SDValue();
17370
17371 PermMask[i] = Widened128Mask[i] % 4;
17372 }
17373
17374 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17375 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17376}
17377
17378/// Handle lowering of 8-lane 64-bit floating point shuffles.
17379 static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17380 const APInt &Zeroable, SDValue V1, SDValue V2,
17381 const X86Subtarget &Subtarget,
17382 SelectionDAG &DAG) {
17383 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17384 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17385 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17386
17387 if (V2.isUndef()) {
17388 // Use low duplicate instructions for masks that match their pattern.
17389 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17390 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17391
17392 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17393 // Non-half-crossing single input shuffles can be lowered with an
17394 // interleaved permutation.
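// VPERMILPD takes one immediate bit per element that picks the even or odd
// double within that element's 128-bit lane, so bit i is set whenever Mask[i]
// selects the odd element of its pair.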
17395 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17396 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17397 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17398 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17399 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17400 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17401 }
17402
17403 SmallVector<int, 4> RepeatedMask;
17404 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17405 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17406 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17407 }
17408
17409 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17410 V2, Subtarget, DAG))
17411 return Shuf128;
17412
17413 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17414 return Unpck;
17415
17416 // Check if the blend happens to exactly fit that of SHUFPD.
17417 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17418 Zeroable, Subtarget, DAG))
17419 return Op;
17420
17421 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17422 Subtarget, DAG))
17423 return V;
17424
17425 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17426 Zeroable, Subtarget, DAG))
17427 return Blend;
17428
17429 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17430}
17431
17432/// Handle lowering of 16-lane 32-bit floating point shuffles.
17433 static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17434 const APInt &Zeroable, SDValue V1, SDValue V2,
17435 const X86Subtarget &Subtarget,
17436 SelectionDAG &DAG) {
17437 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17438 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17439 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17440
17441 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17442 // options to efficiently lower the shuffle.
17443 SmallVector<int, 4> RepeatedMask;
17444 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17445 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17446
17447 // Use even/odd duplicate instructions for masks that match their pattern.
17448 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17449 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17450 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17451 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17452
17453 if (V2.isUndef())
17454 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17455 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17456
17457 // Use dedicated unpack instructions for masks that match their pattern.
17458 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17459 return V;
17460
17461 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17462 Zeroable, Subtarget, DAG))
17463 return Blend;
17464
17465 // Otherwise, fall back to a SHUFPS sequence.
17466 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17467 }
17468
17469 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17470 Zeroable, Subtarget, DAG))
17471 return Blend;
17472
17473 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17474 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17475 return DAG.getBitcast(MVT::v16f32, ZExt);
17476
17477 // Try to create an in-lane repeating shuffle mask and then shuffle the
17478 // results into the target lanes.
17479 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17480 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17481 return V;
17482
17483 // If we have a single input shuffle with different shuffle patterns in the
17484 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17485 if (V2.isUndef() &&
17486 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17487 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17488 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17489 }
17490
17491 // If we have AVX512F support, we can use VEXPAND.
17492 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17493 Zeroable, Subtarget, DAG))
17494 return V;
17495
17496 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17497}
17498
17499/// Handle lowering of 8-lane 64-bit integer shuffles.
17500 static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17501 const APInt &Zeroable, SDValue V1, SDValue V2,
17502 const X86Subtarget &Subtarget,
17503 SelectionDAG &DAG) {
17504 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17505 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17506 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17507
17508 // Try to use shift instructions if fast.
17509 if (Subtarget.preferLowerShuffleAsShift())
17510 if (SDValue Shift =
17511 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17512 Subtarget, DAG, /*BitwiseOnly*/ true))
17513 return Shift;
17514
17515 if (V2.isUndef()) {
17516 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17517 // can use lower latency instructions that will operate on all four
17518 // 128-bit lanes.
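// e.g. the v8i64 mask <1,0,3,2,5,4,7,6> repeats <1,0> in every 128-bit lane;
// narrowing it yields the v16i32 PSHUFD mask <2,3,0,1>.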
17519 SmallVector<int, 2> Repeated128Mask;
17520 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17521 SmallVector<int, 4> PSHUFDMask;
17522 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17523 return DAG.getBitcast(
17524 MVT::v8i64,
17525 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17526 DAG.getBitcast(MVT::v16i32, V1),
17527 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17528 }
17529
17530 SmallVector<int, 4> Repeated256Mask;
17531 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17532 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17533 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17534 }
17535
17536 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17537 V2, Subtarget, DAG))
17538 return Shuf128;
17539
17540 // Try to use shift instructions.
17541 if (SDValue Shift =
17542 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17543 DAG, /*BitwiseOnly*/ false))
17544 return Shift;
17545
17546 // Try to use VALIGN.
17547 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17548 Zeroable, Subtarget, DAG))
17549 return Rotate;
17550
17551 // Try to use PALIGNR.
17552 if (Subtarget.hasBWI())
17553 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17554 Subtarget, DAG))
17555 return Rotate;
17556
17557 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17558 return Unpck;
17559
17560 // If we have AVX512F support, we can use VEXPAND.
17561 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17562 Subtarget, DAG))
17563 return V;
17564
17565 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17566 Zeroable, Subtarget, DAG))
17567 return Blend;
17568
17569 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17570}
17571
17572/// Handle lowering of 16-lane 32-bit integer shuffles.
17573 static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17574 const APInt &Zeroable, SDValue V1, SDValue V2,
17575 const X86Subtarget &Subtarget,
17576 SelectionDAG &DAG) {
17577 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17578 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17579 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17580
17581 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17582
17583 // Whenever we can lower this as a zext, that instruction is strictly faster
17584 // than any alternative. It also allows us to fold memory operands into the
17585 // shuffle in many cases.
17586 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17587 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17588 return ZExt;
17589
17590 // Try to use shift instructions if fast.
17591 if (Subtarget.preferLowerShuffleAsShift()) {
17592 if (SDValue Shift =
17593 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17594 Subtarget, DAG, /*BitwiseOnly*/ true))
17595 return Shift;
17596 if (NumV2Elements == 0)
17597 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17598 Subtarget, DAG))
17599 return Rotate;
17600 }
17601
17602 // If the shuffle mask is repeated in each 128-bit lane we can use more
17603 // efficient instructions that mirror the shuffles across the four 128-bit
17604 // lanes.
17605 SmallVector<int, 4> RepeatedMask;
17606 bool Is128BitLaneRepeatedShuffle =
17607 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17608 if (Is128BitLaneRepeatedShuffle) {
17609 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17610 if (V2.isUndef())
17611 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17612 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17613
17614 // Use dedicated unpack instructions for masks that match their pattern.
17615 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17616 return V;
17617 }
17618
17619 // Try to use shift instructions.
17620 if (SDValue Shift =
17621 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17622 Subtarget, DAG, /*BitwiseOnly*/ false))
17623 return Shift;
17624
17625 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17626 if (SDValue Rotate =
17627 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17628 return Rotate;
17629
17630 // Try to use VALIGN.
17631 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17632 Zeroable, Subtarget, DAG))
17633 return Rotate;
17634
17635 // Try to use byte rotation instructions.
17636 if (Subtarget.hasBWI())
17637 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17638 Subtarget, DAG))
17639 return Rotate;
17640
17641 // Assume that a single SHUFPS is faster than using a permv shuffle.
17642 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17643 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17644 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17645 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17646 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17647 CastV1, CastV2, DAG);
17648 return DAG.getBitcast(MVT::v16i32, ShufPS);
17649 }
17650
17651 // Try to create an in-lane repeating shuffle mask and then shuffle the
17652 // results into the target lanes.
17653 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17654 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17655 return V;
17656
17657 // If we have AVX512F support, we can use VEXPAND.
17658 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17659 Zeroable, Subtarget, DAG))
17660 return V;
17661
17662 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17663 Zeroable, Subtarget, DAG))
17664 return Blend;
17665
17666 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17667}
17668
17669/// Handle lowering of 32-lane 16-bit integer shuffles.
17670 static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17671 const APInt &Zeroable, SDValue V1, SDValue V2,
17672 const X86Subtarget &Subtarget,
17673 SelectionDAG &DAG) {
17674 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17675 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17676 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17677 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17678
17679 // Whenever we can lower this as a zext, that instruction is strictly faster
17680 // than any alternative. It also allows us to fold memory operands into the
17681 // shuffle in many cases.
17682 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17683 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17684 return ZExt;
17685
17686 // Use dedicated unpack instructions for masks that match their pattern.
17687 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17688 return V;
17689
17690 // Use dedicated pack instructions for masks that match their pattern.
17691 if (SDValue V =
17692 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17693 return V;
17694
17695 // Try to use shift instructions.
17696 if (SDValue Shift =
17697 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17698 Subtarget, DAG, /*BitwiseOnly*/ false))
17699 return Shift;
17700
17701 // Try to use byte rotation instructions.
17702 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17703 Subtarget, DAG))
17704 return Rotate;
17705
17706 if (V2.isUndef()) {
17707 // Try to use bit rotation instructions.
17708 if (SDValue Rotate =
17709 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17710 return Rotate;
17711
17712 SmallVector<int, 8> RepeatedMask;
17713 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17714 // As this is a single-input shuffle, the repeated mask should be
17715 // a strictly valid v8i16 mask that we can pass through to the v8i16
17716 // lowering to handle even the v32 case.
17717 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17718 RepeatedMask, Subtarget, DAG);
17719 }
17720 }
17721
17722 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17723 Zeroable, Subtarget, DAG))
17724 return Blend;
17725
17726 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17727 Zeroable, Subtarget, DAG))
17728 return PSHUFB;
17729
17730 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17731 // shuffle.
17732 if (!V2.isUndef())
17733 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17734 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17735 return Result;
17736
17737 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17738}
17739
17740/// Handle lowering of 64-lane 8-bit integer shuffles.
17741 static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17742 const APInt &Zeroable, SDValue V1, SDValue V2,
17743 const X86Subtarget &Subtarget,
17744 SelectionDAG &DAG) {
17745 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17746 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17747 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17748 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17749
17750 // Whenever we can lower this as a zext, that instruction is strictly faster
17751 // than any alternative. It also allows us to fold memory operands into the
17752 // shuffle in many cases.
17753 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17754 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17755 return ZExt;
17756
17757 // Use dedicated unpack instructions for masks that match their pattern.
17758 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17759 return V;
17760
17761 // Use dedicated pack instructions for masks that match their pattern.
17762 if (SDValue V =
17763 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17764 return V;
17765
17766 // Try to use shift instructions.
17767 if (SDValue Shift =
17768 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17769 DAG, /*BitwiseOnly*/ false))
17770 return Shift;
17771
17772 // Try to use byte rotation instructions.
17773 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17774 Subtarget, DAG))
17775 return Rotate;
17776
17777 // Try to use bit rotation instructions.
17778 if (V2.isUndef())
17779 if (SDValue Rotate =
17780 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17781 return Rotate;
17782
17783 // Lower as AND if possible.
17784 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17785 Zeroable, Subtarget, DAG))
17786 return Masked;
17787
17788 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17789 Zeroable, Subtarget, DAG))
17790 return PSHUFB;
17791
17792 // Try to create an in-lane repeating shuffle mask and then shuffle the
17793 // results into the target lanes.
17794 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17795 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17796 return V;
17797
17798 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17799 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17800 return Result;
17801
17802 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17803 Zeroable, Subtarget, DAG))
17804 return Blend;
17805
17806 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17807 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17808 // PALIGNR will be cheaper than the second PSHUFB+OR.
17809 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17810 Mask, Subtarget, DAG))
17811 return V;
17812
17813 // If we can't directly blend but can use PSHUFB, that will be better as it
17814 // can both shuffle and set up the inefficient blend.
17815 bool V1InUse, V2InUse;
17816 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17817 DAG, V1InUse, V2InUse);
17818 }
17819
17820 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17821 // shuffle.
17822 if (!V2.isUndef())
17823 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17824 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17825 return Result;
17826
17827 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17828 if (Subtarget.hasVBMI())
17829 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17830
17831 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17832}
17833
17834/// High-level routine to lower various 512-bit x86 vector shuffles.
17835///
17836/// This routine either breaks down the specific type of a 512-bit x86 vector
17837/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17838/// together based on the available instructions.
17839 static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17840 MVT VT, SDValue V1, SDValue V2,
17841 const APInt &Zeroable,
17842 const X86Subtarget &Subtarget,
17843 SelectionDAG &DAG) {
17844 assert(Subtarget.hasAVX512() &&
17845 "Cannot lower 512-bit vectors w/ basic ISA!");
17846
17847 // If we have a single input to the zero element, insert that into V1 if we
17848 // can do so cheaply.
17849 int NumElts = Mask.size();
17850 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17851
17852 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17853 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17854 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17855 return Insertion;
17856
17857 // Handle special cases where the lower or upper half is UNDEF.
17858 if (SDValue V =
17859 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17860 return V;
17861
17862 // Check for being able to broadcast a single element.
17863 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17864 Subtarget, DAG))
17865 return Broadcast;
17866
17867 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17868 // Try using bit ops for masking and blending before falling back to
17869 // splitting.
17870 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17871 Subtarget, DAG))
17872 return V;
17873 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17874 return V;
17875
17876 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17877 }
17878
17879 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17880 if (!Subtarget.hasBWI())
17881 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17882 /*SimpleOnly*/ false);
17883
17884 V1 = DAG.getBitcast(MVT::v32i16, V1);
17885 V2 = DAG.getBitcast(MVT::v32i16, V2);
17886 return DAG.getBitcast(VT,
17887 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17888 }
17889
17890 // Dispatch to each element type for lowering. If we don't have support for
17891 // specific element type shuffles at 512 bits, immediately split them and
17892 // lower them. Each lowering routine of a given type is allowed to assume that
17893 // the requisite ISA extensions for that element type are available.
17894 switch (VT.SimpleTy) {
17895 case MVT::v8f64:
17896 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17897 case MVT::v16f32:
17898 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17899 case MVT::v8i64:
17900 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17901 case MVT::v16i32:
17902 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17903 case MVT::v32i16:
17904 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17905 case MVT::v64i8:
17906 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17907
17908 default:
17909 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17910 }
17911}
17912
17913 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17914 MVT VT, SDValue V1, SDValue V2,
17915 const X86Subtarget &Subtarget,
17916 SelectionDAG &DAG) {
17917 // Shuffle should be unary.
17918 if (!V2.isUndef())
17919 return SDValue();
17920
17921 int ShiftAmt = -1;
17922 int NumElts = Mask.size();
17923 for (int i = 0; i != NumElts; ++i) {
17924 int M = Mask[i];
17925 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17926 "Unexpected mask index.");
17927 if (M < 0)
17928 continue;
17929
17930 // The first non-undef element determines our shift amount.
17931 if (ShiftAmt < 0) {
17932 ShiftAmt = M - i;
17933 // Need to be shifting right.
17934 if (ShiftAmt <= 0)
17935 return SDValue();
17936 }
17937 // All non-undef elements must shift by the same amount.
17938 if (ShiftAmt != M - i)
17939 return SDValue();
17940 }
17941 assert(ShiftAmt >= 0 && "All undef?");
17942
17943 // Great we found a shift right.
17944 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17945 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17946 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17947 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17948 DAG.getVectorIdxConstant(0, DL));
17949}
17950
17951// Determine if this shuffle can be implemented with a KSHIFT instruction.
17952// Returns the shift amount if possible or -1 if not. This is a simplified
17953// version of matchShuffleAsShift.
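// e.g. with Size == 8 and MaskOffset == 0, a mask <2,3,4,5,6,7,..> whose top
// two elements are zeroable matches KSHIFTR by 2, while <..,..,0,1,2,3,4,5>
// with the low two elements zeroable matches KSHIFTL by 2.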
17954static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17955 int MaskOffset, const APInt &Zeroable) {
17956 int Size = Mask.size();
17957
17958 auto CheckZeros = [&](int Shift, bool Left) {
17959 for (int j = 0; j < Shift; ++j)
17960 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17961 return false;
17962
17963 return true;
17964 };
17965
17966 auto MatchShift = [&](int Shift, bool Left) {
17967 unsigned Pos = Left ? Shift : 0;
17968 unsigned Low = Left ? 0 : Shift;
17969 unsigned Len = Size - Shift;
17970 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17971 };
17972
17973 for (int Shift = 1; Shift != Size; ++Shift)
17974 for (bool Left : {true, false})
17975 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17976 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17977 return Shift;
17978 }
17979
17980 return -1;
17981}
17982
17983
17984// Lower vXi1 vector shuffles.
17985 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17986 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17987 // vector, shuffle it, and then truncate it back.
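// For example, a v8i1 shuffle is widened to v8i64 (or v8i32 with VLX),
// shuffled there, and converted back to a mask with a compare against zero or
// a truncate.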
17988 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17989 MVT VT, SDValue V1, SDValue V2,
17990 const APInt &Zeroable,
17991 const X86Subtarget &Subtarget,
17992 SelectionDAG &DAG) {
17993 assert(Subtarget.hasAVX512() &&
17994 "Cannot lower 512-bit vectors w/o basic ISA!");
17995
17996 int NumElts = Mask.size();
17997 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17998
17999 // Try to recognize shuffles that are just padding a subvector with zeros.
18000 int SubvecElts = 0;
18001 int Src = -1;
18002 for (int i = 0; i != NumElts; ++i) {
18003 if (Mask[i] >= 0) {
18004 // Grab the source from the first valid mask. All subsequent elements need
18005 // to use this same source.
18006 if (Src < 0)
18007 Src = Mask[i] / NumElts;
18008 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18009 break;
18010 }
18011
18012 ++SubvecElts;
18013 }
18014 assert(SubvecElts != NumElts && "Identity shuffle?");
18015
18016 // Clip to a power of 2.
18017 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
18018
18019 // Make sure the number of zeroable bits in the top at least covers the bits
18020 // not covered by the subvector.
18021 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
18022 assert(Src >= 0 && "Expected a source!");
18023 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18024 SDValue Extract =
18025 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
18026 DAG.getVectorIdxConstant(0, DL));
18027 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18028 DAG.getConstant(0, DL, VT), Extract,
18029 DAG.getVectorIdxConstant(0, DL));
18030 }
18031
18032 // Try a simple shift right with undef elements. Later we'll try with zeros.
18033 if (SDValue Shift =
18034 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
18035 return Shift;
18036
18037 // Try to match KSHIFTs.
18038 unsigned Offset = 0;
18039 for (SDValue V : {V1, V2}) {
18040 unsigned Opcode;
18041 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18042 if (ShiftAmt >= 0) {
18043 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
18044 MVT WideVT = Res.getSimpleValueType();
18045 // Widened right shifts need two shifts to ensure we shift in zeroes.
18046 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18047 int WideElts = WideVT.getVectorNumElements();
18048 // Shift left to put the original vector in the MSBs of the new size.
18049 Res =
18050 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18051 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18052 // Increase the shift amount to account for the left shift.
18053 ShiftAmt += WideElts - NumElts;
18054 }
18055
18056 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18057 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18058 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18059 DAG.getVectorIdxConstant(0, DL));
18060 }
18061 Offset += NumElts; // Increment for next iteration.
18062 }
18063
18064 // If we're performing a unary shuffle on a SETCC result, try to shuffle the
18065 // ops instead.
18066 // TODO: What other unary shuffles would benefit from this?
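// i.e. shuffle(setcc(x, y)) becomes setcc(shuffle(x), shuffle(y)), keeping
// the comparison in the wider element domain.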
18067 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
18068 SDValue Op0 = V1.getOperand(0);
18069 SDValue Op1 = V1.getOperand(1);
18070 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
18071 EVT OpVT = Op0.getValueType();
18072 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
18073 return DAG.getSetCC(
18074 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18075 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18076 }
18077
18078 MVT ExtVT;
18079 switch (VT.SimpleTy) {
18080 default:
18081 llvm_unreachable("Expected a vector of i1 elements");
18082 case MVT::v2i1:
18083 ExtVT = MVT::v2i64;
18084 break;
18085 case MVT::v4i1:
18086 ExtVT = MVT::v4i32;
18087 break;
18088 case MVT::v8i1:
18089 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
18090 // shuffle.
18091 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18092 break;
18093 case MVT::v16i1:
18094 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18095 // 256-bit operation available.
18096 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18097 break;
18098 case MVT::v32i1:
18099 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18100 // 256-bit operation available.
18101 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18102 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18103 break;
18104 case MVT::v64i1:
18105 // Fall back to scalarization. FIXME: We can do better if the shuffle
18106 // can be partitioned cleanly.
18107 if (!Subtarget.useBWIRegs())
18108 return SDValue();
18109 ExtVT = MVT::v64i8;
18110 break;
18111 }
18112
18113 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18114 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18115
18116 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18117 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
18118 int NumElems = VT.getVectorNumElements();
18119 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18120 (Subtarget.hasDQI() && (NumElems < 32)))
18121 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18122 Shuffle, ISD::SETGT);
18123
18124 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18125}
18126
18127/// Helper function that returns true if the shuffle mask should be
18128/// commuted to improve canonicalization.
18129 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18130 int NumElements = Mask.size();
18131
18132 int NumV1Elements = 0, NumV2Elements = 0;
18133 for (int M : Mask)
18134 if (M < 0)
18135 continue;
18136 else if (M < NumElements)
18137 ++NumV1Elements;
18138 else
18139 ++NumV2Elements;
18140
18141 // Commute the shuffle as needed such that more elements come from V1 than
18142 // V2. This allows us to match the shuffle pattern strictly on how many
18143 // elements come from V1 without handling the symmetric cases.
18144 if (NumV2Elements > NumV1Elements)
18145 return true;
18146
18147 assert(NumV1Elements > 0 && "No V1 indices");
18148
18149 if (NumV2Elements == 0)
18150 return false;
18151
18152 // When the number of V1 and V2 elements are the same, try to minimize the
18153 // number of uses of V2 in the low half of the vector. When that is tied,
18154 // ensure that the sum of indices for V1 is equal to or lower than the sum of
18155 // indices for V2. When those are equal, try to ensure that the number of odd
18156 // indices for V1 is lower than the number of odd indices for V2.
18157 if (NumV1Elements == NumV2Elements) {
18158 int LowV1Elements = 0, LowV2Elements = 0;
18159 for (int M : Mask.slice(0, NumElements / 2))
18160 if (M >= NumElements)
18161 ++LowV2Elements;
18162 else if (M >= 0)
18163 ++LowV1Elements;
18164 if (LowV2Elements > LowV1Elements)
18165 return true;
18166 if (LowV2Elements == LowV1Elements) {
18167 int SumV1Indices = 0, SumV2Indices = 0;
18168 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18169 if (Mask[i] >= NumElements)
18170 SumV2Indices += i;
18171 else if (Mask[i] >= 0)
18172 SumV1Indices += i;
18173 if (SumV2Indices < SumV1Indices)
18174 return true;
18175 if (SumV2Indices == SumV1Indices) {
18176 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18177 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18178 if (Mask[i] >= NumElements)
18179 NumV2OddIndices += i % 2;
18180 else if (Mask[i] >= 0)
18181 NumV1OddIndices += i % 2;
18182 if (NumV2OddIndices < NumV1OddIndices)
18183 return true;
18184 }
18185 }
18186 }
18187
18188 return false;
18189}
18190
18191 static bool canCombineAsMaskOperation(SDValue V,
18192 const X86Subtarget &Subtarget) {
18193 if (!Subtarget.hasAVX512())
18194 return false;
18195
18196 if (!V.getValueType().isSimple())
18197 return false;
18198
18199 MVT VT = V.getSimpleValueType().getScalarType();
18200 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
18201 return false;
18202
18203 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
18204 // are preferable to blendw/blendvb/masked-mov.
18205 if ((VT == MVT::i16 || VT == MVT::i8) &&
18206 V.getSimpleValueType().getSizeInBits() < 512)
18207 return false;
18208
18209 auto HasMaskOperation = [&](SDValue V) {
18210 // TODO: Currently we only check a limited set of opcodes. We could probably
18211 // extend this to all binary operations by checking TLI.isBinOp().
18212 switch (V->getOpcode()) {
18213 default:
18214 return false;
18215 case ISD::ADD:
18216 case ISD::SUB:
18217 case ISD::AND:
18218 case ISD::XOR:
18219 case ISD::OR:
18220 case ISD::SMAX:
18221 case ISD::SMIN:
18222 case ISD::UMAX:
18223 case ISD::UMIN:
18224 case ISD::ABS:
18225 case ISD::SHL:
18226 case ISD::SRL:
18227 case ISD::SRA:
18228 case ISD::MUL:
18229 break;
18230 }
18231 if (!V->hasOneUse())
18232 return false;
18233
18234 return true;
18235 };
18236
18237 if (HasMaskOperation(V))
18238 return true;
18239
18240 return false;
18241}
18242
18243// Forward declaration.
18244 static SDValue canonicalizeShuffleMaskWithHorizOp(
18245 MutableArrayRef<SDValue> Inputs, MutableArrayRef<int> Mask,
18246 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18247 const X86Subtarget &Subtarget);
18248
18249 /// Top-level lowering for x86 vector shuffles.
18250///
18251/// This handles decomposition, canonicalization, and lowering of all x86
18252/// vector shuffles. Most of the specific lowering strategies are encapsulated
18253/// above in helper routines. The canonicalization attempts to widen shuffles
18254/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18255/// s.t. only one of the two inputs needs to be tested, etc.
18256 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18257 SelectionDAG &DAG) {
18258 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18259 ArrayRef<int> OrigMask = SVOp->getMask();
18260 SDValue V1 = Op.getOperand(0);
18261 SDValue V2 = Op.getOperand(1);
18262 MVT VT = Op.getSimpleValueType();
18263 int NumElements = VT.getVectorNumElements();
18264 SDLoc DL(Op);
18265 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18266
18267 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18268 "Can't lower MMX shuffles");
18269
18270 bool V1IsUndef = V1.isUndef();
18271 bool V2IsUndef = V2.isUndef();
18272 if (V1IsUndef && V2IsUndef)
18273 return DAG.getUNDEF(VT);
18274
18275 // When we create a shuffle node we put the UNDEF node to second operand,
18276 // but in some cases the first operand may be transformed to UNDEF.
18277 // In this case we should just commute the node.
18278 if (V1IsUndef)
18279 return DAG.getCommutedVectorShuffle(*SVOp);
18280
18281 // Check for non-undef masks pointing at an undef vector and make the masks
18282 // undef as well. This makes it easier to match the shuffle based solely on
18283 // the mask.
18284 if (V2IsUndef &&
18285 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18286 SmallVector<int, 8> NewMask(OrigMask);
18287 for (int &M : NewMask)
18288 if (M >= NumElements)
18289 M = -1;
18290 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18291 }
18292
18293 // Check for illegal shuffle mask element index values.
18294 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18295 (void)MaskUpperLimit;
18296 assert(llvm::all_of(OrigMask,
18297 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18298 "Out of bounds shuffle index");
18299
18300 // We actually see shuffles that are entirely re-arrangements of a set of
18301 // zero inputs. This mostly happens while decomposing complex shuffles into
18302 // simple ones. Directly lower these as a buildvector of zeros.
18303 APInt KnownUndef, KnownZero;
18304 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18305
18306 APInt Zeroable = KnownUndef | KnownZero;
18307 if (Zeroable.isAllOnes())
18308 return getZeroVector(VT, Subtarget, DAG, DL);
18309
18310 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18311
18312 // Try to collapse shuffles into using a vector type with fewer elements but
18313 // wider element types. We cap this to not form integers or floating point
18314 // elements wider than 64 bits. It does not seem beneficial to form i128
18315 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
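// e.g. a v4i32 shuffle with mask <0,1,4,5> widens to a v2i64 shuffle with
// mask <0,2>.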
18316 SmallVector<int, 16> WidenedMask;
18317 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18318 !canCombineAsMaskOperation(V1, Subtarget) &&
18319 !canCombineAsMaskOperation(V2, Subtarget) &&
18320 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18321 // Shuffle mask widening should not interfere with a broadcast opportunity
18322 // by obfuscating the operands with bitcasts.
18323 // TODO: Avoid lowering directly from this top-level function: make this
18324 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18325 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18326 Subtarget, DAG))
18327 return Broadcast;
18328
18329 MVT NewEltVT = VT.isFloatingPoint()
18330 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18331 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18332 int NewNumElts = NumElements / 2;
18333 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18334 // Make sure that the new vector type is legal. For example, v2f64 isn't
18335 // legal on SSE1.
18336 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18337 if (V2IsZero) {
18338 // Modify the new Mask to take all zeros from the all-zero vector.
18339 // Choose indices that are blend-friendly.
18340 bool UsedZeroVector = false;
18341 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18342 "V2's non-undef elements are used?!");
18343 for (int i = 0; i != NewNumElts; ++i)
18344 if (WidenedMask[i] == SM_SentinelZero) {
18345 WidenedMask[i] = i + NewNumElts;
18346 UsedZeroVector = true;
18347 }
18348 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18349 // some elements to be undef.
18350 if (UsedZeroVector)
18351 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18352 }
18353 V1 = DAG.getBitcast(NewVT, V1);
18354 V2 = DAG.getBitcast(NewVT, V2);
18355 return DAG.getBitcast(
18356 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18357 }
18358 }
18359
18360 SmallVector<SDValue> Ops = {V1, V2};
18361 SmallVector<int> Mask(OrigMask);
18362
18363 // Canonicalize the shuffle with any horizontal ops inputs.
18364 // NOTE: This may update Ops and Mask.
18365 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18366 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18367 return DAG.getBitcast(VT, HOp);
18368
18369 V1 = DAG.getBitcast(VT, Ops[0]);
18370 V2 = DAG.getBitcast(VT, Ops[1]);
18371 assert(NumElements == (int)Mask.size() &&
18372 "canonicalizeShuffleMaskWithHorizOp "
18373 "shouldn't alter the shuffle mask size");
18374
18375 // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
18376 // These will be materialized uniformly anyway, so make splat matching easier.
18377 // TODO: Allow all int constants?
18378 auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
18379 if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
18380 BitVector Undefs;
18381 if (SDValue Splat = BV->getSplatValue(&Undefs)) {
18382 if (Undefs.any() &&
18383 (isNullConstant(Splat) || isAllOnesConstant(Splat) ||
18384 isa<ConstantFPSDNode>(Splat))) {
18385 V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
18386 }
18387 }
18388 }
18389 return V;
18390 };
18391 V1 = CanonicalizeConstant(V1);
18392 V2 = CanonicalizeConstant(V2);
18393
18394 // Commute the shuffle if it will improve canonicalization.
18395 if (canonicalizeShuffleMaskWithCommute(V1, V2, Mask)) {
18396 ShuffleVectorSDNode::commuteMask(Mask);
18397 std::swap(V1, V2);
18398 }
18399
18400 // For each vector width, delegate to a specialized lowering routine.
18401 if (VT.is128BitVector())
18402 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18403
18404 if (VT.is256BitVector())
18405 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18406
18407 if (VT.is512BitVector())
18408 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18409
18410 if (Is1BitVector)
18411 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18412
18413 llvm_unreachable("Unimplemented!");
18414}
18415
18416// As legal vpcompress instructions depend on various AVX512 extensions, try to
18417// convert illegal vector sizes to legal ones to avoid expansion.
18418 static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget,
18419 SelectionDAG &DAG) {
18420 assert(Subtarget.hasAVX512() &&
18421 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18422
18423 SDLoc DL(Op);
18424 SDValue Vec = Op.getOperand(0);
18425 SDValue Mask = Op.getOperand(1);
18426 SDValue Passthru = Op.getOperand(2);
18427
18428 EVT VecVT = Vec.getValueType();
18429 EVT ElementVT = VecVT.getVectorElementType();
18430 unsigned NumElements = VecVT.getVectorNumElements();
18431 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18432 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18433
18434 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18435 // compressed as 512-bit vectors in AVX512F.
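// For example, a v4i32 compress is widened to a v16i32 compress whose upper mask
// bits are zero, and the low 128 bits of the 512-bit result are extracted afterwards.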
18436 if (NumVecBits != 128 && NumVecBits != 256)
18437 return SDValue();
18438
18439 if (NumElementBits == 32 || NumElementBits == 64) {
18440 unsigned NumLargeElements = 512 / NumElementBits;
18441 MVT LargeVecVT =
18442 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18443 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18444
18445 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18446 DAG, DL);
18447 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18448 Subtarget, DAG, DL);
18449 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18450 : widenSubVector(LargeVecVT, Passthru,
18451 /*ZeroNewElements=*/false,
18452 Subtarget, DAG, DL);
18453
18454 SDValue Compressed =
18455 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18456 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18457 DAG.getConstant(0, DL, MVT::i64));
18458 }
18459
18460 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18461 VecVT == MVT::v16i16) {
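// Any-extend small i8/i16 elements to an element type whose compress is legal
// (e.g. v16i8 -> v16i32, v8i16 -> v8i64), compress there, then truncate back.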
18462 MVT LargeElementVT = MVT::getIntegerVT(512 / NumElements);
18463 EVT LargeVecVT = MVT::getVectorVT(LargeElementVT, NumElements);
18464
18465 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18466 Passthru = Passthru.isUndef()
18467 ? DAG.getUNDEF(LargeVecVT)
18468 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18469
18470 SDValue Compressed =
18471 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18472 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18473 }
18474
18475 return SDValue();
18476}
18477
18478/// Try to lower a VSELECT instruction to a vector shuffle.
18479 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18480 const X86Subtarget &Subtarget,
18481 SelectionDAG &DAG) {
18482 SDValue Cond = Op.getOperand(0);
18483 SDValue LHS = Op.getOperand(1);
18484 SDValue RHS = Op.getOperand(2);
18485 MVT VT = Op.getSimpleValueType();
18486
18487 // Only non-legal VSELECTs reach this lowering, convert those into generic
18488 // shuffles and re-use the shuffle lowering path for blends.
18489 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18490 SmallVector<int, 32> Mask;
18491 if (createShuffleMaskFromVSELECT(Mask, Cond))
18492 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18493 }
18494
18495 return SDValue();
18496}
18497
18498SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18499 SDValue Cond = Op.getOperand(0);
18500 SDValue LHS = Op.getOperand(1);
18501 SDValue RHS = Op.getOperand(2);
18502
18503 SDLoc dl(Op);
18504 MVT VT = Op.getSimpleValueType();
18505 if (isSoftF16(VT, Subtarget)) {
18506 MVT NVT = VT.changeVectorElementTypeToInteger();
18507 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18508 DAG.getBitcast(NVT, LHS),
18509 DAG.getBitcast(NVT, RHS)));
18510 }
18511
18512 // A vselect where all conditions and data are constants can be optimized into
18513 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18514 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18515 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18516 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18517 return SDValue();
18518
18519 // Try to lower this to a blend-style vector shuffle. This can handle all
18520 // constant condition cases.
18521 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18522 return BlendOp;
18523
18524 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18525 // with patterns on the mask registers on AVX-512.
18526 MVT CondVT = Cond.getSimpleValueType();
18527 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18528 if (CondEltSize == 1)
18529 return Op;
18530
18531 // Variable blends are only legal from SSE4.1 onward.
18532 if (!Subtarget.hasSSE41())
18533 return SDValue();
18534
18535 unsigned EltSize = VT.getScalarSizeInBits();
18536 unsigned NumElts = VT.getVectorNumElements();
18537
18538 // Expand v32i16/v64i8 without BWI.
18539 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18540 return SDValue();
18541
18542 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18543 // into an i1 condition so that we can use the mask-based 512-bit blend
18544 // instructions.
18545 if (VT.getSizeInBits() == 512) {
18546 // Build a mask by testing the condition against zero.
18547 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18548 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18549 DAG.getConstant(0, dl, CondVT),
18550 ISD::SETNE);
18551 // Now return a new VSELECT using the mask.
18552 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18553 }
18554
18555 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18556 if (CondEltSize != EltSize) {
18557 // If we don't have a sign splat, rely on the expansion.
18558 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18559 return SDValue();
18560
18561 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18562 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18563 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18564 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18565 }
18566
18567 // v16i16/v32i8 selects without AVX2, if the condition and another operand
18568 // are free to split, then better to split before expanding the
18569 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18570 // TODO: This is very similar to narrowVectorSelect.
18571 // TODO: Add Load splitting to isFreeToSplitVector ?
18572 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18573 !Subtarget.hasXOP()) {
18574 bool FreeCond = isFreeToSplitVector(Cond, DAG);
18575 bool FreeLHS = isFreeToSplitVector(LHS, DAG) ||
18576 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18577 bool FreeRHS = isFreeToSplitVector(RHS, DAG) ||
18578 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18579 if (FreeCond && (FreeLHS || FreeRHS))
18580 return splitVectorOp(Op, DAG, dl);
18581 }
18582
18583 // Only some types will be legal on some subtargets. If we can emit a legal
18584 // VSELECT-matching blend, return Op, but if we need to expand, return
18585 // a null value.
18586 switch (VT.SimpleTy) {
18587 default:
18588 // Most of the vector types have blends past SSE4.1.
18589 return Op;
18590
18591 case MVT::v32i8:
18592 // The byte blends for AVX vectors were introduced only in AVX2.
18593 if (Subtarget.hasAVX2())
18594 return Op;
18595
18596 return SDValue();
18597
18598 case MVT::v8i16:
18599 case MVT::v16i16:
18600 case MVT::v8f16:
18601 case MVT::v16f16: {
18602 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18603 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18604 Cond = DAG.getBitcast(CastVT, Cond);
18605 LHS = DAG.getBitcast(CastVT, LHS);
18606 RHS = DAG.getBitcast(CastVT, RHS);
18607 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18608 return DAG.getBitcast(VT, Select);
18609 }
18610 }
18611}
18612
18613 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18614 MVT VT = Op.getSimpleValueType();
18615 SDValue Vec = Op.getOperand(0);
18616 SDValue Idx = Op.getOperand(1);
18617 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18618 SDLoc dl(Op);
18619
18620 if (!Vec.getSimpleValueType().is128BitVector())
18621 return SDValue();
18622
18623 if (VT.getSizeInBits() == 8) {
18624 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18625 // we're going to zero extend the register or fold the store.
18626 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
18627 !X86::mayFoldIntoStore(Op))
18628 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18629 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18630 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18631
18632 unsigned IdxVal = Idx->getAsZExtVal();
18633 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18634 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18635 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18636 }
18637
18638 if (VT == MVT::f32) {
18639 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18640 // the result back to FR32 register. It's only worth matching if the
18641 // result has a single use which is a store or a bitcast to i32. And in
18642 // the case of a store, it's not worth it if the index is a constant 0,
18643 // because a MOVSSmr can be used instead, which is smaller and faster.
18644 if (!Op.hasOneUse())
18645 return SDValue();
18646 SDNode *User = *Op.getNode()->user_begin();
18647 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18648 (User->getOpcode() != ISD::BITCAST ||
18649 User->getValueType(0) != MVT::i32))
18650 return SDValue();
18651 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18652 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18653 return DAG.getBitcast(MVT::f32, Extract);
18654 }
18655
18656 if (VT == MVT::i32 || VT == MVT::i64)
18657 return Op;
18658
18659 return SDValue();
18660}
18661
18662/// Extract one bit from mask vector, like v16i1 or v8i1.
18663/// AVX-512 feature.
18664 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18665 const X86Subtarget &Subtarget) {
18666 SDValue Vec = Op.getOperand(0);
18667 SDLoc dl(Vec);
18668 MVT VecVT = Vec.getSimpleValueType();
18669 SDValue Idx = Op.getOperand(1);
18670 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18671 MVT EltVT = Op.getSimpleValueType();
18672
18673 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18674 "Unexpected vector type in ExtractBitFromMaskVector");
18675
18676 // variable index can't be handled in mask registers,
18677 // extend vector to VR512/128
18678 if (!IdxC) {
18679 unsigned NumElts = VecVT.getVectorNumElements();
18680 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
18681 // than extending to 128/256-bit.
18682 if (NumElts == 1) {
18683 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18684 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
18685 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18686 }
18687 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18688 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18689 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18690 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18691 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18692 }
18693
18694 unsigned IdxVal = IdxC->getZExtValue();
18695 if (IdxVal == 0) // the operation is legal
18696 return Op;
18697
18698 // Extend to natively supported kshift.
18699 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18700
18701 // Use kshiftr instruction to move to the lower element.
18702 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18703 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18704
18705 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18706 DAG.getVectorIdxConstant(0, dl));
18707}
18708
18709// Helper to find all the extracted elements from a vector.
18710 static APInt getExtractedDemandedElts(SDNode *N) {
18711 MVT VT = N->getSimpleValueType(0);
18712 unsigned NumElts = VT.getVectorNumElements();
18713 APInt DemandedElts = APInt::getZero(NumElts);
18714 for (SDNode *User : N->users()) {
18715 switch (User->getOpcode()) {
18716 case X86ISD::PEXTRB:
18717 case X86ISD::PEXTRW:
18718 case ISD::EXTRACT_VECTOR_ELT:
18719 if (!isa<ConstantSDNode>(User->getOperand(1))) {
18720 DemandedElts.setAllBits();
18721 return DemandedElts;
18722 }
18723 DemandedElts.setBit(User->getConstantOperandVal(1));
18724 break;
18725 case ISD::BITCAST: {
18726 if (!User->getValueType(0).isSimple() ||
18727 !User->getValueType(0).isVector()) {
18728 DemandedElts.setAllBits();
18729 return DemandedElts;
18730 }
18731 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18732 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18733 break;
18734 }
18735 default:
18736 DemandedElts.setAllBits();
18737 return DemandedElts;
18738 }
18739 }
18740 return DemandedElts;
18741}
18742
18743SDValue
18744X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18745 SelectionDAG &DAG) const {
18746 SDLoc dl(Op);
18747 SDValue Vec = Op.getOperand(0);
18748 MVT VecVT = Vec.getSimpleValueType();
18749 SDValue Idx = Op.getOperand(1);
18750 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18751
18752 if (VecVT.getVectorElementType() == MVT::i1)
18753 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18754
18755 if (!IdxC) {
18756 // It's more profitable to go through memory (1 cycle throughput)
18757 // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput)
18758 // IACA tool was used to get performance estimation
18759 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18760 //
18761 // example : extractelement <16 x i8> %a, i32 %i
18762 //
18763 // Block Throughput: 3.00 Cycles
18764 // Throughput Bottleneck: Port5
18765 //
18766 // | Num Of | Ports pressure in cycles | |
18767 // | Uops | 0 - DV | 5 | 6 | 7 | |
18768 // ---------------------------------------------
18769 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18770 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18771 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18772 // Total Num Of Uops: 4
18773 //
18774 //
18775 // Block Throughput: 1.00 Cycles
18776 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18777 //
18778 // | | Ports pressure in cycles | |
18779 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18780 // ---------------------------------------------------------
18781 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18782 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18783 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18784 // Total Num Of Uops: 4
18785
18786 return SDValue();
18787 }
18788
18789 unsigned IdxVal = IdxC->getZExtValue();
18790
18791 // If this is a 256-bit vector result, first extract the 128-bit vector and
18792 // then extract the element from the 128-bit vector.
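// For example, extracting element 9 of a v16i16 extracts the upper 128-bit half
// and then extracts element 9 & 7 == 1 from the resulting v8i16.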
18793 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18794 // Get the 128-bit vector.
18795 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18796 MVT EltVT = VecVT.getVectorElementType();
18797
18798 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18799 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18800
18801 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18802 // this can be done with a mask.
18803 IdxVal &= ElemsPerChunk - 1;
18804 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18805 DAG.getVectorIdxConstant(IdxVal, dl));
18806 }
18807
18808 assert(VecVT.is128BitVector() && "Unexpected vector length");
18809
18810 MVT VT = Op.getSimpleValueType();
18811
18812 if (VT == MVT::i16) {
18813 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18814 // we're going to zero extend the register or fold the store (SSE41 only).
18815 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18816 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18817 if (Subtarget.hasFP16())
18818 return Op;
18819
18820 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18821 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18822 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18823 }
18824
18825 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18826 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18827 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18828 }
18829
18830 if (Subtarget.hasSSE41())
18831 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18832 return Res;
18833
18834 // Only extract a single element from a v16i8 source - determine the common
18835 // DWORD/WORD that all extractions share, and extract the sub-byte.
18836 // TODO: Add QWORD MOVQ extraction?
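// For example, if only bytes 4 and 5 of a v16i8 are ever extracted, byte 5 can be
// taken from word 2 of the v8i16 bitcast and shifted right by 8 before truncating.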
18837 if (VT == MVT::i8) {
18838 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18839 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18840
18841 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18842 int DWordIdx = IdxVal / 4;
18843 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18844 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18845 DAG.getBitcast(MVT::v4i32, Vec),
18846 DAG.getVectorIdxConstant(DWordIdx, dl));
18847 int ShiftVal = (IdxVal % 4) * 8;
18848 if (ShiftVal != 0)
18849 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18850 DAG.getConstant(ShiftVal, dl, MVT::i8));
18851 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18852 }
18853
18854 int WordIdx = IdxVal / 2;
18855 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18856 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18857 DAG.getBitcast(MVT::v8i16, Vec),
18858 DAG.getVectorIdxConstant(WordIdx, dl));
18859 int ShiftVal = (IdxVal % 2) * 8;
18860 if (ShiftVal != 0)
18861 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18862 DAG.getConstant(ShiftVal, dl, MVT::i8));
18863 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18864 }
18865 }
18866
18867 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18868 if (IdxVal == 0)
18869 return Op;
18870
18871 // Shuffle the element to the lowest element, then movss or movsh.
18872 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18873 Mask[0] = static_cast<int>(IdxVal);
18874 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18875 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18876 DAG.getVectorIdxConstant(0, dl));
18877 }
18878
18879 if (VT.getSizeInBits() == 64) {
18880 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18881 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18882 // to match extract_elt for f64.
18883 if (IdxVal == 0)
18884 return Op;
18885
18886 // UNPCKHPD the element to the lowest double word, then movsd.
18887 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18888 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18889 int Mask[2] = { 1, -1 };
18890 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18891 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18892 DAG.getVectorIdxConstant(0, dl));
18893 }
18894
18895 return SDValue();
18896}
18897
18898/// Insert one bit to mask vector, like v16i1 or v8i1.
18899/// AVX-512 feature.
18900 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18901 const X86Subtarget &Subtarget) {
18902 SDLoc dl(Op);
18903 SDValue Vec = Op.getOperand(0);
18904 SDValue Elt = Op.getOperand(1);
18905 SDValue Idx = Op.getOperand(2);
18906 MVT VecVT = Vec.getSimpleValueType();
18907
18908 if (!isa<ConstantSDNode>(Idx)) {
18909 // Non constant index. Extend source and destination,
18910 // insert element and then truncate the result.
18911 unsigned NumElts = VecVT.getVectorNumElements();
18912 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18913 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18914 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18915 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18916 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18917 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18918 }
18919
18920 // Copy into a k-register, extract to v1i1 and insert_subvector.
18921 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18922 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18923}
18924
18925SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18926 SelectionDAG &DAG) const {
18927 MVT VT = Op.getSimpleValueType();
18928 MVT EltVT = VT.getVectorElementType();
18929 unsigned NumElts = VT.getVectorNumElements();
18930 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18931
18932 if (EltVT == MVT::i1)
18933 return InsertBitToMaskVector(Op, DAG, Subtarget);
18934
18935 SDLoc dl(Op);
18936 SDValue N0 = Op.getOperand(0);
18937 SDValue N1 = Op.getOperand(1);
18938 SDValue N2 = Op.getOperand(2);
18939 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18940
18941 if (EltVT == MVT::bf16) {
18942 MVT IVT = VT.changeVectorElementTypeToInteger();
18943 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18944 DAG.getBitcast(IVT, N0),
18945 DAG.getBitcast(MVT::i16, N1), N2);
18946 return DAG.getBitcast(VT, Res);
18947 }
18948
18949 if (!N2C) {
18950 // Variable insertion indices, usually we're better off spilling to stack,
18951 // but AVX512 can use a variable compare+select by comparing against all
18952 // possible vector indices, and FP insertion has less gpr->simd traffic.
18953 if (!(Subtarget.hasBWI() ||
18954 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18955 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18956 return SDValue();
18957
18958 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18959 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18960 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18961 return SDValue();
18962
18963 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18964 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18965 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18966
18967 SmallVector<SDValue, 16> RawIndices;
18968 for (unsigned I = 0; I != NumElts; ++I)
18969 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18970 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18971
18972 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18973 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18974 ISD::CondCode::SETEQ);
18975 }
18976
18977 if (N2C->getAPIntValue().uge(NumElts))
18978 return SDValue();
18979 uint64_t IdxVal = N2C->getZExtValue();
18980
18981 bool IsZeroElt = X86::isZeroNode(N1);
18982 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18983
18984 if (IsZeroElt || IsAllOnesElt) {
18985 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18986 // We don't deal with i8 0 since it appears to be handled elsewhere.
18987 if (IsAllOnesElt &&
18988 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18989 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18990 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18991 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18992 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18993 CstVectorElts[IdxVal] = OnesCst;
18994 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18995 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18996 }
18997 // See if we can do this more efficiently with a blend shuffle with a
18998 // rematerializable vector.
18999 if (Subtarget.hasSSE41() &&
19000 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
19001 SmallVector<int, 8> BlendMask;
19002 for (unsigned i = 0; i != NumElts; ++i)
19003 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19004 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19005 : getOnesVector(VT, DAG, dl);
19006 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19007 }
19008 }
19009
19010 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19011 // into that, and then insert the subvector back into the result.
19012 if (VT.is256BitVector() || VT.is512BitVector()) {
19013 // With a 256-bit vector, we can insert into the zero element efficiently
19014 // using a blend if we have AVX or AVX2 and the right data type.
19015 if (VT.is256BitVector() && IdxVal == 0) {
19016 // TODO: It is worthwhile to cast integer to floating point and back
19017 // and incur a domain crossing penalty if that's what we'll end up
19018 // doing anyway after extracting to a 128-bit vector.
19019 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19020 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
19021 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19022 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19023 DAG.getTargetConstant(1, dl, MVT::i8));
19024 }
19025 }
19026
19027 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19028 assert(isPowerOf2_32(NumEltsIn128) &&
19029 "Vectors will always have power-of-two number of elements.");
19030
19031 // If we are not inserting into the low 128-bit vector chunk,
19032 // then prefer the broadcast+blend sequence.
19033 // FIXME: relax the profitability check iff all N1 uses are insertions.
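// For example, inserting element 9 of a v16i16 splats N1 and blends with the mask
// <0..8, 25, 10..15>, so only lane 9 is taken from the splat.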
19034 if (IdxVal >= NumEltsIn128 &&
19035 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19036 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19037 X86::mayFoldLoad(N1, Subtarget)))) {
19038 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19039 SmallVector<int, 8> BlendMask;
19040 for (unsigned i = 0; i != NumElts; ++i)
19041 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19042 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19043 }
19044
19045 // Get the desired 128-bit vector chunk.
19046 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19047
19048 // Insert the element into the desired chunk.
19049 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19050 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19051
19052 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19053 DAG.getVectorIdxConstant(IdxIn128, dl));
19054
19055 // Insert the changed part back into the bigger vector
19056 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19057 }
19058 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19059
19060 // This will be just movw/movd/movq/movsh/movss/movsd.
19061 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19062 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19063 EltVT == MVT::f16 || EltVT == MVT::i64) {
19064 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19065 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19066 }
19067
19068 // We can't directly insert an i8 or i16 into a vector, so zero extend
19069 // it to i32 first.
19070 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19071 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19072 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19073 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19074 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19075 return DAG.getBitcast(VT, N1);
19076 }
19077 }
19078
19079 // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
19080 // argument. SSE41 required for pinsrb.
19081 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19082 unsigned Opc;
19083 if (VT == MVT::v8i16) {
19084 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19085 Opc = X86ISD::PINSRW;
19086 } else {
19087 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19088 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19089 Opc = X86ISD::PINSRB;
19090 }
19091
19092 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19093 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19094 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19095 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19096 }
19097
19098 if (Subtarget.hasSSE41()) {
19099 if (EltVT == MVT::f32) {
19100 // Bits [7:6] of the constant are the source select. This will always be
19101 // zero here. The DAG Combiner may combine an extract_elt index into
19102 // these bits. For example (insert (extract, 3), 2) could be matched by
19103 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19104 // Bits [5:4] of the constant are the destination select. This is the
19105 // value of the incoming immediate.
19106 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19107 // combine either bitwise AND or insert of float 0.0 to set these bits.
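// For example, inserting into lane 2 (from lane 0 of the scalar source, with no
// zero mask) uses the immediate (2 << 4) == 0x20.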
19108
19109 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19110 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19111 // If this is an insertion of 32-bits into the low 32-bits of
19112 // a vector, we prefer to generate a blend with immediate rather
19113 // than an insertps. Blends are simpler operations in hardware and so
19114 // will always have equal or better performance than insertps.
19115 // But if optimizing for size and there's a load folding opportunity,
19116 // generate insertps because blendps does not have a 32-bit memory
19117 // operand form.
19118 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19119 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19120 DAG.getTargetConstant(1, dl, MVT::i8));
19121 }
19122 // Create this as a scalar to vector.
19123 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19124 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19125 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19126 }
19127
19128 // PINSR* works with constant index.
19129 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19130 return Op;
19131 }
19132
19133 return SDValue();
19134}
19135
19136 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19137 SelectionDAG &DAG) {
19138 SDLoc dl(Op);
19139 MVT OpVT = Op.getSimpleValueType();
19140
19141 // It's always cheaper to replace a xor+movd with xorps and simplifies further
19142 // combines.
19143 if (X86::isZeroNode(Op.getOperand(0)))
19144 return getZeroVector(OpVT, Subtarget, DAG, dl);
19145
19146 // If this is a 256-bit vector result, first insert into a 128-bit
19147 // vector and then insert into the 256-bit vector.
19148 if (!OpVT.is128BitVector()) {
19149 // Insert into a 128-bit vector.
19150 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19151 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19152 OpVT.getVectorNumElements() / SizeFactor);
19153
19154 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19155
19156 // Insert the 128-bit vector.
19157 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19158 }
19159 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19160 "Expected an SSE type!");
19161
19162 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
19163 // tblgen.
19164 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19165 return Op;
19166
19167 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19168 return DAG.getBitcast(
19169 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19170}
19171
19172// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19173// simple superregister reference or explicit instructions to insert
19174// the upper bits of a vector.
19175 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19176 SelectionDAG &DAG) {
19177 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19178
19179 return insert1BitVector(Op, DAG, Subtarget);
19180}
19181
19182 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19183 SelectionDAG &DAG) {
19184 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19185 "Only vXi1 extract_subvectors need custom lowering");
19186
19187 SDLoc dl(Op);
19188 SDValue Vec = Op.getOperand(0);
19189 uint64_t IdxVal = Op.getConstantOperandVal(1);
19190
19191 if (IdxVal == 0) // the operation is legal
19192 return Op;
19193
19194 // Extend to natively supported kshift.
19195 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19196
19197 // Shift to the LSB.
19198 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
19199 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19200
19201 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19202 DAG.getVectorIdxConstant(0, dl));
19203}
19204
19205// Returns the appropriate wrapper opcode for a global reference.
19206unsigned X86TargetLowering::getGlobalWrapperKind(
19207 const GlobalValue *GV, const unsigned char OpFlags) const {
19208 // References to absolute symbols are never PC-relative.
19209 if (GV && GV->isAbsoluteSymbolRef())
19210 return X86ISD::Wrapper;
19211
19212 // The following OpFlags under RIP-rel PIC use RIP.
19213 if (Subtarget.isPICStyleRIPRel() &&
19214 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
19215 OpFlags == X86II::MO_DLLIMPORT))
19216 return X86ISD::WrapperRIP;
19217
19218 // GOTPCREL references must always use RIP.
19219 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19220 return X86ISD::WrapperRIP;
19221
19222 return X86ISD::Wrapper;
19223}
19224
19225// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19226// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
19227// one of the above mentioned nodes. It has to be wrapped because otherwise
19228// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19229// be used to form addressing mode. These wrapped nodes will be selected
19230// into MOV32ri.
19231SDValue
19232X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19233 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19234
19235 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19236 // global base reg.
19237 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19238
19239 auto PtrVT = getPointerTy(DAG.getDataLayout());
19240 SDValue Result = DAG.getTargetConstantPool(
19241 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19242 SDLoc DL(CP);
19243 Result =
19244 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19245 // With PIC, the address is actually $g + Offset.
19246 if (OpFlag) {
19247 Result =
19248 DAG.getNode(ISD::ADD, DL, PtrVT,
19249 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19250 }
19251
19252 return Result;
19253}
19254
19255SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19256 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19257
19258 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19259 // global base reg.
19260 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19261
19262 EVT PtrVT = Op.getValueType();
19263 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19264 SDLoc DL(JT);
19265 Result =
19266 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19267
19268 // With PIC, the address is actually $g + Offset.
19269 if (OpFlag)
19270 Result =
19271 DAG.getNode(ISD::ADD, DL, PtrVT,
19272 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19273
19274 return Result;
19275}
19276
19277SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19278 SelectionDAG &DAG) const {
19279 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19280}
19281
19282SDValue
19283X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19284 // Create the TargetBlockAddressAddress node.
19285 unsigned char OpFlags =
19286 Subtarget.classifyBlockAddressReference();
19287 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19288 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19289 SDLoc dl(Op);
19290 EVT PtrVT = Op.getValueType();
19291 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19292 Result =
19293 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
19294
19295 // With PIC, the address is actually $g + Offset.
19296 if (isGlobalRelativeToPICBase(OpFlags)) {
19297 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19298 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19299 }
19300
19301 return Result;
19302}
19303
19304/// Creates target global address or external symbol nodes for calls or
19305/// other uses.
19306SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19307 bool ForCall,
19308 bool *IsImpCall) const {
19309 // Unpack the global address or external symbol.
19310 SDLoc dl(Op);
19311 const GlobalValue *GV = nullptr;
19312 int64_t Offset = 0;
19313 const char *ExternalSym = nullptr;
19314 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19315 GV = G->getGlobal();
19316 Offset = G->getOffset();
19317 } else {
19318 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19319 ExternalSym = ES->getSymbol();
19320 }
19321
19322 // Calculate some flags for address lowering.
19323 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19324 unsigned char OpFlags;
19325 if (ForCall)
19326 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19327 else
19328 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19329 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19330 bool NeedsLoad = isGlobalStubReference(OpFlags);
19331
19332 CodeModel::Model M = DAG.getTarget().getCodeModel();
19333 EVT PtrVT = Op.getValueType();
19334 SDValue Result;
19335
19336 if (GV) {
19337 // Create a target global address if this is a global. If possible, fold the
19338 // offset into the global address reference. Otherwise, ADD it on later.
19339 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19340 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19341 // relocation will compute to a negative value, which is invalid.
19342 int64_t GlobalOffset = 0;
19343 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19344 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
19345 std::swap(GlobalOffset, Offset);
19346 }
19347 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19348 } else {
19349 // If this is not a global address, this must be an external symbol.
19350 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19351 }
19352
19353 // If this is a direct call, avoid the wrapper if we don't need to do any
19354 // loads or adds. This allows SDAG ISel to match direct calls.
19355 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19356 return Result;
19357
19358 // If Import Call Optimization is enabled and this is an imported function
19359 // then make a note of it and return the global address without wrapping.
19360 if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
19361 Mod.getModuleFlag("import-call-optimization")) {
19362 assert(ForCall && "Should only enable import call optimization if we are "
19363 "lowering a call");
19364 *IsImpCall = true;
19365 return Result;
19366 }
19367
19368 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19369
19370 // With PIC, the address is actually $g + Offset.
19371 if (HasPICReg) {
19372 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19373 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19374 }
19375
19376 // For globals that require a load from a stub to get the address, emit the
19377 // load.
19378 if (NeedsLoad)
19379 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19380 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19381
19382 // If there was a non-zero offset that we didn't fold, create an explicit
19383 // addition for it.
19384 if (Offset != 0)
19385 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19386 DAG.getSignedConstant(Offset, dl, PtrVT));
19387
19388 return Result;
19389}
19390
19391SDValue
19392X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19393 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19394}
19395
19396 static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
19397 const EVT PtrVT, unsigned ReturnReg,
19398 unsigned char OperandFlags,
19399 bool LoadGlobalBaseReg = false,
19400 bool LocalDynamic = false) {
19401 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19402 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19403 SDLoc dl(GA);
19404 SDValue TGA;
19405 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19406 SDValue Chain = DAG.getEntryNode();
19407 SDValue Ret;
19408 if (LocalDynamic && UseTLSDESC) {
19409 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19410 // Reuse existing GetTLSADDR node if we can find it.
19411 if (TGA->hasOneUse()) {
19412 // TLSDESC uses TGA.
19413 SDNode *TLSDescOp = *TGA->user_begin();
19414 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19415 "Unexpected TLSDESC DAG");
19416 // CALLSEQ_END uses TGA via a chain and glue.
19417 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19418 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19419 "Unexpected TLSDESC DAG");
19420 // CopyFromReg uses CALLSEQ_END via a chain and glue.
19421 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19422 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19423 "Unexpected TLSDESC DAG");
19424 Ret = SDValue(CopyFromRegOp, 0);
19425 }
19426 } else {
19427 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19428 GA->getOffset(), OperandFlags);
19429 }
19430
19431 if (!Ret) {
19432 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
19433 : LocalDynamic ? X86ISD::TLSBASEADDR
19434 : X86ISD::TLSADDR;
19435
19436 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19437 if (LoadGlobalBaseReg) {
19438 SDValue InGlue;
19439 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19440 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19441 InGlue);
19442 InGlue = Chain.getValue(1);
19443 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19444 } else {
19445 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19446 }
19447 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19448
19449 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
19450 MFI.setHasCalls(true);
19451
19452 SDValue Glue = Chain.getValue(1);
19453 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19454 }
19455
19456 if (!UseTLSDESC)
19457 return Ret;
19458
19459 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19460 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
19461
19462 Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg));
19463 SDValue Offset =
19464 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19465 MachinePointerInfo(Ptr));
19466 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19467}
19468
19469// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19470static SDValue
19471 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19472 const EVT PtrVT) {
19473 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19474 /*LoadGlobalBaseReg=*/true);
19475}
19476
19477// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19478static SDValue
19479 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19480 const EVT PtrVT) {
19481 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19482}
19483
19484// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19485static SDValue
19486 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19487 const EVT PtrVT) {
19488 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19489}
19490
19491 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19492 SelectionDAG &DAG, const EVT PtrVT,
19493 bool Is64Bit, bool Is64BitLP64) {
19494 SDLoc dl(GA);
19495
19496 // Get the start address of the TLS block for this module.
19497 X86MachineFunctionInfo *MFI =
19498 DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
19499 MFI->incNumLocalDynamicTLSAccesses();
19500
19501 SDValue Base;
19502 if (Is64Bit) {
19503 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19504 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19505 /*LoadGlobalBaseReg=*/false,
19506 /*LocalDynamic=*/true);
19507 } else {
19508 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19509 /*LoadGlobalBaseReg=*/true,
19510 /*LocalDynamic=*/true);
19511 }
19512
19513 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19514 // of Base.
19515
19516 // Build x@dtpoff.
19517 unsigned char OperandFlags = X86II::MO_DTPOFF;
19518 unsigned WrapperKind = X86ISD::Wrapper;
19519 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19520 GA->getValueType(0),
19521 GA->getOffset(), OperandFlags);
19522 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19523
19524 // Add x@dtpoff with the base.
19525 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19526}
19527
19528// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19529 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19530 const EVT PtrVT, TLSModel::Model model,
19531 bool is64Bit, bool isPIC) {
19532 SDLoc dl(GA);
19533
19534 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19535 Value *Ptr = Constant::getNullValue(
19536 PointerType::get(*DAG.getContext(), is64Bit ? X86AS::FS : X86AS::GS));
19537
19538 SDValue ThreadPointer =
19539 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19540 MachinePointerInfo(Ptr));
19541
19542 unsigned char OperandFlags = 0;
19543 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19544 // initialexec.
19545 unsigned WrapperKind = X86ISD::Wrapper;
19546 if (model == TLSModel::LocalExec) {
19547 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19548 } else if (model == TLSModel::InitialExec) {
19549 if (is64Bit) {
19550 OperandFlags = X86II::MO_GOTTPOFF;
19551 WrapperKind = X86ISD::WrapperRIP;
19552 } else {
19553 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19554 }
19555 } else {
19556 llvm_unreachable("Unexpected model");
19557 }
19558
19559 // emit "addl x@ntpoff,%eax" (local exec)
19560 // or "addl x@indntpoff,%eax" (initial exec)
19561 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19562 SDValue TGA =
19563 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19564 GA->getOffset(), OperandFlags);
19565 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19566
19567 if (model == TLSModel::InitialExec) {
19568 if (isPIC && !is64Bit) {
19569 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19570 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19571 Offset);
19572 }
19573
19574 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19575 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19576 }
19577
19578 // The address of the thread local variable is the add of the thread
19579 // pointer with the offset of the variable.
19580 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19581}
19582
19583SDValue
19584X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19585
19586 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19587
19588 if (DAG.getTarget().useEmulatedTLS())
19589 return LowerToTLSEmulatedModel(GA, DAG);
19590
19591 const GlobalValue *GV = GA->getGlobal();
19592 EVT PtrVT = Op.getValueType();
19593 bool PositionIndependent = isPositionIndependent();
19594
19595 if (Subtarget.isTargetELF()) {
19596 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19597 switch (model) {
19598 case TLSModel::GeneralDynamic:
19599 if (Subtarget.is64Bit()) {
19600 if (Subtarget.isTarget64BitLP64())
19601 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19602 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19603 }
19604 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19605 case TLSModel::LocalDynamic:
19606 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19607 Subtarget.isTarget64BitLP64());
19608 case TLSModel::InitialExec:
19609 case TLSModel::LocalExec:
19610 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19611 PositionIndependent);
19612 }
19613 llvm_unreachable("Unknown TLS model.");
19614 }
19615
19616 if (Subtarget.isTargetDarwin()) {
19617 // Darwin only has one model of TLS. Lower to that.
19618 unsigned char OpFlag = 0;
19619 unsigned WrapperKind = 0;
19620
19621 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19622 // global base reg.
19623 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19624 if (PIC32) {
19625 OpFlag = X86II::MO_TLVP_PIC_BASE;
19626 WrapperKind = X86ISD::Wrapper;
19627 } else {
19628 OpFlag = X86II::MO_TLVP;
19629 WrapperKind = X86ISD::WrapperRIP;
19630 }
19631 SDLoc DL(Op);
19632 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19633 GA->getValueType(0),
19634 GA->getOffset(), OpFlag);
19635 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19636
19637 // With PIC32, the address is actually $g + Offset.
19638 if (PIC32)
19639 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19640 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19641 Offset);
19642
19643 // Lowering the machine isd will make sure everything is in the right
19644 // location.
19645 SDValue Chain = DAG.getEntryNode();
19646 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19647 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19648 SDValue Args[] = { Chain, Offset };
19649 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19650 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19651
19652 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
19653 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19654 MFI.setAdjustsStack(true);
19655
19656 // And our return value (tls address) is in the standard call return value
19657 // location.
19658 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19659 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19660 }
19661
19662 if (Subtarget.isOSWindows()) {
19663 // Just use the implicit TLS architecture
19664 // Need to generate something similar to:
19665 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19666 // ; from TEB
19667 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19668 // mov rcx, qword [rdx+rcx*8]
19669 // mov eax, .tls$:tlsvar
19670 // [rax+rcx] contains the address
19671 // Windows 64bit: gs:0x58
19672 // Windows 32bit: fs:__tls_array
19673
19674 SDLoc dl(GA);
19675 SDValue Chain = DAG.getEntryNode();
19676
19677 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19678 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19679 // use its literal value of 0x2C.
19680 Value *Ptr = Constant::getNullValue(
19681 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
19682 : PointerType::get(*DAG.getContext(), X86AS::FS));
19683
19684 SDValue TlsArray = Subtarget.is64Bit()
19685 ? DAG.getIntPtrConstant(0x58, dl)
19686 : (Subtarget.isTargetWindowsGNU()
19687 ? DAG.getIntPtrConstant(0x2C, dl)
19688 : DAG.getExternalSymbol("_tls_array", PtrVT));
19689
19690 SDValue ThreadPointer =
19691 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19692
19693 SDValue res;
19694 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19695 res = ThreadPointer;
19696 } else {
19697 // Load the _tls_index variable
19698 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19699 if (Subtarget.is64Bit())
19700 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19701 MachinePointerInfo(), MVT::i32);
19702 else
19703 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19704
19705 const DataLayout &DL = DAG.getDataLayout();
19706 SDValue Scale =
19707 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19708 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19709
19710 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19711 }
19712
19713 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19714
19715 // Get the offset of start of .tls section
19716 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19717 GA->getValueType(0),
19718 X86II::MO_SECREL);
19719 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19720
19721 // The address of the thread local variable is the add of the thread
19722 // pointer with the offset of the variable.
19723 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19724 }
19725
19726 llvm_unreachable("TLS not implemented for this target.");
19727}
19728
19729 bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
19730 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19731 const TargetMachine &TM = getTargetMachine();
19732 TLSModel::Model Model = TM.getTLSModel(&GV);
19733 switch (Model) {
19734 case TLSModel::LocalExec:
19735 case TLSModel::InitialExec:
19736 // We can include the %fs segment register in addressing modes.
19737 return true;
19738 case TLSModel::GeneralDynamic:
19739 case TLSModel::LocalDynamic:
19740 // These models do not result in %fs relative addresses unless
19741 // TLS descriptors are used.
19742 //
19743 // Even in the case of TLS descriptors we currently have no way to model
19744 // the difference between %fs access and the computations needed for the
19745 // offset and returning `true` for TLS-desc currently duplicates both
19746 // which is detrimental :-/
19747 return false;
19748 }
19749 }
19750 return false;
19751}
19752
19753/// Lower SRA_PARTS and friends, which return two i32 values
19754/// and take a 2 x i32 value to shift plus a shift amount.
19755/// TODO: Can this be moved to general expansion code?
19756 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19757 SDValue Lo, Hi;
19758 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19759 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19760}
19761
19762// Try to use a packed vector operation to handle i64 on 32-bit targets when
19763// AVX512DQ is enabled.
19764 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
19765 SelectionDAG &DAG,
19766 const X86Subtarget &Subtarget) {
19767 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19768 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19769 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19770 Op.getOpcode() == ISD::UINT_TO_FP) &&
19771 "Unexpected opcode!");
19772 bool IsStrict = Op->isStrictFPOpcode();
19773 unsigned OpNo = IsStrict ? 1 : 0;
19774 SDValue Src = Op.getOperand(OpNo);
19775 MVT SrcVT = Src.getSimpleValueType();
19776 MVT VT = Op.getSimpleValueType();
19777
19778 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19779 (VT != MVT::f32 && VT != MVT::f64))
19780 return SDValue();
19781
19782 // Pack the i64 into a vector, do the operation and extract.
19783
19784 // Using 256-bit to ensure result is 128-bits for f32 case.
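// For example, without VLX an i64 source is converted as v8i64 -> v8f32/v8f64;
// with VLX a 256-bit v4i64 suffices. Element 0 of the result is extracted either way.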
19785 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19786 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19787 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19788
19789 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19790 if (IsStrict) {
19791 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19792 {Op.getOperand(0), InVec});
19793 SDValue Chain = CvtVec.getValue(1);
19794 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19795 DAG.getVectorIdxConstant(0, dl));
19796 return DAG.getMergeValues({Value, Chain}, dl);
19797 }
19798
19799 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19800
19801 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19802 DAG.getVectorIdxConstant(0, dl));
19803}
19804
19805// Try to use a packed vector operation to handle i64 on 32-bit targets.
19806 static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19807 const X86Subtarget &Subtarget) {
19808 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19809 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19810 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19811 Op.getOpcode() == ISD::UINT_TO_FP) &&
19812 "Unexpected opcode!");
19813 bool IsStrict = Op->isStrictFPOpcode();
19814 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19815 MVT SrcVT = Src.getSimpleValueType();
19816 MVT VT = Op.getSimpleValueType();
19817
19818 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19819 return SDValue();
19820
19821 // Pack the i64 into a vector, do the operation and extract.
19822
19823 assert(Subtarget.hasFP16() && "Expected FP16");
19824
19825 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19826 if (IsStrict) {
19827 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19828 {Op.getOperand(0), InVec});
19829 SDValue Chain = CvtVec.getValue(1);
19830 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19831 DAG.getVectorIdxConstant(0, dl));
19832 return DAG.getMergeValues({Value, Chain}, dl);
19833 }
19834
19835 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19836
19837 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19838 DAG.getVectorIdxConstant(0, dl));
19839}
19840
19841static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19842 const X86Subtarget &Subtarget) {
19843 switch (Opcode) {
19844 case ISD::SINT_TO_FP:
19845 // TODO: Handle wider types with AVX/AVX512.
19846 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19847 return false;
19848 // CVTDQ2PS or (V)CVTDQ2PD
19849 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19850
19851 case ISD::UINT_TO_FP:
19852 // TODO: Handle wider types and i64 elements.
19853 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19854 return false;
19855 // VCVTUDQ2PS or VCVTUDQ2PD
19856 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19857
19858 default:
19859 return false;
19860 }
19861}
19862
19863/// Given a scalar cast operation that is extracted from a vector, try to
19864/// vectorize the cast op followed by extraction. This will avoid an expensive
19865/// round-trip between XMM and GPR.
19866 static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19867 SelectionDAG &DAG,
19868 const X86Subtarget &Subtarget) {
19869 // TODO: This could be enhanced to handle smaller integer types by peeking
19870 // through an extend.
19871 SDValue Extract = Cast.getOperand(0);
19872 MVT DestVT = Cast.getSimpleValueType();
19873 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19874 !isa<ConstantSDNode>(Extract.getOperand(1)))
19875 return SDValue();
19876
19877 // See if we have a 128-bit vector cast op for this type of cast.
19878 SDValue VecOp = Extract.getOperand(0);
19879 MVT FromVT = VecOp.getSimpleValueType();
19880 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19881 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19882 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19883 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19884 return SDValue();
19885
19886 // If we are extracting from a non-zero element, first shuffle the source
19887 // vector to allow extracting from element zero.
19888 if (!isNullConstant(Extract.getOperand(1))) {
19889 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19890 Mask[0] = Extract.getConstantOperandVal(1);
19891 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19892 }
19893 // If the source vector is wider than 128-bits, extract the low part. Do not
19894 // create an unnecessarily wide vector cast op.
19895 if (FromVT != Vec128VT)
19896 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19897
19898 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19899 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19900 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19901 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19902 DAG.getVectorIdxConstant(0, DL));
19903}
19904
19905/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19906/// try to vectorize the cast ops. This will avoid an expensive round-trip
19907/// between XMM and GPR.
19908static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19909 SelectionDAG &DAG,
19910 const X86Subtarget &Subtarget) {
19911 // TODO: Allow FP_TO_UINT.
19912 SDValue CastToInt = CastToFP.getOperand(0);
19913 MVT VT = CastToFP.getSimpleValueType();
19914 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19915 return SDValue();
19916
19917 MVT IntVT = CastToInt.getSimpleValueType();
19918 SDValue X = CastToInt.getOperand(0);
19919 MVT SrcVT = X.getSimpleValueType();
19920 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19921 return SDValue();
19922
19923 // See if we have 128-bit vector cast instructions for this type of cast.
19924 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19925 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19926 IntVT != MVT::i32)
19927 return SDValue();
19928
19929 unsigned SrcSize = SrcVT.getSizeInBits();
19930 unsigned IntSize = IntVT.getSizeInBits();
19931 unsigned VTSize = VT.getSizeInBits();
19932 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19933 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19934 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19935
19936 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19937 unsigned ToIntOpcode =
19938 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19939 unsigned ToFPOpcode =
19940 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19941
19942 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19943 //
19944 // We are not defining the high elements (for example, by zeroing them) because
19945 // that could nullify any performance advantage that we hoped to gain from
19946 // this vector op hack. We do not expect any adverse effects (like denorm
19947 // penalties) with cast ops.
19948 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19949 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19950 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19951 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19952 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19953}
19954
19955static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19956 SelectionDAG &DAG,
19957 const X86Subtarget &Subtarget) {
19958 bool IsStrict = Op->isStrictFPOpcode();
19959 MVT VT = Op->getSimpleValueType(0);
19960 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19961
19962 if (Subtarget.hasDQI()) {
19963 assert(!Subtarget.hasVLX() && "Unexpected features");
19964
19965 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19966 Src.getSimpleValueType() == MVT::v4i64) &&
19967 "Unsupported custom type");
19968
19969 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19970 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19971 "Unexpected VT!");
19972 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19973
19974 // Need to concat with zero vector for strict fp to avoid spurious
19975 // exceptions.
19976 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19977 : DAG.getUNDEF(MVT::v8i64);
19978 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19979 DAG.getVectorIdxConstant(0, DL));
19980 SDValue Res, Chain;
19981 if (IsStrict) {
19982 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19983 {Op->getOperand(0), Src});
19984 Chain = Res.getValue(1);
19985 } else {
19986 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19987 }
19988
19989 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19990 DAG.getVectorIdxConstant(0, DL));
19991
19992 if (IsStrict)
19993 return DAG.getMergeValues({Res, Chain}, DL);
19994 return Res;
19995 }
19996
19997 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19998 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19999 if (VT != MVT::v4f32 || IsSigned)
20000 return SDValue();
20001
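 // Unsigned v4i64 -> v4f32 is emulated per element: inputs with the sign bit
 // set cannot be fed directly to a signed scalar conversion, so they are
 // halved as (Src >> 1) | (Src & 1) (keeping the low bit to preserve correct
 // rounding), converted, and then doubled with an FADD. A vector select picks
 // the direct conversion for non-negative elements and the halve-convert-double
 // result for negative ones.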
20002 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20003 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
20004 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20005 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20006 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20007 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20008 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
20009 SmallVector<SDValue, 4> SignCvts(4);
20010 SmallVector<SDValue, 4> Chains(4);
20011 for (int i = 0; i != 4; ++i) {
20012 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20013 DAG.getVectorIdxConstant(i, DL));
20014 if (IsStrict) {
20015 SignCvts[i] =
20016 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20017 {Op.getOperand(0), Elt});
20018 Chains[i] = SignCvts[i].getValue(1);
20019 } else {
20020 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20021 }
20022 }
20023 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20024
20025 SDValue Slow, Chain;
20026 if (IsStrict) {
20027 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20028 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20029 {Chain, SignCvt, SignCvt});
20030 Chain = Slow.getValue(1);
20031 } else {
20032 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20033 }
20034
20035 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20036 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20037
20038 if (IsStrict)
20039 return DAG.getMergeValues({Cvt, Chain}, DL);
20040
20041 return Cvt;
20042}
20043
20044static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
20045 SelectionDAG &DAG) {
20046 bool IsStrict = Op->isStrictFPOpcode();
20047 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20048 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20049 MVT VT = Op.getSimpleValueType();
20050 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20051
20052 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
20053 if (IsStrict)
20054 return DAG.getNode(
20055 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
20056 {Chain,
20057 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20058 Rnd});
20059 return DAG.getNode(ISD::FP_ROUND, dl, VT,
20060 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
20061}
20062
20063static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
20064 const X86Subtarget &Subtarget) {
20065 if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20066 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
20067 return true;
20068 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
20069 return true;
20070 }
20071 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20072 return true;
20073 if (Subtarget.useAVX512Regs()) {
20074 if (VT == MVT::v16i32)
20075 return true;
20076 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20077 return true;
20078 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20079 return true;
20080 }
20081 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20082 (VT == MVT::v2i64 || VT == MVT::v4i64))
20083 return true;
20084 return false;
20085}
20086
20087SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20088 SelectionDAG &DAG) const {
20089 bool IsStrict = Op->isStrictFPOpcode();
20090 unsigned OpNo = IsStrict ? 1 : 0;
20091 SDValue Src = Op.getOperand(OpNo);
20092 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20093 MVT SrcVT = Src.getSimpleValueType();
20094 MVT VT = Op.getSimpleValueType();
20095 SDLoc dl(Op);
20096
20097 if (isSoftF16(VT, Subtarget))
20098 return promoteXINT_TO_FP(Op, dl, DAG);
20099 else if (isLegalConversion(SrcVT, VT, true, Subtarget))
20100 return Op;
20101
20102 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20103 return LowerWin64_INT128_TO_FP(Op, DAG);
20104
20105 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20106 return Extract;
20107
20108 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20109 return R;
20110
20111 if (SrcVT.isVector()) {
20112 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20113 // Note: Since v2f64 is a legal type, we don't need to zero extend the
20114 // source for strict FP.
20115 if (IsStrict)
20116 return DAG.getNode(
20117 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20118 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20119 DAG.getUNDEF(SrcVT))});
20120 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20121 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20122 DAG.getUNDEF(SrcVT)));
20123 }
20124 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20125 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20126
20127 return SDValue();
20128 }
20129
20130 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20131 "Unknown SINT_TO_FP to lower!");
20132
20133 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20134
20135 // These are really Legal; return the operand so the caller accepts it as
20136 // Legal.
20137 if (SrcVT == MVT::i32 && UseSSEReg)
20138 return Op;
20139 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20140 return Op;
20141
20142 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20143 return V;
20144 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20145 return V;
20146
20147 // SSE doesn't have an i16 conversion so we need to promote.
20148 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20149 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20150 if (IsStrict)
20151 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20152 {Chain, Ext});
20153
20154 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20155 }
20156
20157 if (VT == MVT::f128 || !Subtarget.hasX87())
20158 return SDValue();
20159
20160 SDValue ValueToStore = Src;
20161 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20162 // Bitcasting to f64 here allows us to do a single 64-bit store from
20163 // an SSE register, avoiding the store forwarding penalty that would come
20164 // with two 32-bit stores.
20165 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20166
20167 unsigned Size = SrcVT.getStoreSize();
20168 Align Alignment(Size);
20169 MachineFunction &MF = DAG.getMachineFunction();
20170 auto PtrVT = getPointerTy(MF.getDataLayout());
20171 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20172 MachinePointerInfo MPI =
20173 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20174 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20175 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20176 std::pair<SDValue, SDValue> Tmp =
20177 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20178
20179 if (IsStrict)
20180 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20181
20182 return Tmp.first;
20183}
20184
20185std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20186 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20187 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20188 // Build the FILD
20189 SDVTList Tys;
20190 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20191 if (useSSE)
20192 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20193 else
20194 Tys = DAG.getVTList(DstVT, MVT::Other);
20195
20196 SDValue FILDOps[] = {Chain, Pointer};
20197 SDValue Result =
20198 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20199 Alignment, MachineMemOperand::MOLoad);
20200 Chain = Result.getValue(1);
20201
20202 if (useSSE) {
20203 MachineFunction &MF = DAG.getMachineFunction();
20204 unsigned SSFISize = DstVT.getStoreSize();
20205 int SSFI =
20206 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20207 auto PtrVT = getPointerTy(MF.getDataLayout());
20208 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20209 Tys = DAG.getVTList(MVT::Other);
20210 SDValue FSTOps[] = {Chain, Result, StackSlot};
20211 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
20212 MachinePointerInfo::getFixedStack(MF, SSFI),
20213 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20214
20215 Chain =
20216 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20217 Result = DAG.getLoad(
20218 DstVT, DL, Chain, StackSlot,
20219 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
20220 Chain = Result.getValue(1);
20221 }
20222
20223 return { Result, Chain };
20224}
20225
20226/// Horizontal vector math instructions may be slower than normal math with
20227/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20228/// implementation, and likely shuffle complexity of the alternate sequence.
20229static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20230 const X86Subtarget &Subtarget) {
20231 bool IsOptimizingSize = DAG.shouldOptForSize();
20232 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20233 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20234}
20235
20236/// 64-bit unsigned integer to double expansion.
20237static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
20238 SelectionDAG &DAG,
20239 const X86Subtarget &Subtarget) {
20240 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20241 // when converting 0 while rounding toward negative infinity. The caller will
20242 // fall back to Expand (when i64 is legal) or use FILD in 32-bit mode.
20243 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20244 // This algorithm is not obvious. Here it is what we're trying to output:
20245 /*
20246 movq %rax, %xmm0
20247 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20248 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20249 #ifdef __SSE3__
20250 haddpd %xmm0, %xmm0
20251 #else
20252 pshufd $0x4e, %xmm0, %xmm1
20253 addpd %xmm1, %xmm0
20254 #endif
20255 */
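 // Worked example of the constants above (illustrative): write the u64 input
 // as x = Hi * 2^32 + Lo with 32-bit halves Hi and Lo. After the punpckldq the
 // two doubles have bit patterns (0x43300000, Lo) = 2^52 + Lo and
 // (0x45300000, Hi) = 2^84 + Hi * 2^32. Subtracting c1 = { 2^52, 2^84 } leaves
 // { Lo, Hi * 2^32 }, and the horizontal add yields Lo + Hi * 2^32 == x,
 // rounded once to double precision.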
20256
20257 LLVMContext *Context = DAG.getContext();
20258
20259 // Build some magic constants.
20260 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20261 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20262 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20263 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20264
20266 CV1.push_back(
20267 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20268 APInt(64, 0x4330000000000000ULL))));
20269 CV1.push_back(
20270 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20271 APInt(64, 0x4530000000000000ULL))));
20272 Constant *C1 = ConstantVector::get(CV1);
20273 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20274
20275 // Load the 64-bit value into an XMM register.
20276 SDValue XR1 =
20277 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20278 SDValue CLod0 = DAG.getLoad(
20279 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20280 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20281 SDValue Unpck1 =
20282 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20283
20284 SDValue CLod1 = DAG.getLoad(
20285 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20286 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20287 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20288 // TODO: Are there any fast-math-flags to propagate here?
20289 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20290 SDValue Result;
20291
20292 if (Subtarget.hasSSE3() &&
20293 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20294 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20295 } else {
20296 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20297 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20298 }
20299 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20300 DAG.getVectorIdxConstant(0, dl));
20301 return Result;
20302}
20303
20304/// 32-bit unsigned integer to float expansion.
20305static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
20306 SelectionDAG &DAG,
20307 const X86Subtarget &Subtarget) {
20308 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20309 // FP constant to bias correct the final result.
20310 SDValue Bias = DAG.getConstantFP(
20311 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
20312
20313 // Load the 32-bit value into an XMM register.
20314 SDValue Load =
20315 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20316
20317 // Zero out the upper parts of the register.
20318 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20319
20320 // Or the load with the bias.
20321 SDValue Or = DAG.getNode(
20322 ISD::OR, dl, MVT::v2i64,
20323 DAG.getBitcast(MVT::v2i64, Load),
20324 DAG.getBitcast(MVT::v2i64,
20325 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20326 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20327 DAG.getBitcast(MVT::v2f64, Or),
20328 DAG.getVectorIdxConstant(0, dl));
20329
20330 if (Op.getNode()->isStrictFPOpcode()) {
20331 // Subtract the bias.
20332 // TODO: Are there any fast-math-flags to propagate here?
20333 SDValue Chain = Op.getOperand(0);
20334 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20335 {Chain, Or, Bias});
20336
20337 if (Op.getValueType() == Sub.getValueType())
20338 return Sub;
20339
20340 // Handle final rounding.
20341 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20342 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20343
20344 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20345 }
20346
20347 // Subtract the bias.
20348 // TODO: Are there any fast-math-flags to propagate here?
20349 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20350
20351 // Handle final rounding.
20352 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20353}
20354
20355static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
20356 SelectionDAG &DAG,
20357 const X86Subtarget &Subtarget) {
20358 if (Op.getSimpleValueType() != MVT::v2f64)
20359 return SDValue();
20360
20361 bool IsStrict = Op->isStrictFPOpcode();
20362
20363 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20364 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20365
20366 if (Subtarget.hasAVX512()) {
20367 if (!Subtarget.hasVLX()) {
20368 // Let generic type legalization widen this.
20369 if (!IsStrict)
20370 return SDValue();
20371 // Otherwise pad the integer input with 0s and widen the operation.
20372 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20373 DAG.getConstant(0, DL, MVT::v2i32));
20374 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20375 {Op.getOperand(0), N0});
20376 SDValue Chain = Res.getValue(1);
20377 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20378 DAG.getVectorIdxConstant(0, DL));
20379 return DAG.getMergeValues({Res, Chain}, DL);
20380 }
20381
20382 // Legalize to v4i32 type.
20383 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20384 DAG.getUNDEF(MVT::v2i32));
20385 if (IsStrict)
20386 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20387 {Op.getOperand(0), N0});
20388 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20389 }
20390
20391 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20392 // This gives us the floating point equivalent of 2^52 + the i32 integer
20393 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20394 // point leaving just our i32 integers in double format.
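 // Worked example (illustrative): for the 32-bit input 7, the OR produces the
 // bit pattern of 2^52 + 7 (the integer lands in the low mantissa bits), and
 // subtracting the 2^52 bias leaves exactly 7.0. Both steps are exact because
 // any u32 fits in double's 52-bit mantissa.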
20395 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20396 SDValue VBias = DAG.getConstantFP(
20397 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20398 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20399 DAG.getBitcast(MVT::v2i64, VBias));
20400 Or = DAG.getBitcast(MVT::v2f64, Or);
20401
20402 if (IsStrict)
20403 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20404 {Op.getOperand(0), Or, VBias});
20405 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20406}
20407
20408static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
20409 SelectionDAG &DAG,
20410 const X86Subtarget &Subtarget) {
20411 bool IsStrict = Op->isStrictFPOpcode();
20412 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20413 MVT VecIntVT = V.getSimpleValueType();
20414 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20415 "Unsupported custom type");
20416
20417 if (Subtarget.hasAVX512()) {
20418 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20419 assert(!Subtarget.hasVLX() && "Unexpected features");
20420 MVT VT = Op->getSimpleValueType(0);
20421
20422 // v8i32->v8f64 is legal with AVX512 so just return it.
20423 if (VT == MVT::v8f64)
20424 return Op;
20425
20426 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
20427 VT == MVT::v8f16) &&
20428 "Unexpected VT!");
20429 MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
20430 MVT WideIntVT = MVT::v16i32;
20431 if (VT == MVT::v4f64) {
20432 WideVT = MVT::v8f64;
20433 WideIntVT = MVT::v8i32;
20434 }
20435
20436 // Need to concat with zero vector for strict fp to avoid spurious
20437 // exceptions.
20438 SDValue Tmp =
20439 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20440 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20441 DAG.getVectorIdxConstant(0, DL));
20442 SDValue Res, Chain;
20443 if (IsStrict) {
20444 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20445 {Op->getOperand(0), V});
20446 Chain = Res.getValue(1);
20447 } else {
20448 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20449 }
20450
20451 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20452 DAG.getVectorIdxConstant(0, DL));
20453
20454 if (IsStrict)
20455 return DAG.getMergeValues({Res, Chain}, DL);
20456 return Res;
20457 }
20458
20459 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20460 Op->getSimpleValueType(0) == MVT::v4f64) {
20461 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20462 Constant *Bias = ConstantFP::get(
20463 *DAG.getContext(),
20464 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20465 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20466 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20467 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20468 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20469 SDValue VBias = DAG.getMemIntrinsicNode(
20470 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20471 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
20472 Align(8), MachineMemOperand::MOLoad);
20473
20474 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20475 DAG.getBitcast(MVT::v4i64, VBias));
20476 Or = DAG.getBitcast(MVT::v4f64, Or);
20477
20478 if (IsStrict)
20479 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20480 {Op.getOperand(0), Or, VBias});
20481 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20482 }
20483
20484 // The algorithm is the following:
20485 // #ifdef __SSE4_1__
20486 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20487 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20488 // (uint4) 0x53000000, 0xaa);
20489 // #else
20490 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20491 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20492 // #endif
20493 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20494 // return (float4) lo + fhi;
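 // Worked example of the constants (illustrative): 0x4b000000 is 2^23 as a
 // float and 0x53000000 is 2^39, so lo holds 2^23 + (v & 0xffff) and hi holds
 // 2^39 + (v >> 16) * 2^16, both exactly. Subtracting (0x1.0p39f + 0x1.0p23f)
 // from hi leaves (v >> 16) * 2^16 - 2^23, and adding lo cancels the 2^23
 // term, giving (v & 0xffff) + (v >> 16) * 2^16 == v with a single rounding.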
20495
20496 bool Is128 = VecIntVT == MVT::v4i32;
20497 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20498 // If we convert to something other than the supported type, e.g., to v4f64,
20499 // abort early.
20500 if (VecFloatVT != Op->getSimpleValueType(0))
20501 return SDValue();
20502
20503 // In the #ifdef/#else code, we have in common:
20504 // - The vector of constants:
20505 // -- 0x4b000000
20506 // -- 0x53000000
20507 // - A shift:
20508 // -- v >> 16
20509
20510 // Create the splat vector for 0x4b000000.
20511 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20512 // Create the splat vector for 0x53000000.
20513 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20514
20515 // Create the right shift.
20516 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20517 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20518
20519 SDValue Low, High;
20520 if (Subtarget.hasSSE41()) {
20521 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20522 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20523 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20524 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20525 // Low will be bitcasted right away, so do not bother bitcasting back to its
20526 // original type.
20527 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20528 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20529 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20530 // (uint4) 0x53000000, 0xaa);
20531 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20532 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20533 // High will be bitcasted right away, so do not bother bitcasting back to
20534 // its original type.
20535 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20536 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20537 } else {
20538 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20539 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20540 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20541 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20542
20543 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20544 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20545 }
20546
20547 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20548 SDValue VecCstFSub = DAG.getConstantFP(
20549 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20550
20551 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20552 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20553 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20554 // enabled. See PR24512.
20555 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20556 // TODO: Are there any fast-math-flags to propagate here?
20557 // (float4) lo;
20558 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20559 // return (float4) lo + fhi;
20560 if (IsStrict) {
20561 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20562 {Op.getOperand(0), HighBitcast, VecCstFSub});
20563 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20564 {FHigh.getValue(1), LowBitcast, FHigh});
20565 }
20566
20567 SDValue FHigh =
20568 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20569 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20570}
20571
20572static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20573 const X86Subtarget &Subtarget) {
20574 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20575 SDValue N0 = Op.getOperand(OpNo);
20576 MVT SrcVT = N0.getSimpleValueType();
20577
20578 switch (SrcVT.SimpleTy) {
20579 default:
20580 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20581 case MVT::v2i32:
20582 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20583 case MVT::v4i32:
20584 case MVT::v8i32:
20585 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20586 case MVT::v2i64:
20587 case MVT::v4i64:
20588 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20589 }
20590}
20591
20592SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20593 SelectionDAG &DAG) const {
20594 bool IsStrict = Op->isStrictFPOpcode();
20595 unsigned OpNo = IsStrict ? 1 : 0;
20596 SDValue Src = Op.getOperand(OpNo);
20597 SDLoc dl(Op);
20598 auto PtrVT = getPointerTy(DAG.getDataLayout());
20599 MVT SrcVT = Src.getSimpleValueType();
20600 MVT DstVT = Op->getSimpleValueType(0);
20601 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20602
20603 // Bail out when we don't have native conversion instructions.
20604 if (DstVT == MVT::f128)
20605 return SDValue();
20606
20607 if (isSoftF16(DstVT, Subtarget))
20608 return promoteXINT_TO_FP(Op, dl, DAG);
20609 else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
20610 return Op;
20611
20612 if (DstVT.isVector())
20613 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20614
20615 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20616 return LowerWin64_INT128_TO_FP(Op, DAG);
20617
20618 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20619 return Extract;
20620
20621 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20622 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20623 // Conversions from unsigned i32 to f32/f64 are legal,
20624 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20625 return Op;
20626 }
20627
20628 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20629 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20630 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20631 if (IsStrict)
20632 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20633 {Chain, Src});
20634 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20635 }
20636
20637 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20638 return V;
20639 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20640 return V;
20641
20642 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20643 // infinity. It produces -0.0, so disable under strictfp.
20644 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20645 !IsStrict)
20646 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20647 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20648 // negative infinity, so disable it under strictfp and use FILD instead.
20649 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20650 !IsStrict)
20651 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20652 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20653 (DstVT == MVT::f32 || DstVT == MVT::f64))
20654 return SDValue();
20655
20656 // Make a 64-bit buffer, and use it to build an FILD.
20657 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20658 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20659 Align SlotAlign(8);
20660 MachinePointerInfo MPI =
20661 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20662 if (SrcVT == MVT::i32) {
20663 SDValue OffsetSlot =
20664 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
20665 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20666 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20667 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20668 std::pair<SDValue, SDValue> Tmp =
20669 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20670 if (IsStrict)
20671 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20672
20673 return Tmp.first;
20674 }
20675
20676 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20677 SDValue ValueToStore = Src;
20678 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20679 // Bitcasting to f64 here allows us to do a single 64-bit store from
20680 // an SSE register, avoiding the store forwarding penalty that would come
20681 // with two 32-bit stores.
20682 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20683 }
20684 SDValue Store =
20685 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20686 // For i64 source, we need to add the appropriate power of 2 if the input
20687 // was negative. We must be careful to do the computation in x87 extended
20688 // precision, not in SSE.
20689 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20690 SDValue Ops[] = {Store, StackSlot};
20691 SDValue Fild =
20692 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20693 SlotAlign, MachineMemOperand::MOLoad);
20694 Chain = Fild.getValue(1);
20695
20696 // Check whether the sign bit is set.
20697 SDValue SignSet = DAG.getSetCC(
20698 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20699 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20700
20701 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
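 // The high word 0x5F800000 is 2^64 as a float. FILD treated the stored i64 as
 // signed, so when the sign bit was set the loaded value is off by exactly
 // 2^64; adding the fudge factor selected below (via the 4-byte offset)
 // corrects this, while a non-negative input selects the +0.0f word instead.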
20702 APInt FF(64, 0x5F80000000000000ULL);
20703 SDValue FudgePtr =
20704 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20705 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20706
20707 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20708 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20709 SDValue Four = DAG.getIntPtrConstant(4, dl);
20710 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20711 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20712
20713 // Load the value out, extending it from f32 to f80.
20714 SDValue Fudge = DAG.getExtLoad(
20715 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20716 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20717 CPAlignment);
20718 Chain = Fudge.getValue(1);
20719 // Extend everything to 80 bits to force it to be done on x87.
20720 // TODO: Are there any fast-math-flags to propagate here?
20721 if (IsStrict) {
20722 unsigned Opc = ISD::STRICT_FADD;
20723 // Windows needs the precision control changed to 80bits around this add.
20724 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20725 Opc = X86ISD::STRICT_FP80_ADD;
20726
20727 SDValue Add =
20728 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
20729 // STRICT_FP_ROUND can't handle equal types.
20730 if (DstVT == MVT::f80)
20731 return Add;
20732 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20733 {Add.getValue(1), Add,
20734 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
20735 }
20736 unsigned Opc = ISD::FADD;
20737 // Windows needs the precision control changed to 80bits around this add.
20738 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20739 Opc = X86ISD::FP80_ADD;
20740
20741 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
20742 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20743 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
20744}
20745
20746// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20747// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20748// just return an SDValue().
20749// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20750// to i16, i32 or i64, and we lower it to a legal sequence and return the
20751// result.
20752SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20753 bool IsSigned,
20754 SDValue &Chain) const {
20755 bool IsStrict = Op->isStrictFPOpcode();
20756 SDLoc DL(Op);
20757
20758 EVT DstTy = Op.getValueType();
20759 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20760 EVT TheVT = Value.getValueType();
20761 auto PtrVT = getPointerTy(DAG.getDataLayout());
20762
20763 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20764 // f16 must be promoted before using the lowering in this routine.
20765 // fp128 does not use this lowering.
20766 return SDValue();
20767 }
20768
20769 // If using FIST to compute an unsigned i64, we'll need some fixup
20770 // to handle values above the maximum signed i64. A FIST is always
20771 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20772 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20773
20774 // FIXME: This does not generate an invalid exception if the input does not
20775 // fit in i32. PR44019
20776 if (!IsSigned && DstTy != MVT::i64) {
20777 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20778 // The low 32 bits of the fist result will have the correct uint32 result.
20779 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20780 DstTy = MVT::i64;
20781 }
20782
20783 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20784 DstTy.getSimpleVT() >= MVT::i16 &&
20785 "Unknown FP_TO_INT to lower!");
20786
20787 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20788 // stack slot.
20789 MachineFunction &MF = DAG.getMachineFunction();
20790 unsigned MemSize = DstTy.getStoreSize();
20791 int SSFI =
20792 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20793 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20794
20795 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20796
20797 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20798
20799 if (UnsignedFixup) {
20800 //
20801 // Conversion to unsigned i64 is implemented with a select,
20802 // depending on whether the source value fits in the range
20803 // of a signed i64. Let Thresh be the FP equivalent of
20804 // 0x8000000000000000ULL.
20805 //
20806 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
20807 // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
20808 // FistSrc = (Value - FltOfs);
20809 // Fist-to-mem64 FistSrc
20810 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20811 // to XOR'ing the high 32 bits with Adjust.
20812 //
20813 // Being a power of 2, Thresh is exactly representable in all FP formats.
20814 // For X87 we'd like to use the smallest FP type for this constant, but
20815 // for DAG type consistency we have to match the FP operand type.
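 // Illustrative sketch of the fixup: with Thresh = 2^63, any input in
 // [2^63, 2^64) is first reduced by 2^63 (FltOfs = Thresh), converted by the
 // signed FIST, and the 64-bit result is then XORed with Adjust = 1 << 63,
 // which is equivalent to adding 2^63 back. Inputs below Thresh use
 // FltOfs = 0 and Adjust = 0 and pass through unchanged.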
20816
20817 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20818 APFloat::opStatus Status = APFloat::opOK;
20819 bool LosesInfo = false;
20820 if (TheVT == MVT::f64)
20821 // The rounding mode is irrelevant as the conversion should be exact.
20822 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20823 &LosesInfo);
20824 else if (TheVT == MVT::f80)
20825 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20826 APFloat::rmNearestTiesToEven, &LosesInfo);
20827
20828 assert(Status == APFloat::opOK && !LosesInfo &&
20829 "FP conversion should have been exact");
20830
20831 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20832
20833 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20834 *DAG.getContext(), TheVT);
20835 SDValue Cmp;
20836 if (IsStrict) {
20837 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20838 /*IsSignaling*/ true);
20839 Chain = Cmp.getValue(1);
20840 } else {
20841 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20842 }
20843
20844 // Our preferred lowering of
20845 //
20846 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20847 //
20848 // is
20849 //
20850 // (Value >= Thresh) << 63
20851 //
20852 // but since we can get here after LegalOperations, DAGCombine might do the
20853 // wrong thing if we create a select. So, directly create the preferred
20854 // version.
20855 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20856 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20857 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20858
20859 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20860 DAG.getConstantFP(0.0, DL, TheVT));
20861
20862 if (IsStrict) {
20863 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20864 { Chain, Value, FltOfs });
20865 Chain = Value.getValue(1);
20866 } else
20867 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20868 }
20869
20870 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20871
20872 // FIXME This causes a redundant load/store if the SSE-class value is already
20873 // in memory, such as if it is on the callstack.
20874 if (isScalarFPTypeInSSEReg(TheVT)) {
20875 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20876 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20877 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20878 SDValue Ops[] = { Chain, StackSlot };
20879
20880 unsigned FLDSize = TheVT.getStoreSize();
20881 assert(FLDSize <= MemSize && "Stack slot not big enough");
20882 MachineMemOperand *MMO = MF.getMachineMemOperand(
20883 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20884 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20885 Chain = Value.getValue(1);
20886 }
20887
20888 // Build the FP_TO_INT*_IN_MEM
20889 MachineMemOperand *MMO = MF.getMachineMemOperand(
20890 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20891 SDValue Ops[] = { Chain, Value, StackSlot };
20892 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20893 DAG.getVTList(MVT::Other),
20894 Ops, DstTy, MMO);
20895
20896 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20897 Chain = Res.getValue(1);
20898
20899 // If we need an unsigned fixup, XOR the result with adjust.
20900 if (UnsignedFixup)
20901 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20902
20903 return Res;
20904}
20905
20906static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20907 const X86Subtarget &Subtarget) {
20908 MVT VT = Op.getSimpleValueType();
20909 SDValue In = Op.getOperand(0);
20910 MVT InVT = In.getSimpleValueType();
20911 unsigned Opc = Op.getOpcode();
20912
20913 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20915 "Unexpected extension opcode");
20917 "Expected same number of elements");
20918 assert((VT.getVectorElementType() == MVT::i16 ||
20919 VT.getVectorElementType() == MVT::i32 ||
20920 VT.getVectorElementType() == MVT::i64) &&
20921 "Unexpected element type");
20922 assert((InVT.getVectorElementType() == MVT::i8 ||
20923 InVT.getVectorElementType() == MVT::i16 ||
20924 InVT.getVectorElementType() == MVT::i32) &&
20925 "Unexpected element type");
20926
20927 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20928
20929 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20930 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20931 return splitVectorIntUnary(Op, DAG, dl);
20932 }
20933
20934 if (Subtarget.hasInt256())
20935 return Op;
20936
20937 // Optimize vectors in AVX mode:
20938 //
20939 // v8i16 -> v8i32
20940 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20941 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20942 // Concat upper and lower parts.
20943 //
20944 // v4i32 -> v4i64
20945 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20946 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20947 // Concat upper and lower parts.
20948 //
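 // Illustrative element view for zext v8i16 <a..h> -> v8i32: OpLo widens
 // <a,b,c,d> in-register, while the unpackh of In with zero produces the v8i16
 // <e,0,f,0,g,0,h,0>, which bitcasts (little endian) to the v4i32 <e,f,g,h>;
 // the final concat yields <a..h> as i32 elements.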
20949 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20950 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20951
20952 // Short-circuit if we can determine that each 128-bit half is the same value.
20953 // Otherwise, this is difficult to match and optimize.
20954 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20955 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20956 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20957
20958 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20959 SDValue Undef = DAG.getUNDEF(InVT);
20960 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20961 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20962 OpHi = DAG.getBitcast(HalfVT, OpHi);
20963
20964 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20965}
20966
20967// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20968static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20969 const SDLoc &dl, SelectionDAG &DAG) {
20970 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20971 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20972 DAG.getVectorIdxConstant(0, dl));
20973 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20974 DAG.getVectorIdxConstant(8, dl));
20975 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20976 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20977 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20978 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20979}
20980
20981static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
20982 const X86Subtarget &Subtarget,
20983 SelectionDAG &DAG) {
20984 MVT VT = Op->getSimpleValueType(0);
20985 SDValue In = Op->getOperand(0);
20986 MVT InVT = In.getSimpleValueType();
20987 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20988 unsigned NumElts = VT.getVectorNumElements();
20989
20990 // For all vectors, but vXi8 we can just emit a sign_extend and a shift. This
20991 // avoids a constant pool load.
20992 if (VT.getVectorElementType() != MVT::i8) {
20993 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20994 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20995 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20996 }
20997
20998 // Extend VT if BWI is not supported.
20999 MVT ExtVT = VT;
21000 if (!Subtarget.hasBWI()) {
21001 // If v16i32 is to be avoided, we'll need to split and concatenate.
21002 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
21003 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
21004
21005 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
21006 }
21007
21008 // Widen to 512-bits if VLX is not supported.
21009 MVT WideVT = ExtVT;
21010 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
21011 NumElts *= 512 / ExtVT.getSizeInBits();
21012 InVT = MVT::getVectorVT(MVT::i1, NumElts);
21013 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
21014 DAG.getVectorIdxConstant(0, DL));
21015 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
21016 }
21017
21018 SDValue One = DAG.getConstant(1, DL, WideVT);
21019 SDValue Zero = DAG.getConstant(0, DL, WideVT);
21020
21021 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21022
21023 // Truncate if we had to extend above.
21024 if (VT != ExtVT) {
21025 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21026 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21027 }
21028
21029 // Extract back to 128/256-bit if we widened.
21030 if (WideVT != VT)
21031 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21032 DAG.getVectorIdxConstant(0, DL));
21033
21034 return SelectedVal;
21035}
21036
21037static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21038 SelectionDAG &DAG) {
21039 SDValue In = Op.getOperand(0);
21040 MVT SVT = In.getSimpleValueType();
21041 SDLoc DL(Op);
21042
21043 if (SVT.getVectorElementType() == MVT::i1)
21044 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
21045
21046 assert(Subtarget.hasAVX() && "Expected AVX support");
21047 return LowerAVXExtend(Op, DL, DAG, Subtarget);
21048}
21049
21050/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21051/// It makes use of the fact that vectors with enough leading sign/zero bits
21052/// prevent the PACKSS/PACKUS from saturating the results.
21053/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
21054/// within each 128-bit lane.
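/// For example, a v8i32 -> v8i16 truncation of sign-extended data splits the
/// source into two v4i32 halves and emits a single PACKSSDW; wider sources
/// recurse, and on AVX2 the 256-bit PACK needs an extra cross-lane shuffle
/// because it packs within each 128-bit lane.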
21055static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
21056 const SDLoc &DL, SelectionDAG &DAG,
21057 const X86Subtarget &Subtarget) {
21058 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
21059 "Unexpected PACK opcode");
21060 assert(DstVT.isVector() && "VT not a vector?");
21061
21062 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
21063 if (!Subtarget.hasSSE2())
21064 return SDValue();
21065
21066 EVT SrcVT = In.getValueType();
21067
21068 // No truncation required, we might get here due to recursive calls.
21069 if (SrcVT == DstVT)
21070 return In;
21071
21072 unsigned NumElems = SrcVT.getVectorNumElements();
21073 if (NumElems < 2 || !isPowerOf2_32(NumElems))
21074 return SDValue();
21075
21076 unsigned DstSizeInBits = DstVT.getSizeInBits();
21077 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
21078 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
21079 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
21080
21081 LLVMContext &Ctx = *DAG.getContext();
21082 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
21083 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21084
21085 // Pack to the largest type possible:
21086 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
21087 EVT InVT = MVT::i16, OutVT = MVT::i8;
21088 if (SrcVT.getScalarSizeInBits() > 16 &&
21089 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
21090 InVT = MVT::i32;
21091 OutVT = MVT::i16;
21092 }
21093
21094 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
21095 // On pre-AVX512, pack the src in both halves to help value tracking.
21096 if (SrcSizeInBits <= 128) {
21097 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21098 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21099 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
21100 SDValue LHS = DAG.getBitcast(InVT, In);
21101 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
21102 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
21103 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
21104 Res = DAG.getBitcast(PackedVT, Res);
21105 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21106 }
21107
21108 // Split lower/upper subvectors.
21109 SDValue Lo, Hi;
21110 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21111
21112 // If Hi is undef, then don't bother packing it and widen the result instead.
21113 if (Hi.isUndef()) {
21114 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
21115 if (SDValue Res =
21116 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
21117 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
21118 }
21119
21120 unsigned SubSizeInBits = SrcSizeInBits / 2;
21121 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21122 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21123
21124 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21125 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21126 Lo = DAG.getBitcast(InVT, Lo);
21127 Hi = DAG.getBitcast(InVT, Hi);
21128 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21129 return DAG.getBitcast(DstVT, Res);
21130 }
21131
21132 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21133 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21134 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21135 Lo = DAG.getBitcast(InVT, Lo);
21136 Hi = DAG.getBitcast(InVT, Hi);
21137 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21138
21139 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21140 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21141 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21142 SmallVector<int, 64> Mask;
21143 int Scale = 64 / OutVT.getScalarSizeInBits();
21144 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21145 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21146
21147 if (DstVT.is256BitVector())
21148 return DAG.getBitcast(DstVT, Res);
21149
21150 // If 512bit -> 128bit truncate another stage.
21151 Res = DAG.getBitcast(PackedVT, Res);
21152 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21153 }
21154
21155 // Recursively pack lower/upper subvectors, concat result and pack again.
21156 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21157
21158 if (PackedVT.is128BitVector()) {
21159 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
21160 // type legalization.
21161 SDValue Res =
21162 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
21163 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21164 }
21165
21166 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21167 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
21168 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
21169 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21170 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21171}
21172
21173/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
21174/// e.g. trunc <8 x i32> X to <8 x i16> -->
21175/// MaskX = X & 0xffff (clear high bits to prevent saturation)
21176/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
21177static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
21178 const X86Subtarget &Subtarget,
21179 SelectionDAG &DAG) {
21180 In = DAG.getZeroExtendInReg(In, DL, DstVT);
21181 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
21182}
21183
21184/// Truncate using inreg sign extension and X86ISD::PACKSS.
21185static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
21186 const X86Subtarget &Subtarget,
21187 SelectionDAG &DAG) {
21188 EVT SrcVT = In.getValueType();
21189 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
21190 DAG.getValueType(DstVT));
21191 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
21192}
21193
21194/// Helper to determine if \p In truncated to \p DstVT has the necessary
21195/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
21196/// possibly by converting a SRL node to SRA for sign extension.
21197static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
21198 SDValue In, const SDLoc &DL,
21199 SelectionDAG &DAG,
21200 const X86Subtarget &Subtarget,
21201 const SDNodeFlags Flags = SDNodeFlags()) {
21202 // Requires SSE2.
21203 if (!Subtarget.hasSSE2())
21204 return SDValue();
21205
21206 EVT SrcVT = In.getValueType();
21207 EVT DstSVT = DstVT.getVectorElementType();
21208 EVT SrcSVT = SrcVT.getVectorElementType();
21209 unsigned NumDstEltBits = DstSVT.getSizeInBits();
21210 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
21211
21212 // Check we have a truncation suited for PACKSS/PACKUS.
21213 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21214 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21215 return SDValue();
21216
21217 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
21218 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
21219
21220 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
21221 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
21222 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
21223 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
21224 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
21225 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
21226 return SDValue();
21227
21228 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
21229 // split this for packing.
21230 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
21231 !isFreeToSplitVector(In, DAG) &&
21232 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
21233 return SDValue();
21234
21235 // Don't truncate AVX512 targets as multiple PACK nodes stages.
21236 if (Subtarget.hasAVX512() && NumStages > 1)
21237 return SDValue();
21238
21239 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
21240 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21241
21242 // Truncate with PACKUS if we are truncating a vector with leading zero
21243 // bits that extend all the way to the packed/truncated value.
21244 // e.g. Masks, zext_in_reg, etc.
21245 // Pre-SSE41 we can only use PACKUSWB.
21246 KnownBits Known = DAG.computeKnownBits(In);
21247 if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
21248 (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
21249 PackOpcode = X86ISD::PACKUS;
21250 return In;
21251 }
21252
21253 // Truncate with PACKSS if we are truncating a vector with sign-bits
21254 // that extend all the way to the packed/truncated value.
21255 // e.g. Comparison result, sext_in_reg, etc.
21256 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
21257
21258 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
21259 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
21260 // see through BITCASTs later on and combines/simplifications can't then use
21261 // it.
21262 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
21263 !Subtarget.hasAVX512())
21264 return SDValue();
21265
21266 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
21267 if ((Flags.hasNoSignedWrap() && DstSVT != MVT::i32) ||
21268 MinSignBits < NumSignBits) {
21269 PackOpcode = X86ISD::PACKSS;
21270 return In;
21271 }
21272
21273 // If we have a srl that only generates signbits that we will discard in
21274 // the truncation then we can use PACKSS by converting the srl to a sra.
21275 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
21276 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
21277 if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) {
21278 if (*ShAmt == MinSignBits) {
21279 PackOpcode = X86ISD::PACKSS;
21280 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
21281 }
21282 }
21283
21284 return SDValue();
21285}
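// Illustrative example: for a v8i32 source whose elements are known to fit in
// their low 16 bits (e.g. they were masked with 0xFFFF), the i32->i16
// truncation above selects PACKUS, which typically maps to PACKUSDW on
// SSE4.1+; a source built from sign-extended compare results instead satisfies
// the sign-bit test and selects PACKSS (PACKSSDW).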
21286
21287/// This function lowers a vector truncation of 'extended sign-bits' or
21288/// 'extended zero-bits' values.
21289/// (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into X86ISD::PACKSS/PACKUS operations.
21290static SDValue LowerTruncateVecPackWithSignBits(
21291 MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
21292 SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
21293 MVT SrcVT = In.getSimpleValueType();
21294 MVT DstSVT = DstVT.getVectorElementType();
21295 MVT SrcSVT = SrcVT.getVectorElementType();
21296 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21297 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21298 return SDValue();
21299
21300 // If the upper half of the source is undef, then attempt to split and
21301 // only truncate the lower half.
21302 if (DstVT.getSizeInBits() >= 128) {
21303 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21304 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21305 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
21306 Subtarget, DAG))
21307 return widenSubVector(Res, false, Subtarget, DAG, DL,
21308 DstVT.getSizeInBits());
21309 }
21310 }
21311
21312 unsigned PackOpcode;
21313 if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
21314 Subtarget, Flags))
21315 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
21316
21317 return SDValue();
21318}
21319
21320/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
21321/// X86ISD::PACKUS/X86ISD::PACKSS operations.
21322static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
21323 const X86Subtarget &Subtarget,
21324 SelectionDAG &DAG) {
21325 MVT SrcVT = In.getSimpleValueType();
21326 MVT DstSVT = DstVT.getVectorElementType();
21327 MVT SrcSVT = SrcVT.getVectorElementType();
21328 unsigned NumElems = DstVT.getVectorNumElements();
21329 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21330 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
21331 NumElems >= 8))
21332 return SDValue();
21333
21334 // SSSE3's pshufb results in fewer instructions in the cases below.
21335 if (Subtarget.hasSSSE3() && NumElems == 8) {
21336 if (SrcSVT == MVT::i16)
21337 return SDValue();
21338 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
21339 return SDValue();
21340 }
21341
21342 // If the upper half of the source is undef, then attempt to split and
21343 // only truncate the lower half.
21344 if (DstVT.getSizeInBits() >= 128) {
21345 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21346 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21347 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
21348 return widenSubVector(Res, false, Subtarget, DAG, DL,
21349 DstVT.getSizeInBits());
21350 }
21351 }
21352
21353 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
21354 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
21355 // truncate 2 x v4i32 to v8i16.
21356 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
21357 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
21358
21359 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
21360 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
21361
21362 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
21363 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
21364 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
21365 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
21366 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
21367 }
21368
21369 return SDValue();
21370}
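// For example, a v16i32 -> v16i8 truncation that reaches this point is handled
// by truncateVectorWithPACKUS, which packs the value in stages
// (i32 -> i16 -> i8); the vXi64 -> vXi16 special case first truncates to vXi32
// with a shuffle so only a single PACKSS stage remains.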
21371
21372static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
21373 SelectionDAG &DAG,
21374 const X86Subtarget &Subtarget) {
21375 MVT VT = Op.getSimpleValueType();
21376 SDValue In = Op.getOperand(0);
21377 MVT InVT = In.getSimpleValueType();
21378 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21379
21380 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21381 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21382 if (InVT.getScalarSizeInBits() <= 16) {
21383 if (Subtarget.hasBWI()) {
21384 // legal, will go to VPMOVB2M, VPMOVW2M
21385 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21386 // We need to shift to get the lsb into sign position.
21387 // Shift packed bytes not supported natively, bitcast to word
21388 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21389 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21390 DAG.getBitcast(ExtVT, In),
21391 DAG.getConstant(ShiftInx, DL, ExtVT));
21392 In = DAG.getBitcast(InVT, In);
21393 }
21394 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21395 In, ISD::SETGT);
21396 }
21397 // Use TESTD/Q, extending the vector to packed dword/qword.
21398 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21399 "Unexpected vector type.");
21400 unsigned NumElts = InVT.getVectorNumElements();
21401 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21402 // We need to change to a wider element type that we have support for.
21403 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21404 // For 16 element vectors we extend to v16i32 unless we are explicitly
21405 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21406 // we need to split into two 8 element vectors which we can extend to v8i32,
21407 // truncate and concat the results. There's an additional complication if
21408 // the original type is v16i8. In that case we can't split the v16i8
21409 // directly, so we need to shuffle high elements to low and use
21410 // sign_extend_vector_inreg.
21411 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21412 SDValue Lo, Hi;
21413 if (InVT == MVT::v16i8) {
21414 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21415 Hi = DAG.getVectorShuffle(
21416 InVT, DL, In, In,
21417 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21418 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21419 } else {
21420 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21421 Lo = extract128BitVector(In, 0, DAG, DL);
21422 Hi = extract128BitVector(In, 8, DAG, DL);
21423 }
21424 // We're split now, just emit two truncates and a concat. The two
21425 // truncates will trigger legalization to come back to this function.
21426 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21427 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21428 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21429 }
21430 // We either have 8 elements or we're allowed to use 512-bit vectors.
21431 // If we have VLX, we want to use the narrowest vector that can get the
21432 // job done so we use vXi32.
21433 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21434 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21435 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21436 InVT = ExtVT;
21437 ShiftInx = InVT.getScalarSizeInBits() - 1;
21438 }
21439
21440 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21441 // We need to shift to get the lsb into sign position.
21442 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21443 DAG.getConstant(ShiftInx, DL, InVT));
21444 }
21445 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21446 if (Subtarget.hasDQI())
21447 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21448 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21449}
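// Example of the trick used above: truncating v16i8 -> v16i1 with BWI shifts
// each byte left by 7 (via a bitcast to words, since packed byte shifts aren't
// available) so bit 0 becomes the sign bit; the signed compare against zero is
// then typically selected as a VPMOVB2M-style sign-bit extraction into a mask
// register.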
21450
21451SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21452 SDLoc DL(Op);
21453 MVT VT = Op.getSimpleValueType();
21454 SDValue In = Op.getOperand(0);
21455 MVT InVT = In.getSimpleValueType();
21457 "Invalid TRUNCATE operation");
21458
21459 // If we're called by the type legalizer, handle a few cases.
21460 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21461 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
21462 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21463 VT.is128BitVector() && Subtarget.hasAVX512()) {
21464 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21465 "Unexpected subtarget!");
21466 // The default behavior is to truncate one step, concatenate, and then
21467 // truncate the remainder. We'd rather produce two 64-bit results and
21468 // concatenate those.
21469 SDValue Lo, Hi;
21470 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21471
21472 EVT LoVT, HiVT;
21473 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21474
21475 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21476 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21477 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21478 }
21479
21480 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21481 if (!Subtarget.hasAVX512() ||
21482 (InVT.is512BitVector() && VT.is256BitVector()))
21483 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21484 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21485 return SignPack;
21486
21487 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21488 if (!Subtarget.hasAVX512())
21489 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
21490
21491 // Otherwise let default legalization handle it.
21492 return SDValue();
21493 }
21494
21495 if (VT.getVectorElementType() == MVT::i1)
21496 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
21497
21498 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
21499 // concat from subvectors to use VPTRUNC etc.
21500 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In, DAG))
21501 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21502 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21503 return SignPack;
21504
21505 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21506 if (Subtarget.hasAVX512()) {
21507 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21508 assert(VT == MVT::v32i8 && "Unexpected VT!");
21509 return splitVectorIntUnary(Op, DAG, DL);
21510 }
21511
21512 // Word to byte only under BWI. Otherwise we have to promote to v16i32
21513 // and then truncate that. But we should only do that if we haven't been
21514 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21515 // handled by isel patterns.
21516 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21517 Subtarget.canExtendTo512DQ())
21518 return Op;
21519 }
21520
21521 // Handle truncation of V256 to V128 using shuffles.
21522 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21523
21524 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21525 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21526 if (Subtarget.hasInt256()) {
21527 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21528 In = DAG.getBitcast(MVT::v8i32, In);
21529 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21530 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21531 DAG.getVectorIdxConstant(0, DL));
21532 }
21533
21534 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21535 DAG.getVectorIdxConstant(0, DL));
21536 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21537 DAG.getVectorIdxConstant(2, DL));
21538 static const int ShufMask[] = {0, 2, 4, 6};
21539 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
21540 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
21541 }
21542
21543 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21544 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21545 if (Subtarget.hasInt256()) {
21546 // The PSHUFB mask:
21547 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21548 -1, -1, -1, -1, -1, -1, -1, -1,
21549 16, 17, 20, 21, 24, 25, 28, 29,
21550 -1, -1, -1, -1, -1, -1, -1, -1 };
21551 In = DAG.getBitcast(MVT::v32i8, In);
21552 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21553 In = DAG.getBitcast(MVT::v4i64, In);
21554
21555 static const int ShufMask2[] = {0, 2, -1, -1};
21556 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21557 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21558 DAG.getVectorIdxConstant(0, DL));
21559 return DAG.getBitcast(MVT::v8i16, In);
21560 }
21561
21562 return Subtarget.hasSSE41()
21563 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
21564 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
21565 }
21566
21567 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
21568 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
21569
21570 llvm_unreachable("All 256->128 cases should have been handled above!");
21571}
21572
21573// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21574// behaves on out of range inputs to generate optimized conversions.
21575static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21576 SelectionDAG &DAG,
21577 const X86Subtarget &Subtarget) {
21578 MVT SrcVT = Src.getSimpleValueType();
21579 unsigned DstBits = VT.getScalarSizeInBits();
21580 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21581
21582 // Calculate the converted result for values in the range 0 to
21583 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21584 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21585 SDValue Big =
21586 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21587 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21588 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21589
21590 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21591 // and only if the value was out of range. So we can use that
21592 // as our indicator that we should use "Big" instead of "Small".
21593 //
21594 // Use "Small" if "IsOverflown" has all bits cleared
21595 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21596
21597 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21598 // use the slightly slower blendv select instead.
21599 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21600 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21601 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21602 }
21603
21604 SDValue IsOverflown =
21605 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21606 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21607 return DAG.getNode(ISD::OR, dl, VT, Small,
21608 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21609}
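// Worked example for a float input X in [2^31, 2^32): "Small" saturates to
// 0x80000000 (sign bit set), "Big" is the exact value of X - 2^31, and the
// final sign-splat AND/OR (or the AVX1 BLENDV path) reconstructs
// 0x80000000 | Big == (uint32_t)X. Inputs below 2^31 leave "Small" exact and
// the sign-splat mask all zero, so "Small" is returned unchanged.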
21610
21611SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21612 bool IsStrict = Op->isStrictFPOpcode();
21613 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21614 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21615 bool HasVLX = Subtarget.hasVLX();
21616 MVT VT = Op->getSimpleValueType(0);
21617 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21618 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21619 MVT SrcVT = Src.getSimpleValueType();
21620 SDLoc dl(Op);
21621
21622 SDValue Res;
21623 if (isSoftF16(SrcVT, Subtarget)) {
21624 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21625 if (IsStrict)
21626 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21627 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21628 {NVT, MVT::Other}, {Chain, Src})});
21629 return DAG.getNode(Op.getOpcode(), dl, VT,
21630 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21631 } else if (isTypeLegal(SrcVT) &&
21632 isLegalConversion(VT, SrcVT, IsSigned, Subtarget)) {
21633 return Op;
21634 }
21635
21636 if (VT.isVector()) {
21637 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21638 MVT ResVT = MVT::v4i32;
21639 MVT TruncVT = MVT::v4i1;
21640 unsigned Opc;
21641 if (IsStrict)
21642 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21643 else
21644 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21645
21646 if (!IsSigned && !HasVLX) {
21647 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21648 // Widen to 512-bits.
21649 ResVT = MVT::v8i32;
21650 TruncVT = MVT::v8i1;
21651 Opc = Op.getOpcode();
21652 // Need to concat with zero vector for strict fp to avoid spurious
21653 // exceptions.
21654 // TODO: Should we just do this for non-strict as well?
21655 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21656 : DAG.getUNDEF(MVT::v8f64);
21657 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21658 DAG.getVectorIdxConstant(0, dl));
21659 }
21660 if (IsStrict) {
21661 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21662 Chain = Res.getValue(1);
21663 } else {
21664 Res = DAG.getNode(Opc, dl, ResVT, Src);
21665 }
21666
21667 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21668 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21669 DAG.getVectorIdxConstant(0, dl));
21670 if (IsStrict)
21671 return DAG.getMergeValues({Res, Chain}, dl);
21672 return Res;
21673 }
21674
21675 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21676 if ((HasVLX && (VT == MVT::v8i16 || VT == MVT::v16i16)) ||
21677 VT == MVT::v32i16)
21678 return Op;
21679
21680 MVT ResVT = VT;
21681 MVT EleVT = VT.getVectorElementType();
21682 if (EleVT != MVT::i64)
21683 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21684
21685 if (SrcVT == MVT::v2f16 || SrcVT == MVT::v4f16) {
21686 SDValue Tmp =
21687 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21688 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21689 Ops[0] = Src;
21690 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21691 }
21692
21693 if (!HasVLX) {
21694 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21695 // Widen to 512-bits.
21696 unsigned IntSize = EleVT.getSizeInBits();
21697 unsigned Num = IntSize > 16 ? 512 / IntSize : 32;
21698 ResVT = MVT::getVectorVT(EleVT, Num);
21699 Src = widenSubVector(MVT::getVectorVT(MVT::f16, Num), Src, IsStrict,
21700 Subtarget, DAG, dl);
21701 }
21702
21703 if (IsStrict) {
21704 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21705 : X86ISD::STRICT_CVTTP2UI,
21706 dl, {ResVT, MVT::Other}, {Chain, Src});
21707 Chain = Res.getValue(1);
21708 } else {
21709 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21710 ResVT, Src);
21711 }
21712
21713 // TODO: Need to add exception check code for strict FP.
21714 if (EleVT.getSizeInBits() < 16) {
21715 if (HasVLX)
21716 ResVT = MVT::getVectorVT(EleVT, 8);
21717 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21718 }
21719
21720 if (ResVT != VT)
21721 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21722 DAG.getVectorIdxConstant(0, dl));
21723
21724 if (IsStrict)
21725 return DAG.getMergeValues({Res, Chain}, dl);
21726 return Res;
21727 }
21728
21729 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21730 if (VT.getVectorElementType() == MVT::i16) {
21731 assert((SrcVT.getVectorElementType() == MVT::f32 ||
21732 SrcVT.getVectorElementType() == MVT::f64) &&
21733 "Expected f32/f64 vector!");
21734 MVT NVT = VT.changeVectorElementType(MVT::i32);
21735 if (IsStrict) {
21736 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21737 : ISD::STRICT_FP_TO_UINT,
21738 dl, {NVT, MVT::Other}, {Chain, Src});
21739 Chain = Res.getValue(1);
21740 } else {
21741 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21742 NVT, Src);
21743 }
21744
21745 // TODO: Need to add exception check code for strict FP.
21746 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21747
21748 if (IsStrict)
21749 return DAG.getMergeValues({Res, Chain}, dl);
21750 return Res;
21751 }
21752
21753 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21754 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21755 assert(!IsSigned && "Expected unsigned conversion!");
21756 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21757 return Op;
21758 }
21759
21760 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21761 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21762 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21763 Subtarget.useAVX512Regs()) {
21764 assert(!IsSigned && "Expected unsigned conversion!");
21765 assert(!Subtarget.hasVLX() && "Unexpected features!");
21766 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21767 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21768 // Need to concat with zero vector for strict fp to avoid spurious
21769 // exceptions.
21770 // TODO: Should we just do this for non-strict as well?
21771 SDValue Tmp =
21772 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21773 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21774 DAG.getVectorIdxConstant(0, dl));
21775
21776 if (IsStrict) {
21777 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21778 {Chain, Src});
21779 Chain = Res.getValue(1);
21780 } else {
21781 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21782 }
21783
21784 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21785 DAG.getVectorIdxConstant(0, dl));
21786
21787 if (IsStrict)
21788 return DAG.getMergeValues({Res, Chain}, dl);
21789 return Res;
21790 }
21791
21792 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21793 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21794 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21795 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21796 assert(!Subtarget.hasVLX() && "Unexpected features!");
21797 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21798 // Need to concat with zero vector for strict fp to avoid spurious
21799 // exceptions.
21800 // TODO: Should we just do this for non-strict as well?
21801 SDValue Tmp =
21802 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21803 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21804 DAG.getVectorIdxConstant(0, dl));
21805
21806 if (IsStrict) {
21807 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21808 {Chain, Src});
21809 Chain = Res.getValue(1);
21810 } else {
21811 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21812 }
21813
21814 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21815 DAG.getVectorIdxConstant(0, dl));
21816
21817 if (IsStrict)
21818 return DAG.getMergeValues({Res, Chain}, dl);
21819 return Res;
21820 }
21821
21822 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21823 if (!Subtarget.hasVLX()) {
21824 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21825 // legalizer and then widened again by vector op legalization.
21826 if (!IsStrict)
21827 return SDValue();
21828
21829 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21830 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21831 {Src, Zero, Zero, Zero});
21832 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21833 {Chain, Tmp});
21834 SDValue Chain = Tmp.getValue(1);
21835 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21836 DAG.getVectorIdxConstant(0, dl));
21837 return DAG.getMergeValues({Tmp, Chain}, dl);
21838 }
21839
21840 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21841 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21842 DAG.getUNDEF(MVT::v2f32));
21843 if (IsStrict) {
21844 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21845 : X86ISD::STRICT_CVTTP2UI;
21846 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21847 }
21848 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21849 return DAG.getNode(Opc, dl, VT, Tmp);
21850 }
21851
21852 // Generate optimized instructions for pre AVX512 unsigned conversions from
21853 // vXf32 to vXi32.
21854 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21855 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21856 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21857 assert(!IsSigned && "Expected unsigned conversion!");
21858 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21859 }
21860
21861 return SDValue();
21862 }
21863
21864 assert(!VT.isVector());
21865
21866 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21867
21868 if (!IsSigned && UseSSEReg) {
21869 // Conversions from f32/f64 with AVX512 should be legal.
21870 if (Subtarget.hasAVX512())
21871 return Op;
21872
21873 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21874 // behaves on out of range inputs to generate optimized conversions.
21875 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21876 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21877 unsigned DstBits = VT.getScalarSizeInBits();
21878 APInt UIntLimit = APInt::getSignMask(DstBits);
21879 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21880 DAG.getConstant(UIntLimit, dl, VT));
21881 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21882
21883 // Calculate the converted result for values in the range:
21884 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21885 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21886 SDValue Small =
21887 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21888 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21889 SDValue Big = DAG.getNode(
21890 X86ISD::CVTTS2SI, dl, VT,
21891 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21892 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21893
21894 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21895 // and only if the value was out of range. So we can use that
21896 // as our indicator that we should use "Big" instead of "Small".
21897 //
21898 // Use "Small" if "IsOverflown" has all bits cleared
21899 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21900 SDValue IsOverflown = DAG.getNode(
21901 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21902 return DAG.getNode(ISD::OR, dl, VT, Small,
21903 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21904 }
21905
21906 // Use default expansion for i64.
21907 if (VT == MVT::i64)
21908 return SDValue();
21909
21910 assert(VT == MVT::i32 && "Unexpected VT!");
21911
21912 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21913 // FIXME: This does not generate an invalid exception if the input does not
21914 // fit in i32. PR44019
21915 if (Subtarget.is64Bit()) {
21916 if (IsStrict) {
21917 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21918 {Chain, Src});
21919 Chain = Res.getValue(1);
21920 } else
21921 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21922
21923 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21924 if (IsStrict)
21925 return DAG.getMergeValues({Res, Chain}, dl);
21926 return Res;
21927 }
21928
21929 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21930 // use fisttp which will be handled later.
21931 if (!Subtarget.hasSSE3())
21932 return SDValue();
21933 }
21934
21935 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21936 // FIXME: This does not generate an invalid exception if the input does not
21937 // fit in i16. PR44019
21938 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21939 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21940 if (IsStrict) {
21941 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21942 {Chain, Src});
21943 Chain = Res.getValue(1);
21944 } else
21945 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21946
21947 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21948 if (IsStrict)
21949 return DAG.getMergeValues({Res, Chain}, dl);
21950 return Res;
21951 }
21952
21953 // If this is a FP_TO_SINT using SSEReg we're done.
21954 if (UseSSEReg && IsSigned)
21955 return Op;
21956
21957 // fp128 needs to use a libcall.
21958 if (SrcVT == MVT::f128) {
21959 RTLIB::Libcall LC;
21960 if (IsSigned)
21961 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21962 else
21963 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21964
21965 MakeLibCallOptions CallOptions;
21966 std::pair<SDValue, SDValue> Tmp =
21967 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21968
21969 if (IsStrict)
21970 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21971
21972 return Tmp.first;
21973 }
21974
21975 // Fall back to X87.
21976 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21977 if (IsStrict)
21978 return DAG.getMergeValues({V, Chain}, dl);
21979 return V;
21980 }
21981
21982 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21983}
21984
21985SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21986 SelectionDAG &DAG) const {
21987 SDValue Src = Op.getOperand(0);
21988 EVT DstVT = Op.getSimpleValueType();
21989 MVT SrcVT = Src.getSimpleValueType();
21990
21991 if (SrcVT.isVector())
21992 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21993
21994 if (SrcVT == MVT::f16)
21995 return SDValue();
21996
21997 // If the source is in an SSE register, the node is Legal.
21998 if (isScalarFPTypeInSSEReg(SrcVT))
21999 return Op;
22000
22001 return LRINT_LLRINTHelper(Op.getNode(), DAG);
22002}
22003
22004SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
22005 SelectionDAG &DAG) const {
22006 EVT DstVT = N->getValueType(0);
22007 SDValue Src = N->getOperand(0);
22008 EVT SrcVT = Src.getValueType();
22009
22010 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
22011 // f16 must be promoted before using the lowering in this routine.
22012 // fp128 does not use this lowering.
22013 return SDValue();
22014 }
22015
22016 SDLoc DL(N);
22017 SDValue Chain = DAG.getEntryNode();
22018
22019 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
22020
22021 // If we're converting from SSE, the stack slot needs to hold both types.
22022 // Otherwise it only needs to hold the DstVT.
22023 EVT OtherVT = UseSSE ? SrcVT : DstVT;
22024 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
22025 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
22026 MachinePointerInfo MPI =
22027 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
22028
22029 if (UseSSE) {
22030 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
22031 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
22032 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22033 SDValue Ops[] = { Chain, StackPtr };
22034
22035 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
22036 /*Align*/ std::nullopt,
22037 MachineMemOperand::MOLoad);
22038 Chain = Src.getValue(1);
22039 }
22040
22041 SDValue StoreOps[] = { Chain, Src, StackPtr };
22042 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
22043 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
22044 MachineMemOperand::MOStore);
22045
22046 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
22047}
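// In other words: an SSE source value is spilled to the stack slot, reloaded
// into the x87 stack with FLD, and FIST then stores the rounded integer
// (honoring the current x87 rounding mode) back to the slot, from where it is
// loaded as the final result.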
22048
22049SDValue
22050X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
22051 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
22052 // but making use of X86 specifics to produce better instruction sequences.
22053 SDNode *Node = Op.getNode();
22054 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
22055 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
22056 SDLoc dl(SDValue(Node, 0));
22057 SDValue Src = Node->getOperand(0);
22058
22059 // There are three types involved here: SrcVT is the source floating point
22060 // type, DstVT is the type of the result, and TmpVT is the result of the
22061 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
22062 // DstVT).
22063 EVT SrcVT = Src.getValueType();
22064 EVT DstVT = Node->getValueType(0);
22065 EVT TmpVT = DstVT;
22066
22067 // This code is only for floats and doubles. Fall back to generic code for
22068 // anything else.
22069 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
22070 return SDValue();
22071
22072 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
22073 unsigned SatWidth = SatVT.getScalarSizeInBits();
22074 unsigned DstWidth = DstVT.getScalarSizeInBits();
22075 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
22076 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
22077 "Expected saturation width smaller than result width");
22078
22079 // Promote result of FP_TO_*INT to at least 32 bits.
22080 if (TmpWidth < 32) {
22081 TmpVT = MVT::i32;
22082 TmpWidth = 32;
22083 }
22084
22085 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
22086 // us to use a native signed conversion instead.
22087 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
22088 TmpVT = MVT::i64;
22089 TmpWidth = 64;
22090 }
22091
22092 // If the saturation width is smaller than the size of the temporary result,
22093 // we can always use signed conversion, which is native.
22094 if (SatWidth < TmpWidth)
22095 FpToIntOpcode = ISD::FP_TO_SINT;
22096
22097 // Determine minimum and maximum integer values and their corresponding
22098 // floating-point values.
22099 APInt MinInt, MaxInt;
22100 if (IsSigned) {
22101 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
22102 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
22103 } else {
22104 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
22105 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
22106 }
22107
22108 const fltSemantics &Sem = SrcVT.getFltSemantics();
22109 APFloat MinFloat(Sem);
22110 APFloat MaxFloat(Sem);
22111
22112 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
22113 MinInt, IsSigned, APFloat::rmTowardZero);
22114 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
22115 MaxInt, IsSigned, APFloat::rmTowardZero);
22116 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
22117 && !(MaxStatus & APFloat::opStatus::opInexact);
22118
22119 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
22120 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
22121
22122 // If the integer bounds are exactly representable as floats, emit a
22123 // min+max+fptoi sequence. Otherwise use comparisons and selects.
22124 if (AreExactFloatBounds) {
22125 if (DstVT != TmpVT) {
22126 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
22127 SDValue MinClamped = DAG.getNode(
22128 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
22129 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
22130 SDValue BothClamped = DAG.getNode(
22131 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
22132 // Convert clamped value to integer.
22133 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
22134
22135 // NaN will become INDVAL, with the top bit set and the rest zero.
22136 // Truncation will discard the top bit, resulting in zero.
22137 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22138 }
22139
22140 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
22141 SDValue MinClamped = DAG.getNode(
22142 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
22143 // Clamp by MaxFloat from above. NaN cannot occur.
22144 SDValue BothClamped = DAG.getNode(
22145 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
22146 // Convert clamped value to integer.
22147 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
22148
22149 if (!IsSigned) {
22150 // In the unsigned case we're done, because we mapped NaN to MinFloat,
22151 // which is zero.
22152 return FpToInt;
22153 }
22154
22155 // Otherwise, select zero if Src is NaN.
22156 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22157 return DAG.getSelectCC(
22158 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
22159 }
22160
22161 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
22162 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
22163
22164 // Result of direct conversion, which may be selected away.
22165 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
22166
22167 if (DstVT != TmpVT) {
22168 // NaN will become INDVAL, with the top bit set and the rest zero.
22169 // Truncation will discard the top bit, resulting in zero.
22170 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22171 }
22172
22173 SDValue Select = FpToInt;
22174 // For signed conversions where we saturate to the same size as the
22175 // result type of the fptoi instructions, INDVAL coincides with integer
22176 // minimum, so we don't need to explicitly check it.
22177 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
22178 // If Src ULT MinFloat, select MinInt. In particular, this also selects
22179 // MinInt if Src is NaN.
22180 Select = DAG.getSelectCC(
22181 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
22182 }
22183
22184 // If Src OGT MaxFloat, select MaxInt.
22185 Select = DAG.getSelectCC(
22186 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
22187
22188 // In the unsigned case we are done, because we mapped NaN to MinInt, which
22189 // is already zero. The promoted case was already handled above.
22190 if (!IsSigned || DstVT != TmpVT) {
22191 return Select;
22192 }
22193
22194 // Otherwise, select 0 if Src is NaN.
22195 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22196 return DAG.getSelectCC(
22197 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
22198}
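// Example: a signed f64 -> i32 saturating conversion has exactly representable
// bounds, so it lowers roughly to MAXSD/MINSD clamps against -2^31 and 2^31-1
// followed by CVTTSD2SI, plus a final unordered-compare select that maps NaN
// to 0.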
22199
22200SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
22201 bool IsStrict = Op->isStrictFPOpcode();
22202
22203 SDLoc DL(Op);
22204 MVT VT = Op.getSimpleValueType();
22205 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22206 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22207 MVT SVT = In.getSimpleValueType();
22208
22209 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
22210 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
22211 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
22212 !Subtarget.getTargetTriple().isOSDarwin()))
22213 return SDValue();
22214
22215 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
22216 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
22217 return Op;
22218
22219 if (SVT == MVT::f16) {
22220 if (Subtarget.hasFP16())
22221 return Op;
22222
22223 if (VT != MVT::f32) {
22224 if (IsStrict)
22225 return DAG.getNode(
22226 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
22227 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
22228 {MVT::f32, MVT::Other}, {Chain, In})});
22229
22230 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
22231 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
22232 }
22233
22234 if (!Subtarget.hasF16C()) {
22235 if (!Subtarget.getTargetTriple().isOSDarwin())
22236 return SDValue();
22237
22238 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
22239
22240 // Need a libcall, but ABI for f16 is soft-float on MacOS.
22241 TargetLowering::CallLoweringInfo CLI(DAG);
22242 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22243
22244 In = DAG.getBitcast(MVT::i16, In);
22245 TargetLowering::ArgListTy Args;
22246 TargetLowering::ArgListEntry Entry(
22247 In, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()));
22248 Entry.IsSExt = false;
22249 Entry.IsZExt = true;
22250 Args.push_back(Entry);
22251
22252 SDValue Callee = DAG.getExternalSymbol(
22253 getLibcallName(RTLIB::FPEXT_F16_F32),
22254 getPointerTy(DAG.getDataLayout()));
22255 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22256 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
22257 std::move(Args));
22258
22259 SDValue Res;
22260 std::tie(Res,Chain) = LowerCallTo(CLI);
22261 if (IsStrict)
22262 Res = DAG.getMergeValues({Res, Chain}, DL);
22263
22264 return Res;
22265 }
22266
22267 In = DAG.getBitcast(MVT::i16, In);
22268 SDValue Res;
22269 if (IsStrict) {
22270 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
22271 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
22272 DAG.getVectorIdxConstant(0, DL));
22273 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
22274 {Chain, In});
22275 Chain = Res.getValue(1);
22276 } else {
22277 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
22278 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
22279 DAG.getUNDEF(MVT::v4i32), In,
22280 DAG.getVectorIdxConstant(0, DL));
22281 In = DAG.getBitcast(MVT::v8i16, In);
22282 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
22283 DAG.getTargetConstant(4, DL, MVT::i32));
22284 }
22285 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
22286 DAG.getVectorIdxConstant(0, DL));
22287 if (IsStrict)
22288 return DAG.getMergeValues({Res, Chain}, DL);
22289 return Res;
22290 }
22291
22292 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
22293 return Op;
22294
22295 if (SVT.getVectorElementType() == MVT::f16) {
22296 if (Subtarget.hasFP16() && isTypeLegal(SVT))
22297 return Op;
22298 assert(Subtarget.hasF16C() && "Unexpected features!");
22299 if (SVT == MVT::v2f16)
22300 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
22301 DAG.getUNDEF(MVT::v2f16));
22302 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
22303 DAG.getUNDEF(MVT::v4f16));
22304 if (IsStrict)
22305 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22306 {Op->getOperand(0), Res});
22307 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22308 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
22309 return Op;
22310 }
22311
22312 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
22313
22314 SDValue Res =
22315 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
22316 if (IsStrict)
22317 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22318 {Op->getOperand(0), Res});
22319 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22320}
22321
22322SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
22323 bool IsStrict = Op->isStrictFPOpcode();
22324
22325 SDLoc DL(Op);
22326 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22327 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22328 MVT VT = Op.getSimpleValueType();
22329 MVT SVT = In.getSimpleValueType();
22330
22331 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
22332 return SDValue();
22333
22334 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
22335 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
22336 if (!Subtarget.getTargetTriple().isOSDarwin())
22337 return SDValue();
22338
22339 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
22340 TargetLowering::CallLoweringInfo CLI(DAG);
22341 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22342
22343 TargetLowering::ArgListTy Args;
22344 TargetLowering::ArgListEntry Entry(
22345 In, EVT(SVT).getTypeForEVT(*DAG.getContext()));
22346 Entry.IsSExt = false;
22347 Entry.IsZExt = true;
22348 Args.push_back(Entry);
22349
22350 SDValue Callee = DAG.getExternalSymbol(
22351 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
22352 : RTLIB::FPROUND_F32_F16),
22353 getPointerTy(DAG.getDataLayout()));
22354 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22355 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
22356 std::move(Args));
22357
22358 SDValue Res;
22359 std::tie(Res, Chain) = LowerCallTo(CLI);
22360
22361 Res = DAG.getBitcast(MVT::f16, Res);
22362
22363 if (IsStrict)
22364 Res = DAG.getMergeValues({Res, Chain}, DL);
22365
22366 return Res;
22367 }
22368
22369 if (VT.getScalarType() == MVT::bf16) {
22370 if (SVT.getScalarType() == MVT::f32 &&
22371 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22372 Subtarget.hasAVXNECONVERT()))
22373 return Op;
22374 return SDValue();
22375 }
22376
22377 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
22378 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
22379 return SDValue();
22380
22381 if (VT.isVector())
22382 return Op;
22383
22384 SDValue Res;
22385 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
22386 MVT::i32);
22387 if (IsStrict) {
22388 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
22389 DAG.getConstantFP(0, DL, MVT::v4f32), In,
22390 DAG.getVectorIdxConstant(0, DL));
22391 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
22392 {Chain, Res, Rnd});
22393 Chain = Res.getValue(1);
22394 } else {
22395 // FIXME: Should we use zeros for upper elements for non-strict?
22396 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
22397 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
22398 }
22399
22400 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22401 DAG.getVectorIdxConstant(0, DL));
22402 Res = DAG.getBitcast(MVT::f16, Res);
22403
22404 if (IsStrict)
22405 return DAG.getMergeValues({Res, Chain}, DL);
22406
22407 return Res;
22408 }
22409
22410 return Op;
22411}
22412
22413static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22414 bool IsStrict = Op->isStrictFPOpcode();
22415 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22416 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22417 "Unexpected VT!");
22418
22419 SDLoc dl(Op);
22420 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22421 DAG.getConstant(0, dl, MVT::v8i16), Src,
22422 DAG.getVectorIdxConstant(0, dl));
22423
22424 SDValue Chain;
22425 if (IsStrict) {
22426 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22427 {Op.getOperand(0), Res});
22428 Chain = Res.getValue(1);
22429 } else {
22430 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22431 }
22432
22433 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22434 DAG.getVectorIdxConstant(0, dl));
22435
22436 if (IsStrict)
22437 return DAG.getMergeValues({Res, Chain}, dl);
22438
22439 return Res;
22440}
22441
22442static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22443 bool IsStrict = Op->isStrictFPOpcode();
22444 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22445 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22446 "Unexpected VT!");
22447
22448 SDLoc dl(Op);
22449 SDValue Res, Chain;
22450 if (IsStrict) {
22451 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22452 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22453 DAG.getVectorIdxConstant(0, dl));
22454 Res = DAG.getNode(
22455 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22456 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22457 Chain = Res.getValue(1);
22458 } else {
22459 // FIXME: Should we use zeros for upper elements for non-strict?
22460 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22461 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22462 DAG.getTargetConstant(4, dl, MVT::i32));
22463 }
22464
22465 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22466 DAG.getVectorIdxConstant(0, dl));
22467
22468 if (IsStrict)
22469 return DAG.getMergeValues({Res, Chain}, dl);
22470
22471 return Res;
22472}
22473
22474SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
22475 SelectionDAG &DAG) const {
22476 SDLoc DL(Op);
22477
22478 MVT SVT = Op.getOperand(0).getSimpleValueType();
22479 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22480 Subtarget.hasAVXNECONVERT())) {
22481 SDValue Res;
22482 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
22483 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
22484 Res = DAG.getBitcast(MVT::v8i16, Res);
22485 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22486 DAG.getVectorIdxConstant(0, DL));
22487 }
22488
22489 MakeLibCallOptions CallOptions;
22490 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
22491 SDValue Res =
22492 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
22493 return DAG.getBitcast(MVT::i16, Res);
22494}
22495
22496/// Depending on uarch and/or optimizing for size, we might prefer to use a
22497/// vector operation in place of the typical scalar operation.
22498static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
22499 SelectionDAG &DAG,
22500 const X86Subtarget &Subtarget) {
22501 // If both operands have other uses, this is probably not profitable.
22502 SDValue LHS = Op.getOperand(0);
22503 SDValue RHS = Op.getOperand(1);
22504 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22505 return Op;
22506
22507 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22508 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22509 if (IsFP && !Subtarget.hasSSE3())
22510 return Op;
22511 if (!IsFP && !Subtarget.hasSSSE3())
22512 return Op;
22513
22514 // Extract from a common vector.
22515 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22516 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22517 LHS.getOperand(0) != RHS.getOperand(0) ||
22518 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22519 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22520 !shouldUseHorizontalOp(true, DAG, Subtarget))
22521 return Op;
22522
22523 // Allow commuted 'hadd' ops.
22524 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22525 unsigned HOpcode;
22526 switch (Op.getOpcode()) {
22527 // clang-format off
22528 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22529 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22530 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22531 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22532 default:
22533 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22534 // clang-format on
22535 }
22536 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22537 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22538 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22539 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22540 std::swap(LExtIndex, RExtIndex);
22541
22542 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22543 return Op;
22544
22545 SDValue X = LHS.getOperand(0);
22546 EVT VecVT = X.getValueType();
22547 unsigned BitWidth = VecVT.getSizeInBits();
22548 unsigned NumLanes = BitWidth / 128;
22549 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22550 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22551 "Not expecting illegal vector widths here");
22552
22553 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22554 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22555 if (BitWidth == 256 || BitWidth == 512) {
22556 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22557 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22558 LExtIndex %= NumEltsPerLane;
22559 }
22560
22561 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22562 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22563 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22564 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22565 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22566 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22567 DAG.getVectorIdxConstant(LExtIndex / 2, DL));
22568}
22569
22570/// Depending on uarch and/or optimizing for size, we might prefer to use a
22571/// vector operation in place of the typical scalar operation.
22572SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22573 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22574 "Only expecting float/double");
22575 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
22576}
22577
22578/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22579/// This mode isn't supported in hardware on X86. But as long as we aren't
22580/// compiling with trapping math, we can emulate this with
22581/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
22582static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22583 SDValue N0 = Op.getOperand(0);
22584 SDLoc dl(Op);
22585 MVT VT = Op.getSimpleValueType();
22586
22587 // N0 += copysign(nextafter(0.5, 0.0), N0)
22588 const fltSemantics &Sem = VT.getFltSemantics();
22589 bool Ignored;
22590 APFloat Point5Pred = APFloat(0.5f);
22591 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22592 Point5Pred.next(/*nextDown*/true);
22593
22594 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22595 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22596 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22597
22598 // Truncate the result to remove fraction.
22599 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22600}
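// For example, with X == 2.5f the addend is nextafterf(0.5f, 0.0f); the sum
// rounds (in float) to 3.0f and FTRUNC yields 3.0f, matching round-half-away-
// from-zero. Adding exactly 0.5 instead would misround inputs just below 0.5:
// nextafterf(0.5f, 0.0f) + 0.5f rounds up to 1.0f and truncates to 1.0f.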
22601
22602/// The only differences between FABS and FNEG are the mask and the logic op.
22603/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22604static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22605 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22606 "Wrong opcode for lowering FABS or FNEG.");
22607
22608 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22609
22610 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22611 // into an FNABS. We'll lower the FABS after that if it is still in use.
22612 if (IsFABS)
22613 for (SDNode *User : Op->users())
22614 if (User->getOpcode() == ISD::FNEG)
22615 return Op;
22616
22617 SDLoc dl(Op);
22618 MVT VT = Op.getSimpleValueType();
22619
22620 bool IsF128 = (VT == MVT::f128);
22621 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22623 "Unexpected type in LowerFABSorFNEG");
22624
22625 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
22626 // decide if we should generate a 16-byte constant mask when we only need 4 or
22627 // 8 bytes for the scalar case.
22628
22629 // There are no scalar bitwise logical SSE/AVX instructions, so we
22630 // generate a 16-byte vector constant and logic op even for the scalar case.
22631 // Using a 16-byte mask allows folding the load of the mask with
22632 // the logic op, so it can save (~4 bytes) on code size.
22633 bool IsFakeVector = !VT.isVector() && !IsF128;
22634 MVT LogicVT = VT;
22635 if (IsFakeVector)
22636 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22637 : (VT == MVT::f32) ? MVT::v4f32
22638 : MVT::v8f16;
22639
22640 unsigned EltBits = VT.getScalarSizeInBits();
22641 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22642 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22643 APInt::getSignMask(EltBits);
22644 const fltSemantics &Sem = VT.getFltSemantics();
22645 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22646
22647 SDValue Op0 = Op.getOperand(0);
22648 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22649 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22650 IsFNABS ? X86ISD::FOR :
22651 X86ISD::FXOR;
22652 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22653
22654 if (VT.isVector() || IsF128)
22655 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22656
22657 // For the scalar case extend to a 128-bit vector, perform the logic op,
22658 // and extract the scalar result back out.
22659 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22660 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22661 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22662 DAG.getVectorIdxConstant(0, dl));
22663}
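// Concretely, for a scalar f32 this emits a 16-byte constant mask and a packed
// logic op: FABS typically selects to ANDPS with a splat of 0x7FFFFFFF, FNEG to
// XORPS with a splat of 0x80000000, and FNEG(FABS(x)) folds to ORPS with the
// sign mask.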
22664
22665static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22666 SDValue Mag = Op.getOperand(0);
22667 SDValue Sign = Op.getOperand(1);
22668 SDLoc dl(Op);
22669
22670 // If the sign operand is smaller, extend it first.
22671 MVT VT = Op.getSimpleValueType();
22672 if (Sign.getSimpleValueType().bitsLT(VT))
22673 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22674
22675 // And if it is bigger, shrink it first.
22676 if (Sign.getSimpleValueType().bitsGT(VT))
22677 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
22678 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22679
22680 // At this point the operands and the result should have the same
22681 // type, and that won't be f80 since that is not custom lowered.
22682 bool IsF128 = (VT == MVT::f128);
22683 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22685 "Unexpected type in LowerFCOPYSIGN");
22686
22687 const fltSemantics &Sem = VT.getFltSemantics();
22688
22689 // Perform all scalar logic operations as 16-byte vectors because there are no
22690 // scalar FP logic instructions in SSE.
22691 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22692 // unnecessary splats, but we might miss load folding opportunities. Should
22693 // this decision be based on OptimizeForSize?
22694 bool IsFakeVector = !VT.isVector() && !IsF128;
22695 MVT LogicVT = VT;
22696 if (IsFakeVector)
22697 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22698 : (VT == MVT::f32) ? MVT::v4f32
22699 : MVT::v8f16;
22700
22701 // The mask constants are automatically splatted for vector types.
22702 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22703 SDValue SignMask = DAG.getConstantFP(
22704 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22705 SDValue MagMask = DAG.getConstantFP(
22706 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
22707
22708 // First, clear all bits but the sign bit from the second operand (sign).
22709 if (IsFakeVector)
22710 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22711 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22712
22713 // Next, clear the sign bit from the first operand (magnitude).
22714 // TODO: If we had general constant folding for FP logic ops, this check
22715 // wouldn't be necessary.
22716 SDValue MagBits;
22717 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22718 APFloat APF = Op0CN->getValueAPF();
22719 APF.clearSign();
22720 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22721 } else {
22722 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22723 if (IsFakeVector)
22724 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22725 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22726 }
22727
22728 // OR the magnitude value with the sign bit.
22729 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22730 return !IsFakeVector ? Or
22731 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22732 DAG.getVectorIdxConstant(0, dl));
22733}
22734
22735static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22736 SDValue N0 = Op.getOperand(0);
22737 SDLoc dl(Op);
22738 MVT VT = Op.getSimpleValueType();
22739
22740 MVT OpVT = N0.getSimpleValueType();
22741 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22742 "Unexpected type for FGETSIGN");
22743
22744 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22745 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22746 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22747 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22748 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22749 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22750 return Res;
22751}
22752
22753/// Helper for attempting to create a X86ISD::BT node.
22754static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22755 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22756 // instruction. Since the shift amount is in-range-or-undefined, we know
22757 // that doing a bittest on the i32 value is ok. We extend to i32 because
22758 // the encoding for the i16 version is larger than the i32 version.
22759 // Also promote i16 to i32 for performance / code size reasons.
22760 if (Src.getValueType().getScalarSizeInBits() < 32)
22761 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22762
22763 // No legal type found, give up.
22764 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22765 return SDValue();
22766
22767 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22768 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22769 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22770 // known to be zero.
22771 if (Src.getValueType() == MVT::i64 &&
22772 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22773 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22774
22775 // If the operand types disagree, extend the shift amount to match. Since
22776 // BT ignores high bits (like shifts) we can use anyextend.
22777 if (Src.getValueType() != BitNo.getValueType()) {
22778 // Peek through a mask/modulo operation.
22779 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
22780 // we probably need a better IsDesirableToPromoteOp to handle this as well.
22781 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22782 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
22783 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22784 BitNo.getOperand(0)),
22785 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22786 BitNo.getOperand(1)));
22787 else
22788 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22789 }
22790
22791 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22792}
22793
22794/// Helper for creating a X86ISD::SETCC node.
22795static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22796 SelectionDAG &DAG) {
22797 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22798 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22799}
22800
22801/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22802/// recognizable memcmp expansion.
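/// e.g. (or (or (xor a, b), (xor c, d)), (xor e, f)): OR nodes form the interior of the tree and
/// XOR compares sit at the leaves, with an OR at the root.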
22803static bool isOrXorXorTree(SDValue X, bool Root = true) {
22804 if (X.getOpcode() == ISD::OR)
22805 return isOrXorXorTree(X.getOperand(0), false) &&
22806 isOrXorXorTree(X.getOperand(1), false);
22807 if (Root)
22808 return false;
22809 return X.getOpcode() == ISD::XOR;
22810}
22811
22812/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22813/// expansion.
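/// Each XOR leaf becomes a vector compare (or a plain vector XOR on the PTEST path), and the OR
/// nodes combine the per-leaf results.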
22814template <typename F>
22815static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22816 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22817 SDValue Op0 = X.getOperand(0);
22818 SDValue Op1 = X.getOperand(1);
22819 if (X.getOpcode() == ISD::OR) {
22820 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22821 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22822 if (VecVT != CmpVT)
22823 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22824 if (HasPT)
22825 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22826 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22827 }
22828 if (X.getOpcode() == ISD::XOR) {
22829 SDValue A = SToV(Op0);
22830 SDValue B = SToV(Op1);
22831 if (VecVT != CmpVT)
22832 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22833 if (HasPT)
22834 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22835 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22836 }
22837 llvm_unreachable("Impossible");
22838}
22839
22840/// Try to map a 128-bit or larger integer comparison to vector instructions
22841/// before type legalization splits it up into chunks.
22842static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22843 ISD::CondCode CC,
22844 const SDLoc &DL,
22845 SelectionDAG &DAG,
22846 const X86Subtarget &Subtarget) {
22847 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22848
22849 // We're looking for an oversized integer equality comparison.
22850 EVT OpVT = X.getValueType();
22851 unsigned OpSize = OpVT.getSizeInBits();
22852 if (!OpVT.isScalarInteger() || OpSize < 128)
22853 return SDValue();
22854
22855 // Ignore a comparison with zero because that gets special treatment in
22856 // EmitTest(). But make an exception for the special case of a pair of
22857 // logically-combined vector-sized operands compared to zero. This pattern may
22858 // be generated by the memcmp expansion pass with oversized integer compares
22859 // (see PR33325).
22860 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22861 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22862 return SDValue();
22863
22864 // Don't perform this combine if constructing the vector will be expensive.
22865 auto IsVectorBitCastCheap = [](SDValue X) {
22866 X = peekThroughBitcasts(X);
22867 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22868 X.getOpcode() == ISD::LOAD;
22869 };
22870 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22871 !IsOrXorXorTreeCCZero)
22872 return SDValue();
22873
22874 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22875 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22876 // Otherwise use PCMPEQ (plus AND) and mask testing.
22877 bool NoImplicitFloatOps =
22878 DAG.getMachineFunction().getFunction().hasFnAttribute(
22879 Attribute::NoImplicitFloat);
22880 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22881 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22882 (OpSize == 256 && Subtarget.hasAVX()) ||
22883 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22884 bool HasPT = Subtarget.hasSSE41();
22885
22886 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22887 // vector registers are essentially free. (Technically, widening registers
22888 // prevents load folding, but the tradeoff is worth it.)
22889 bool PreferKOT = Subtarget.preferMaskRegisters();
22890 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22891
22892 EVT VecVT = MVT::v16i8;
22893 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22894 if (OpSize == 256) {
22895 VecVT = MVT::v32i8;
22896 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22897 }
22898 EVT CastVT = VecVT;
22899 bool NeedsAVX512FCast = false;
22900 if (OpSize == 512 || NeedZExt) {
22901 if (Subtarget.hasBWI()) {
22902 VecVT = MVT::v64i8;
22903 CmpVT = MVT::v64i1;
22904 if (OpSize == 512)
22905 CastVT = VecVT;
22906 } else {
22907 VecVT = MVT::v16i32;
22908 CmpVT = MVT::v16i1;
22909 CastVT = OpSize == 512 ? VecVT
22910 : OpSize == 256 ? MVT::v8i32
22911 : MVT::v4i32;
22912 NeedsAVX512FCast = true;
22913 }
22914 }
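// At this point VecVT is the widened vector type used for the compare, and CmpVT is either the
// same type (PTEST/MOVMSK paths) or the corresponding k-mask type (KORTEST path).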
22915
22916 auto ScalarToVector = [&](SDValue X) -> SDValue {
22917 bool TmpZext = false;
22918 EVT TmpCastVT = CastVT;
22919 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22920 SDValue OrigX = X.getOperand(0);
22921 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22922 if (OrigSize < OpSize) {
22923 if (OrigSize == 128) {
22924 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22925 X = OrigX;
22926 TmpZext = true;
22927 } else if (OrigSize == 256) {
22928 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22929 X = OrigX;
22930 TmpZext = true;
22931 }
22932 }
22933 }
22934 X = DAG.getBitcast(TmpCastVT, X);
22935 if (!NeedZExt && !TmpZext)
22936 return X;
22937 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22938 DAG.getConstant(0, DL, VecVT), X,
22939 DAG.getVectorIdxConstant(0, DL));
22940 };
22941
22942 SDValue Cmp;
22943 if (IsOrXorXorTreeCCZero) {
22944 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22945 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22946 // Use 2 vector equality compares and 'and' the results before doing a
22947 // MOVMSK.
22948 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22949 } else {
22950 SDValue VecX = ScalarToVector(X);
22951 SDValue VecY = ScalarToVector(Y);
22952 if (VecVT != CmpVT) {
22953 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22954 } else if (HasPT) {
22955 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22956 } else {
22957 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22958 }
22959 }
22960 // AVX512 should emit a setcc that will lower to kortest.
22961 if (VecVT != CmpVT) {
22962 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22963 : CmpVT == MVT::v32i1 ? MVT::i32
22964 : MVT::i16;
22965 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22966 DAG.getConstant(0, DL, KRegVT), CC);
22967 }
22968 if (HasPT) {
22969 SDValue BCCmp =
22970 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22971 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22972 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22973 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22974 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22975 }
22976 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22977 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22978 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22979 assert(Cmp.getValueType() == MVT::v16i8 &&
22980 "Non 128-bit vector on pre-SSE41 target");
22981 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22982 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22983 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22984 }
22985
22986 return SDValue();
22987}
22988
22989/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22990/// style scalarized (associative) reduction patterns. Partial reductions
22991/// are supported when the pointer SrcMask is non-null.
22992/// TODO - move this to SelectionDAG?
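/// e.g. or(extractelt(X,0), or(extractelt(X,1), or(extractelt(X,2), extractelt(X,3)))) matches
/// for a 4-element X with SrcOps = {X}.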
22993static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22994 SmallVectorImpl<SDValue> &SrcOps,
22995 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22996 SmallVector<SDValue, 8> Opnds;
22997 DenseMap<SDValue, APInt> SrcOpMap;
22998 EVT VT = MVT::Other;
22999
23000 // Recognize a special case where a vector is cast into a wide integer to
23001 // test all 0s.
23002 assert(Op.getOpcode() == unsigned(BinOp) &&
23003 "Unexpected bit reduction opcode");
23004 Opnds.push_back(Op.getOperand(0));
23005 Opnds.push_back(Op.getOperand(1));
23006
23007 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
23008 SDValue I = Opnds[Slot];
23009 // BFS traverse all BinOp operands.
23010 if (I->getOpcode() == unsigned(BinOp)) {
23011 Opnds.push_back(I->getOperand(0));
23012 Opnds.push_back(I->getOperand(1));
23013 // Re-evaluate the number of nodes to be traversed.
23014 e += 2; // 2 more nodes (LHS and RHS) are pushed.
23015 continue;
23016 }
23017
23018 // Quit if a non-EXTRACT_VECTOR_ELT
23019 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23020 return false;
23021
23022 // Quit if without a constant index.
23023 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
23024 if (!Idx)
23025 return false;
23026
23027 SDValue Src = I->getOperand(0);
23028 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
23029 if (M == SrcOpMap.end()) {
23030 VT = Src.getValueType();
23031 // Quit if not the same type.
23032 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
23033 return false;
23034 unsigned NumElts = VT.getVectorNumElements();
23035 APInt EltCount = APInt::getZero(NumElts);
23036 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
23037 SrcOps.push_back(Src);
23038 }
23039
23040 // Quit if element already used.
23041 unsigned CIdx = Idx->getZExtValue();
23042 if (M->second[CIdx])
23043 return false;
23044 M->second.setBit(CIdx);
23045 }
23046
23047 if (SrcMask) {
23048 // Collect the source partial masks.
23049 for (SDValue &SrcOp : SrcOps)
23050 SrcMask->push_back(SrcOpMap[SrcOp]);
23051 } else {
23052 // Quit if not all elements are used.
23053 for (const auto &I : SrcOpMap)
23054 if (!I.second.isAllOnes())
23055 return false;
23056 }
23057
23058 return true;
23059}
23060
23061// Helper function for comparing all bits of two vectors.
23062static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
23063 ISD::CondCode CC, const APInt &OriginalMask,
23064 const X86Subtarget &Subtarget,
23065 SelectionDAG &DAG, X86::CondCode &X86CC) {
23066 EVT VT = LHS.getValueType();
23067 unsigned ScalarSize = VT.getScalarSizeInBits();
23068 if (OriginalMask.getBitWidth() != ScalarSize) {
23069 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
23070 return SDValue();
23071 }
23072
23073 // Quit if not convertible to legal scalar or 128/256-bit vector.
23074 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23075 return SDValue();
23076
23077 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
23078 if (VT.isFloatingPoint())
23079 return SDValue();
23080
23081 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23082 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
23083
23084 APInt Mask = OriginalMask;
23085
23086 auto MaskBits = [&](SDValue Src) {
23087 if (Mask.isAllOnes())
23088 return Src;
23089 EVT SrcVT = Src.getValueType();
23090 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
23091 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
23092 };
23093
23094 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
23095 if (VT.getSizeInBits() < 128) {
23096 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
23097 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
23098 if (IntVT != MVT::i64)
23099 return SDValue();
23100 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
23101 MVT::i32, MVT::i32);
23102 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
23103 MVT::i32, MVT::i32);
23104 SDValue Lo =
23105 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
23106 SDValue Hi =
23107 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
23108 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23109 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
23110 DAG.getConstant(0, DL, MVT::i32));
23111 }
23112 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23113 DAG.getBitcast(IntVT, MaskBits(LHS)),
23114 DAG.getBitcast(IntVT, MaskBits(RHS)));
23115 }
23116
23117 // Without PTEST, a masked v2i64 or-reduction is not faster than
23118 // scalarization.
23119 bool UseKORTEST = Subtarget.useAVX512Regs();
23120 bool UsePTEST = Subtarget.hasSSE41();
23121 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
23122 return SDValue();
23123
23124 // Split down to 128/256/512-bit vector.
23125 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
23126
23127 // If the input vector has vector elements wider than the target test size,
23128 // then cast to <X x i64> so it will safely split.
23129 if (ScalarSize > TestSize) {
23130 if (!Mask.isAllOnes())
23131 return SDValue();
23132 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
23133 LHS = DAG.getBitcast(VT, LHS);
23134 RHS = DAG.getBitcast(VT, RHS);
23135 Mask = APInt::getAllOnes(64);
23136 }
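// The bitcast above caps the element width at 64 bits, so the vector splits below never have to
// cut through an element.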
23137
23138 if (VT.getSizeInBits() > TestSize) {
23139 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
23140 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
23141 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
23142 while (VT.getSizeInBits() > TestSize) {
23143 auto Split = DAG.SplitVector(LHS, DL);
23144 VT = Split.first.getValueType();
23145 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23146 }
23147 RHS = DAG.getAllOnesConstant(DL, VT);
23148 } else if (!UsePTEST && !KnownRHS.isZero()) {
23149 // MOVMSK Special Case:
23150 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
23151 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
23152 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
23153 LHS = DAG.getBitcast(VT, MaskBits(LHS));
23154 RHS = DAG.getBitcast(VT, MaskBits(RHS));
23155 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
23156 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
23157 V = DAG.getSExtOrTrunc(V, DL, VT);
23158 while (VT.getSizeInBits() > TestSize) {
23159 auto Split = DAG.SplitVector(V, DL);
23160 VT = Split.first.getValueType();
23161 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23162 }
23163 V = DAG.getNOT(DL, V, VT);
23164 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23165 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23166 DAG.getConstant(0, DL, MVT::i32));
23167 } else {
23168 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
23169 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
23170 while (VT.getSizeInBits() > TestSize) {
23171 auto Split = DAG.SplitVector(V, DL);
23172 VT = Split.first.getValueType();
23173 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
23174 }
23175 LHS = V;
23176 RHS = DAG.getConstant(0, DL, VT);
23177 }
23178 }
23179
23180 if (UseKORTEST && VT.is512BitVector()) {
23181 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
23182 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
23183 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23184 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23185 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
23186 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
23187 }
23188
23189 if (UsePTEST) {
23190 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
23191 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23192 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23193 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
23194 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
23195 }
23196
23197 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
23198 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
23199 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
23200 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
23201 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
23202 V = DAG.getNOT(DL, V, MaskVT);
23203 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23204 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23205 DAG.getConstant(0, DL, MVT::i32));
23206}
23207
23208// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
23209// to CMP(MOVMSK(PCMPEQB(X,Y))).
23210static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS,
23211 ISD::CondCode CC, const SDLoc &DL,
23212 const X86Subtarget &Subtarget,
23213 SelectionDAG &DAG,
23214 X86::CondCode &X86CC) {
23215 SDValue Op = OrigLHS;
23216
23217 bool CmpNull;
23218 APInt Mask;
23219 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23220 CmpNull = isNullConstant(OrigRHS);
23221 if (!CmpNull && !isAllOnesConstant(OrigRHS))
23222 return SDValue();
23223
23224 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
23225 return SDValue();
23226
23227 // Check whether we're masking/truncating an OR-reduction result, in which
23228 // case track the masked bits.
23229 // TODO: Add CmpAllOnes support.
23230 Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
23231 if (CmpNull) {
23232 switch (Op.getOpcode()) {
23233 case ISD::TRUNCATE: {
23234 SDValue Src = Op.getOperand(0);
23235 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
23236 Op.getScalarValueSizeInBits());
23237 Op = Src;
23238 break;
23239 }
23240 case ISD::AND: {
23241 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
23242 Mask = Cst->getAPIntValue();
23243 Op = Op.getOperand(0);
23244 }
23245 break;
23246 }
23247 }
23248 }
23249 } else if (CC == ISD::SETGT && isAllOnesConstant(OrigRHS)) {
23250 CC = ISD::SETEQ;
23251 CmpNull = true;
23252 Mask = APInt::getSignMask(Op.getScalarValueSizeInBits());
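// (X >s -1) just tests that the sign bit of X is clear, so model it as comparing the sign bit
// against zero.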
23253 } else {
23254 return SDValue();
23255 }
23256
23257 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
23258
23259 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
23260 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
23261 SmallVector<SDValue, 8> VecIns;
23262 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
23263 EVT VT = VecIns[0].getValueType();
23264 assert(llvm::all_of(VecIns,
23265 [VT](SDValue V) { return VT == V.getValueType(); }) &&
23266 "Reduction source vector mismatch");
23267
23268 // Quit if not splittable to scalar/128/256/512-bit vector.
23269 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23270 return SDValue();
23271
23272 // If more than one full vector is evaluated, AND/OR them first before
23273 // PTEST.
23274 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
23275 Slot += 2, e += 1) {
23276 // Each iteration will AND/OR 2 nodes and append the result until there is
23277 // only 1 node left, i.e. the final value of all vectors.
23278 SDValue LHS = VecIns[Slot];
23279 SDValue RHS = VecIns[Slot + 1];
23280 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
23281 }
23282
23283 return LowerVectorAllEqual(DL, VecIns.back(),
23284 CmpNull ? DAG.getConstant(0, DL, VT)
23285 : DAG.getAllOnesConstant(DL, VT),
23286 CC, Mask, Subtarget, DAG, X86CC);
23287 }
23288
23289 // Match icmp(reduce_or(X),0) anyof reduction patterns.
23290 // Match icmp(reduce_and(X),-1) allof reduction patterns.
23291 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23292 ISD::NodeType BinOp;
23293 if (SDValue Match =
23294 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
23295 EVT MatchVT = Match.getValueType();
23296 return LowerVectorAllEqual(DL, Match,
23297 CmpNull ? DAG.getConstant(0, DL, MatchVT)
23298 : DAG.getAllOnesConstant(DL, MatchVT),
23299 CC, Mask, Subtarget, DAG, X86CC);
23300 }
23301 }
23302
23303 if (Mask.isAllOnes()) {
23304 assert(!Op.getValueType().isVector() &&
23305 "Illegal vector type for reduction pattern");
23306 SDValue Src = peekThroughBitcasts(Op);
23307 if (Src.getValueType().isFixedLengthVector() &&
23308 Src.getValueType().getScalarType() == MVT::i1) {
23309 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
23310 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
23311 if (Src.getOpcode() == ISD::SETCC) {
23312 SDValue LHS = Src.getOperand(0);
23313 SDValue RHS = Src.getOperand(1);
23314 EVT LHSVT = LHS.getValueType();
23315 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
23316 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
23317 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
23318 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
23319 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
23320 X86CC);
23321 }
23322 }
23323 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
23324 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
23325 // Peek through truncation, mask the LSB and compare against zero/LSB.
23326 if (Src.getOpcode() == ISD::TRUNCATE) {
23327 SDValue Inner = Src.getOperand(0);
23328 EVT InnerVT = Inner.getValueType();
23329 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
23330 unsigned BW = InnerVT.getScalarSizeInBits();
23331 APInt SrcMask = APInt(BW, 1);
23332 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
23333 return LowerVectorAllEqual(DL, Inner,
23334 DAG.getConstant(Cmp, DL, InnerVT), CC,
23335 SrcMask, Subtarget, DAG, X86CC);
23336 }
23337 }
23338 }
23339 }
23340
23341 return SDValue();
23342}
23343
23344/// return true if \c Op has a use that doesn't just read flags.
23345static bool hasNonFlagsUse(SDValue Op) {
23346 for (SDUse &Use : Op->uses()) {
23347 SDNode *User = Use.getUser();
23348 unsigned UOpNo = Use.getOperandNo();
23349 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
23350 // Look past truncate.
23351 UOpNo = User->use_begin()->getOperandNo();
23352 User = User->use_begin()->getUser();
23353 }
23354
23355 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
23356 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
23357 return true;
23358 }
23359 return false;
23360}
23361
23362// Transform to an x86-specific ALU node with flags if there is a chance of
23363// using an RMW op or only the flags are used. Otherwise, leave
23364// the node alone and emit a 'cmp' or 'test' instruction.
23365static bool isProfitableToUseFlagOp(SDValue Op) {
23366 for (SDNode *U : Op->users())
23367 if (U->getOpcode() != ISD::CopyToReg &&
23368 U->getOpcode() != ISD::SETCC &&
23369 U->getOpcode() != ISD::STORE)
23370 return false;
23371
23372 return true;
23373}
23374
23375/// Emit nodes that will be selected as "test Op0,Op0", or something
23376/// equivalent.
23377static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
23378 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
23379 // CF and OF aren't always set the way we want. Determine which
23380 // of these we need.
23381 bool NeedCF = false;
23382 bool NeedOF = false;
23383 switch (X86CC) {
23384 default: break;
23385 case X86::COND_A: case X86::COND_AE:
23386 case X86::COND_B: case X86::COND_BE:
23387 NeedCF = true;
23388 break;
23389 case X86::COND_G: case X86::COND_GE:
23390 case X86::COND_L: case X86::COND_LE:
23391 case X86::COND_O: case X86::COND_NO: {
23392 // Check if we really need to set the
23393 // Overflow flag. If NoSignedWrap is present
23394 // that is not actually needed.
23395 switch (Op->getOpcode()) {
23396 case ISD::ADD:
23397 case ISD::SUB:
23398 case ISD::MUL:
23399 case ISD::SHL:
23400 if (Op.getNode()->getFlags().hasNoSignedWrap())
23401 break;
23402 [[fallthrough]];
23403 default:
23404 NeedOF = true;
23405 break;
23406 }
23407 break;
23408 }
23409 }
23410 // See if we can use the EFLAGS value from the operand instead of
23411 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23412 // we prove that the arithmetic won't overflow, we can't use OF or CF.
23413 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23414 // Emit a CMP with 0, which is the TEST pattern.
23415 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23416 DAG.getConstant(0, dl, Op.getValueType()));
23417 }
23418 unsigned Opcode = 0;
23419 unsigned NumOperands = 0;
23420
23421 SDValue ArithOp = Op;
23422
23423 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
23424 // which may be the result of a CAST. We use the variable 'Op', which is the
23425 // non-casted variable when we check for possible users.
23426 switch (ArithOp.getOpcode()) {
23427 case ISD::AND:
23428 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23429 // because a TEST instruction will be better.
23430 if (!hasNonFlagsUse(Op))
23431 break;
23432
23433 [[fallthrough]];
23434 case ISD::ADD:
23435 case ISD::SUB:
23436 case ISD::OR:
23437 case ISD::XOR:
23438 if (!isProfitableToUseFlagOp(Op))
23439 break;
23440
23441 // Otherwise use a regular EFLAGS-setting instruction.
23442 switch (ArithOp.getOpcode()) {
23443 // clang-format off
23444 default: llvm_unreachable("unexpected operator!");
23445 case ISD::ADD: Opcode = X86ISD::ADD; break;
23446 case ISD::SUB: Opcode = X86ISD::SUB; break;
23447 case ISD::XOR: Opcode = X86ISD::XOR; break;
23448 case ISD::AND: Opcode = X86ISD::AND; break;
23449 case ISD::OR: Opcode = X86ISD::OR; break;
23450 // clang-format on
23451 }
23452
23453 NumOperands = 2;
23454 break;
23455 case X86ISD::ADD:
23456 case X86ISD::SUB:
23457 case X86ISD::OR:
23458 case X86ISD::XOR:
23459 case X86ISD::AND:
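// These X86 ALU nodes already produce EFLAGS as their second result, so reuse that value directly.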
23460 return SDValue(Op.getNode(), 1);
23461 case ISD::SSUBO:
23462 case ISD::USUBO: {
23463 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
23464 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23465 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23466 Op->getOperand(1)).getValue(1);
23467 }
23468 default:
23469 break;
23470 }
23471
23472 if (Opcode == 0) {
23473 // Emit a CMP with 0, which is the TEST pattern.
23474 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23475 DAG.getConstant(0, dl, Op.getValueType()));
23476 }
23477 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23478 SmallVector<SDValue, 4> Ops(Op->ops().take_front(NumOperands));
23479
23480 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23481 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23482 return SDValue(New.getNode(), 1);
23483}
23484
23485/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23486/// equivalent.
23487static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23488 const SDLoc &dl, SelectionDAG &DAG,
23489 const X86Subtarget &Subtarget) {
23490 if (isNullConstant(Op1))
23491 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23492
23493 EVT CmpVT = Op0.getValueType();
23494
23495 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23496 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23497
23498 // Only promote the compare up to I32 if it is a 16 bit operation
23499 // with an immediate. 16 bit immediates are to be avoided unless the target
23500 // isn't slowed down by length changing prefixes, we're optimizing for
23501 // codesize or the comparison is with a folded load.
23502 if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
23503 !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
23504 !DAG.getMachineFunction().getFunction().hasMinSize()) {
23505 auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
23506 auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
23507 // Don't do this if the immediate can fit in 8-bits.
23508 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23509 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23510 unsigned ExtendOp =
23511 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23512 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23513 // For equality comparisons try to use SIGN_EXTEND if the input was
23514 // truncate from something with enough sign bits.
23515 if (Op0.getOpcode() == ISD::TRUNCATE) {
23516 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23517 ExtendOp = ISD::SIGN_EXTEND;
23518 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23519 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23520 ExtendOp = ISD::SIGN_EXTEND;
23521 }
23522 }
23523
23524 CmpVT = MVT::i32;
23525 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23526 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23527 }
23528 }
23529
23530 // Try to shrink i64 compares if the input has enough zero bits.
23531 if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
23532 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23533 DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
23534 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23535 CmpVT = MVT::i32;
23536 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23537 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23538 }
23539
23540 // Try to shrink all i64 compares if the inputs are representable as signed
23541 // i32.
23542 if (CmpVT == MVT::i64 &&
23543 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23544 DAG.ComputeNumSignBits(Op1) > 32 && DAG.ComputeNumSignBits(Op0) > 32) {
23545 CmpVT = MVT::i32;
23546 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23547 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23548 }
23549
23550 // 0-x == y --> x+y == 0
23551 // 0-x != y --> x+y != 0
23552 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23553 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23554 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23555 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23556 return Add.getValue(1);
23557 }
23558
23559 // x == 0-y --> x+y == 0
23560 // x != 0-y --> x+y != 0
23561 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23562 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23563 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23564 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23565 return Add.getValue(1);
23566 }
23567
23568 // If we already have an XOR of the ops, use that to check for equality.
23569 // Else use SUB instead of CMP to enable CSE between SUB and CMP.
23570 unsigned X86Opc = X86ISD::SUB;
23571 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
23572 (DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op0, Op1}) ||
23573 DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op1, Op0})))
23574 X86Opc = X86ISD::XOR;
23575
23576 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23577 SDValue CmpOp = DAG.getNode(X86Opc, dl, VTs, Op0, Op1);
23578 return CmpOp.getValue(1);
23579}
23580
23585
23586bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
23587 SDNode *N, SDValue, SDValue IntPow2) const {
23588 if (N->getOpcode() == ISD::FDIV)
23589 return true;
23590
23591 EVT FPVT = N->getValueType(0);
23592 EVT IntVT = IntPow2.getValueType();
23593
23594 // This indicates a non-free bitcast.
23595 // TODO: This is probably overly conservative as we will need to scale the
23596 // integer vector anyways for the int->fp cast.
23597 if (FPVT.isVector() &&
23598 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
23599 return false;
23600
23601 return true;
23602}
23603
23604/// Check if replacement of SQRT with RSQRT should be disabled.
23605bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23606 EVT VT = Op.getValueType();
23607
23608 // We don't need to replace SQRT with RSQRT for half type.
23609 if (VT.getScalarType() == MVT::f16)
23610 return true;
23611
23612 // We never want to use both SQRT and RSQRT instructions for the same input.
23613 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23614 return false;
23615
23616 if (VT.isVector())
23617 return Subtarget.hasFastVectorFSQRT();
23618 return Subtarget.hasFastScalarFSQRT();
23619}
23620
23621/// The minimum architected relative accuracy is 2^-12. We need one
23622/// Newton-Raphson step to have a good float result (24 bits of precision).
23623SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23624 SelectionDAG &DAG, int Enabled,
23625 int &RefinementSteps,
23626 bool &UseOneConstNR,
23627 bool Reciprocal) const {
23628 SDLoc DL(Op);
23629 EVT VT = Op.getValueType();
23630
23631 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23632 // It is likely not profitable to do this for f64 because a double-precision
23633 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23634 // instructions: convert to single, rsqrtss, convert back to double, refine
23635 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23636 // along with FMA, this could be a throughput win.
23637 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23638 // after legalize types.
23639 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23640 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23641 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23642 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23643 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23644 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23645 RefinementSteps = 1;
23646
23647 UseOneConstNR = false;
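// The generic combiner refines this estimate E of 1/sqrt(X) with Newton-Raphson steps of the
// form E' = E * (1.5 - 0.5 * X * E * E); UseOneConstNR selects between the one- and two-constant
// encodings of that same step.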
23648 // There is no FSQRT for 512-bits, but there is RSQRT14.
23649 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23650 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
23651 if (RefinementSteps == 0 && !Reciprocal)
23652 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23653 return Estimate;
23654 }
23655
23656 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23657 Subtarget.hasFP16()) {
23658 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23659 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23660 RefinementSteps = 0;
23661
23662 if (VT == MVT::f16) {
23663 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23664 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23665 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23666 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23667 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23668 }
23669
23670 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23671 }
23672 return SDValue();
23673}
23674
23675/// The minimum architected relative accuracy is 2^-12. We need one
23676/// Newton-Raphson step to have a good float result (24 bits of precision).
23677SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23678 int Enabled,
23679 int &RefinementSteps) const {
23680 SDLoc DL(Op);
23681 EVT VT = Op.getValueType();
23682
23683 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23684 // It is likely not profitable to do this for f64 because a double-precision
23685 // reciprocal estimate with refinement on x86 prior to FMA requires
23686 // 15 instructions: convert to single, rcpss, convert back to double, refine
23687 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23688 // along with FMA, this could be a throughput win.
23689
23690 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23691 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23692 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23693 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23694 // Enable estimate codegen with 1 refinement step for vector division.
23695 // Scalar division estimates are disabled because they break too much
23696 // real-world code. These defaults are intended to match GCC behavior.
23697 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23698 return SDValue();
23699
23700 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23701 RefinementSteps = 1;
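// The generic combiner refines this estimate E of 1/X with Newton-Raphson steps of the form
// E' = E * (2.0 - X * E).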
23702
23703 // There is no FSQRT for 512-bits, but there is RCP14.
23704 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23705 return DAG.getNode(Opcode, DL, VT, Op);
23706 }
23707
23708 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23709 Subtarget.hasFP16()) {
23710 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23711 RefinementSteps = 0;
23712
23713 if (VT == MVT::f16) {
23714 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23715 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23716 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23717 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23718 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23719 }
23720
23721 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23722 }
23723 return SDValue();
23724}
23725
23726/// If we have at least two divisions that use the same divisor, convert to
23727/// multiplication by a reciprocal. This may need to be adjusted for a given
23728/// CPU if a division's cost is not at least twice the cost of a multiplication.
23729/// This is because we still need one division to calculate the reciprocal and
23730/// then we need two multiplies by that reciprocal as replacements for the
23731/// original divisions.
23732unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23733 return 2;
23734}
23735
23736SDValue
23737X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23738 SelectionDAG &DAG,
23739 SmallVectorImpl<SDNode *> &Created) const {
23740 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23741 if (isIntDivCheap(N->getValueType(0), Attr))
23742 return SDValue(N,0); // Lower SDIV as SDIV
23743
23744 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23745 "Unexpected divisor!");
23746
23747 // Only perform this transform if CMOV is supported otherwise the select
23748 // below will become a branch.
23749 if (!Subtarget.canUseCMOV())
23750 return SDValue();
23751
23752 // fold (sdiv X, pow2)
23753 EVT VT = N->getValueType(0);
23754 // FIXME: Support i8.
23755 if (VT != MVT::i16 && VT != MVT::i32 &&
23756 !(Subtarget.is64Bit() && VT == MVT::i64))
23757 return SDValue();
23758
23759 // If the divisor is 2 or -2, the default expansion is better.
23760 if (Divisor == 2 ||
23761 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
23762 return SDValue();
23763
23764 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
23765}
23766
23767/// Result of 'and' is compared against zero. Change to a BT node if possible.
23768/// Returns the BT node and the condition code needed to use it.
23769static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
23770 SelectionDAG &DAG, X86::CondCode &X86CC) {
23771 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23772 SDValue Op0 = And.getOperand(0);
23773 SDValue Op1 = And.getOperand(1);
23774 if (Op0.getOpcode() == ISD::TRUNCATE)
23775 Op0 = Op0.getOperand(0);
23776 if (Op1.getOpcode() == ISD::TRUNCATE)
23777 Op1 = Op1.getOperand(0);
23778
23779 SDValue Src, BitNo;
23780 if (Op1.getOpcode() == ISD::SHL)
23781 std::swap(Op0, Op1);
23782 if (Op0.getOpcode() == ISD::SHL) {
23783 if (isOneConstant(Op0.getOperand(0))) {
23784 // If we looked past a truncate, check that it's only truncating away
23785 // known zeros.
23786 unsigned BitWidth = Op0.getValueSizeInBits();
23787 unsigned AndBitWidth = And.getValueSizeInBits();
23788 if (BitWidth > AndBitWidth) {
23789 KnownBits Known = DAG.computeKnownBits(Op0);
23790 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23791 return SDValue();
23792 }
23793 Src = Op1;
23794 BitNo = Op0.getOperand(1);
23795 }
23796 } else if (Op1.getOpcode() == ISD::Constant) {
23797 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23798 uint64_t AndRHSVal = AndRHS->getZExtValue();
23799 SDValue AndLHS = Op0;
23800
23801 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23802 Src = AndLHS.getOperand(0);
23803 BitNo = AndLHS.getOperand(1);
23804 } else {
23805 // Use BT if the immediate can't be encoded in a TEST instruction or we
23806 // are optimizing for size and the immediate won't fit in a byte.
23807 bool OptForSize = DAG.shouldOptForSize();
23808 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23809 isPowerOf2_64(AndRHSVal)) {
23810 Src = AndLHS;
23811 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23812 Src.getValueType());
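// e.g. testing (X & (1ull << 40)) != 0 cannot use TEST with a 64-bit immediate, but becomes BT X, 40.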
23813 }
23814 }
23815 }
23816
23817 // No patterns found, give up.
23818 if (!Src.getNode())
23819 return SDValue();
23820
23821 // Remove any bit flip.
23822 if (isBitwiseNot(Src)) {
23823 Src = Src.getOperand(0);
23824 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23825 }
23826
23827 // Attempt to create the X86ISD::BT node.
23828 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23829 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23830 return BT;
23831 }
23832
23833 return SDValue();
23834}
23835
23836// Check if pre-AVX condcode can be performed by a single FCMP op.
23837static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23838 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23839}
23840
23841/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23842/// CMPs.
23843static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23844 SDValue &Op1, bool &IsAlwaysSignaling) {
23845 unsigned SSECC;
23846 bool Swap = false;
23847
23848 // SSE Condition code mapping:
23849 // 0 - EQ
23850 // 1 - LT
23851 // 2 - LE
23852 // 3 - UNORD
23853 // 4 - NEQ
23854 // 5 - NLT
23855 // 6 - NLE
23856 // 7 - ORD
23857 switch (SetCCOpcode) {
23858 // clang-format off
23859 default: llvm_unreachable("Unexpected SETCC condition");
23860 case ISD::SETOEQ:
23861 case ISD::SETEQ: SSECC = 0; break;
23862 case ISD::SETOGT:
23863 case ISD::SETGT: Swap = true; [[fallthrough]];
23864 case ISD::SETLT:
23865 case ISD::SETOLT: SSECC = 1; break;
23866 case ISD::SETOGE:
23867 case ISD::SETGE: Swap = true; [[fallthrough]];
23868 case ISD::SETLE:
23869 case ISD::SETOLE: SSECC = 2; break;
23870 case ISD::SETUO: SSECC = 3; break;
23871 case ISD::SETUNE:
23872 case ISD::SETNE: SSECC = 4; break;
23873 case ISD::SETULE: Swap = true; [[fallthrough]];
23874 case ISD::SETUGE: SSECC = 5; break;
23875 case ISD::SETULT: Swap = true; [[fallthrough]];
23876 case ISD::SETUGT: SSECC = 6; break;
23877 case ISD::SETO: SSECC = 7; break;
23878 case ISD::SETUEQ: SSECC = 8; break;
23879 case ISD::SETONE: SSECC = 12; break;
23880 // clang-format on
23881 }
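// Predicates 8 (EQ_UQ) and 12 (NEQ_OQ) only exist in AVX's extended 5-bit immediate space;
// cheapX86FSETCC_SSE() reports SETUEQ/SETONE as not cheap, so pre-AVX callers expand them into
// two compares plus a logic op instead.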
23882 if (Swap)
23883 std::swap(Op0, Op1);
23884
23885 switch (SetCCOpcode) {
23886 default:
23887 IsAlwaysSignaling = true;
23888 break;
23889 case ISD::SETEQ:
23890 case ISD::SETOEQ:
23891 case ISD::SETUEQ:
23892 case ISD::SETNE:
23893 case ISD::SETONE:
23894 case ISD::SETUNE:
23895 case ISD::SETO:
23896 case ISD::SETUO:
23897 IsAlwaysSignaling = false;
23898 break;
23899 }
23900
23901 return SSECC;
23902}
23903
23904/// Break a VSETCC 256/512-bit vector into two new 128/256 ones and then
23905/// concatenate the result back.
23906static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond,
23907 SelectionDAG &DAG, const SDLoc &dl) {
23908 assert(VT.isInteger() && LHS.getValueType() == RHS.getValueType() &&
23909 "Unsupported VTs!");
23910 SDValue CC = DAG.getCondCode(Cond);
23911
23912 // Extract the LHS Lo/Hi vectors
23913 SDValue LHS1, LHS2;
23914 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23915
23916 // Extract the RHS Lo/Hi vectors
23917 SDValue RHS1, RHS2;
23918 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23919
23920 // Issue the operation on the smaller types and concatenate the result back
23921 EVT LoVT, HiVT;
23922 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23923 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23924 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23925 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23926}
23927
23928static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl,
23929 SelectionDAG &DAG) {
23930 SDValue Op0 = Op.getOperand(0);
23931 SDValue Op1 = Op.getOperand(1);
23932 SDValue CC = Op.getOperand(2);
23933 MVT VT = Op.getSimpleValueType();
23934 assert(VT.getVectorElementType() == MVT::i1 &&
23935 "Cannot set masked compare for this operation");
23936
23937 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23938
23939 // Prefer SETGT over SETLT.
23940 if (SetCCOpcode == ISD::SETLT) {
23941 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23942 std::swap(Op0, Op1);
23943 }
23944
23945 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23946}
23947
23948/// Given a buildvector constant, return a new vector constant with each element
23949/// incremented or decremented. If incrementing or decrementing would result in
23950/// unsigned overflow or underflow or this is not a simple vector constant,
23951/// return an empty value.
23952static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23953 bool NSW) {
23954 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23955 if (!BV || !V.getValueType().isSimple())
23956 return SDValue();
23957
23958 MVT VT = V.getSimpleValueType();
23959 MVT EltVT = VT.getVectorElementType();
23960 unsigned NumElts = VT.getVectorNumElements();
23961 SmallVector<SDValue, 8> NewVecC;
23962 SDLoc DL(V);
23963 for (unsigned i = 0; i < NumElts; ++i) {
23964 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23965 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23966 return SDValue();
23967
23968 // Avoid overflow/underflow.
23969 const APInt &EltC = Elt->getAPIntValue();
23970 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23971 return SDValue();
23972 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23973 (!IsInc && EltC.isMinSignedValue())))
23974 return SDValue();
23975
23976 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23977 }
23978
23979 return DAG.getBuildVector(VT, DL, NewVecC);
23980}
23981
23982/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23983/// Op0 u<= Op1:
23984/// t = psubus Op0, Op1
23985/// pcmpeq t, <0..0>
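/// i.e. Op0 u<= Op1 holds iff the unsigned saturating subtraction Op0 - Op1 is zero.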
23986static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23987 ISD::CondCode Cond, const SDLoc &dl,
23988 const X86Subtarget &Subtarget,
23989 SelectionDAG &DAG) {
23990 if (!Subtarget.hasSSE2())
23991 return SDValue();
23992
23993 MVT VET = VT.getVectorElementType();
23994 if (VET != MVT::i8 && VET != MVT::i16)
23995 return SDValue();
23996
23997 switch (Cond) {
23998 default:
23999 return SDValue();
24000 case ISD::SETULT: {
24001 // If the comparison is against a constant we can turn this into a
24002 // setule. With psubus, setule does not require a swap. This is
24003 // beneficial because the constant in the register is no longer
24004 // clobbered as the destination operand, so it can be hoisted out of a loop.
24005 // Only do this pre-AVX since vpcmp* is no longer destructive.
24006 if (Subtarget.hasAVX())
24007 return SDValue();
24008 SDValue ULEOp1 =
24009 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
24010 if (!ULEOp1)
24011 return SDValue();
24012 Op1 = ULEOp1;
24013 break;
24014 }
24015 case ISD::SETUGT: {
24016 // If the comparison is against a constant, we can turn this into a setuge.
24017 // This is beneficial because materializing a constant 0 for the PCMPEQ is
24018 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
24019 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
24020 SDValue UGEOp1 =
24021 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
24022 if (!UGEOp1)
24023 return SDValue();
24024 Op1 = Op0;
24025 Op0 = UGEOp1;
24026 break;
24027 }
24028 // Psubus is better than flip-sign because it requires no inversion.
24029 case ISD::SETUGE:
24030 std::swap(Op0, Op1);
24031 break;
24032 case ISD::SETULE:
24033 break;
24034 }
24035
24036 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
24037 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
24038 DAG.getConstant(0, dl, VT));
24039}
24040
24041static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
24042 SelectionDAG &DAG) {
24043 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24044 Op.getOpcode() == ISD::STRICT_FSETCCS;
24045 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24046 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24047 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
24048 MVT VT = Op->getSimpleValueType(0);
24049 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
24050 MVT OpVT = Op0.getSimpleValueType();
24051 SDLoc dl(Op);
24052
24053 if (OpVT.isFloatingPoint()) {
24054 MVT EltVT = OpVT.getVectorElementType();
24055 assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
24056 EltVT == MVT::f64);
24057
24058 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24059 if (isSoftF16(EltVT, Subtarget)) {
24060 if (Subtarget.hasAVX512() && !Subtarget.hasVLX())
24061 return SDValue();
24062
24063 // Break 256-bit FP vector compare into smaller ones.
24064 if (OpVT.is256BitVector() && !Subtarget.useAVX512Regs())
24065 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24066
24067 // Break 512-bit FP vector compare into smaller ones.
24068 if (OpVT.is512BitVector())
24069 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24070
24071 MVT NVT = OpVT.changeVectorElementType(MVT::f32);
24072 if (IsStrict) {
24073 Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24074 {Chain, Op0});
24075 Op1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24076 {Chain, Op1});
24077 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
24078 {Chain, Op0, Op1, CC});
24079 }
24080 MVT DVT = VT.getVectorElementType() == MVT::i16
24081 ? VT.changeVectorElementType(MVT::i32)
24082 : VT;
24083 SDValue Cmp = DAG.getNode(Op.getOpcode(), dl, DVT,
24084 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op0),
24085 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op1), CC);
24086 return DVT == VT ? Cmp : DAG.getNode(ISD::TRUNCATE, dl, VT, Cmp);
24087 }
24088
24089 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24090
24091 // If we have a strict compare with a vXi1 result and the input is 128/256
24092 // bits we can't use a masked compare unless we have VLX. If we use a wider
24093 // compare like we do for non-strict, we might trigger spurious exceptions
24094 // from the upper elements. Instead emit a AVX compare and convert to mask.
24095 unsigned Opc;
24096 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
24097 (!IsStrict || Subtarget.hasVLX() ||
24098 Op0.getSimpleValueType().is512BitVector())) {
24099#ifndef NDEBUG
24100 unsigned Num = VT.getVectorNumElements();
24101 assert(Num <= 16 ||
24102 (Num == 32 && (EltVT == MVT::f16 || EltVT == MVT::bf16)));
24103#endif
24104 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
24105 } else {
24106 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
24107 // The SSE/AVX packed FP comparison nodes are defined with a
24108 // floating-point vector result that matches the operand type. This allows
24109 // them to work with an SSE1 target (integer vector types are not legal).
24110 VT = Op0.getSimpleValueType();
24111 }
24112
24113 SDValue Cmp;
24114 bool IsAlwaysSignaling;
24115 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
24116 if (!Subtarget.hasAVX()) {
24117 // TODO: We could use following steps to handle a quiet compare with
24118 // signaling encodings.
24119 // 1. Get ordered masks from a quiet ISD::SETO
24120 // 2. Use the masks to mask potential unordered elements in operand A, B
24121 // 3. Get the compare results of masked A, B
24122 // 4. Calculating final result using the mask and result from 3
24123 // But currently, we just fall back to scalar operations.
24124 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
24125 return SDValue();
24126
24127 // Insert an extra signaling instruction to raise exception.
24128 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
24129 SDValue SignalCmp = DAG.getNode(
24130 Opc, dl, {VT, MVT::Other},
24131 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
24132 // FIXME: It seems we need to update the flags of all new strict nodes.
24133 // Otherwise, mayRaiseFPException in MI will return false due to
24134 // NoFPExcept = false by default. However, I didn't find it in other
24135 // patches.
24136 SignalCmp->setFlags(Op->getFlags());
24137 Chain = SignalCmp.getValue(1);
24138 }
24139
24140 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
24141 // emit two comparisons and a logic op to tie them together.
24142 if (!cheapX86FSETCC_SSE(Cond)) {
24143 // LLVM predicate is SETUEQ or SETONE.
24144 unsigned CC0, CC1;
24145 unsigned CombineOpc;
24146 if (Cond == ISD::SETUEQ) {
24147 CC0 = 3; // UNORD
24148 CC1 = 0; // EQ
24149 CombineOpc = X86ISD::FOR;
24150 } else {
24151 assert(Cond == ISD::SETONE);
24152 CC0 = 7; // ORD
24153 CC1 = 4; // NEQ
24154 CombineOpc = X86ISD::FAND;
24155 }
24156
24157 SDValue Cmp0, Cmp1;
24158 if (IsStrict) {
24159 Cmp0 = DAG.getNode(
24160 Opc, dl, {VT, MVT::Other},
24161 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
24162 Cmp1 = DAG.getNode(
24163 Opc, dl, {VT, MVT::Other},
24164 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
24165 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
24166 Cmp1.getValue(1));
24167 } else {
24168 Cmp0 = DAG.getNode(
24169 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
24170 Cmp1 = DAG.getNode(
24171 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
24172 }
24173 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
24174 } else {
24175 if (IsStrict) {
24176 Cmp = DAG.getNode(
24177 Opc, dl, {VT, MVT::Other},
24178 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24179 Chain = Cmp.getValue(1);
24180 } else
24181 Cmp = DAG.getNode(
24182 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24183 }
24184 } else {
24185 // Handle all other FP comparisons here.
24186 if (IsStrict) {
24187 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
24188 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
24189 Cmp = DAG.getNode(
24190 Opc, dl, {VT, MVT::Other},
24191 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24192 Chain = Cmp.getValue(1);
24193 } else
24194 Cmp = DAG.getNode(
24195 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24196 }
24197
24198 if (VT.getFixedSizeInBits() >
24199 Op.getSimpleValueType().getFixedSizeInBits()) {
24200 // We emitted a compare with an XMM/YMM result. Finish converting to a
24201 // mask register using a vptestm.
24202 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
24203 Cmp = DAG.getBitcast(CastVT, Cmp);
24204 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24205 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24206 } else {
24207 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24208 // the result type of SETCC. The bitcast is expected to be optimized
24209 // away during combining/isel.
24210 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24211 }
24212
24213 if (IsStrict)
24214 return DAG.getMergeValues({Cmp, Chain}, dl);
24215
24216 return Cmp;
24217 }
24218
24219 assert(!IsStrict && "Strict SETCC only handles FP operands.");
24220
24221 [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
24222 assert(VTOp0 == Op1.getSimpleValueType() &&
24223 "Expected operands with same type!");
24225 "Invalid number of packed elements for source and destination!");
24226
24227 // The non-AVX512 code below works under the assumption that source and
24228 // destination types are the same.
24229 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24230 "Value types for source and destination must be the same!");
24231
24232 // The result is boolean, but operands are int/float
24233 if (VT.getVectorElementType() == MVT::i1) {
24234 // In the AVX-512 architecture setcc returns a mask with i1 elements,
24235 // but there is no compare instruction for i8 and i16 elements in KNL.
24236 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24237 "Unexpected operand type");
24238 return LowerIntVSETCC_AVX512(Op, dl, DAG);
24239 }
24240
24241 // Lower using XOP integer comparisons.
24242 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24243 // Translate compare code to XOP PCOM compare mode.
24244 unsigned CmpMode = 0;
24245 switch (Cond) {
24246 // clang-format off
24247 default: llvm_unreachable("Unexpected SETCC condition");
24248 case ISD::SETULT:
24249 case ISD::SETLT: CmpMode = 0x00; break;
24250 case ISD::SETULE:
24251 case ISD::SETLE: CmpMode = 0x01; break;
24252 case ISD::SETUGT:
24253 case ISD::SETGT: CmpMode = 0x02; break;
24254 case ISD::SETUGE:
24255 case ISD::SETGE: CmpMode = 0x03; break;
24256 case ISD::SETEQ: CmpMode = 0x04; break;
24257 case ISD::SETNE: CmpMode = 0x05; break;
24258 // clang-format on
24259 }
24260
24261 // Are we comparing unsigned or signed integers?
24262 unsigned Opc =
24263 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
24264
24265 return DAG.getNode(Opc, dl, VT, Op0, Op1,
24266 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
24267 }
24268
24269 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
24270 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
24271 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
24272 SDValue BC0 = peekThroughBitcasts(Op0);
24273 if (BC0.getOpcode() == ISD::AND &&
24274 isConstantPowerOf2(BC0.getOperand(1), VT.getScalarSizeInBits(),
24275 /*AllowUndefs=*/false)) {
24276 Cond = ISD::SETEQ;
24277 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
24278 }
24279 }
24280
24281 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
24282 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
24283 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
24284 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
24285 if (C1 && C1->getAPIntValue().isPowerOf2()) {
24286 unsigned BitWidth = VT.getScalarSizeInBits();
24287 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
24288
24289 SDValue Result = Op0.getOperand(0);
24290 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
24291 DAG.getConstant(ShiftAmt, dl, VT));
24292 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
24293 DAG.getConstant(BitWidth - 1, dl, VT));
24294 return Result;
24295 }
24296 }
24297
24298 // Break 256-bit integer vector compare into smaller ones.
24299 if (VT.is256BitVector() && !Subtarget.hasInt256())
24300 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24301
24302 // Break 512-bit integer vector compare into smaller ones.
24303 // TODO: Try harder to use VPCMPx + VPMOV2x?
24304 if (VT.is512BitVector())
24305 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24306
24307 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24308 // not-of-PCMPEQ:
24309 // X != INT_MIN --> X >s INT_MIN
24310 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24311 // +X != 0 --> +X >s 0
24312 APInt ConstValue;
24313 if (Cond == ISD::SETNE &&
24314 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24315 if (ConstValue.isMinSignedValue())
24316 Cond = ISD::SETGT;
24317 else if (ConstValue.isMaxSignedValue())
24318 Cond = ISD::SETLT;
24319 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24320 Cond = ISD::SETGT;
24321 }
24322
24323 // If both operands are known non-negative, then an unsigned compare is the
24324 // same as a signed compare and there's no need to flip signbits.
24325 // TODO: We could check for more general simplifications here since we're
24326 // computing known bits.
24327 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24328 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24329
24330 // Special case: Use min/max operations for unsigned compares.
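// (For example, X <=u Y can be tested as X == umin(X, Y), so a single PMINU
// followed by PCMPEQ replaces the flipped-sign PCMPGT sequence whenever UMIN
// is legal for this type.)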
24331 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24332 if (ISD::isUnsignedIntSetCC(Cond) &&
24333 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24334 TLI.isOperationLegal(ISD::UMIN, VT)) {
24335 // If we have a constant operand, increment/decrement it and change the
24336 // condition to avoid an invert.
24337 if (Cond == ISD::SETUGT) {
24338 // X > C --> X >= (C+1) --> X == umax(X, C+1)
24339 if (SDValue UGTOp1 =
24340 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
24341 Op1 = UGTOp1;
24342 Cond = ISD::SETUGE;
24343 }
24344 }
24345 if (Cond == ISD::SETULT) {
24346 // X < C --> X <= (C-1) --> X == umin(X, C-1)
24347 if (SDValue ULTOp1 =
24348 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
24349 Op1 = ULTOp1;
24350 Cond = ISD::SETULE;
24351 }
24352 }
24353 bool Invert = false;
24354 unsigned Opc;
24355 switch (Cond) {
24356 // clang-format off
24357 default: llvm_unreachable("Unexpected condition code");
24358 case ISD::SETUGT: Invert = true; [[fallthrough]];
24359 case ISD::SETULE: Opc = ISD::UMIN; break;
24360 case ISD::SETULT: Invert = true; [[fallthrough]];
24361 case ISD::SETUGE: Opc = ISD::UMAX; break;
24362 // clang-format on
24363 }
24364
24365 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24366 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
24367
24368 // If the logical-not of the result is required, perform that now.
24369 if (Invert)
24370 Result = DAG.getNOT(dl, Result, VT);
24371
24372 return Result;
24373 }
24374
24375 // Try to use SUBUS and PCMPEQ.
24376 if (FlipSigns)
24377 if (SDValue V =
24378 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
24379 return V;
24380
24381 // We are handling one of the integer comparisons here. Since SSE only has
24382 // GT and EQ comparisons for integer, swapping operands and multiple
24383 // operations may be required for some comparisons.
24384 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
24385 : X86ISD::PCMPGT;
24386 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
24387 Cond == ISD::SETGE || Cond == ISD::SETUGE;
24388 bool Invert = Cond == ISD::SETNE ||
24389 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
24390
24391 if (Swap)
24392 std::swap(Op0, Op1);
24393
24394 // Check that the operation in question is available (most are plain SSE2,
24395 // but PCMPGTQ and PCMPEQQ have different requirements).
24396 if (VT == MVT::v2i64) {
24397 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
24398 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
24399
24400 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
24401 // the odd elements over the even elements.
24402 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
24403 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
24404 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24405
24406 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24407 static const int MaskHi[] = { 1, 1, 3, 3 };
24408 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24409
24410 return DAG.getBitcast(VT, Result);
24411 }
24412
24413 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
24414 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24415 Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
24416
24417 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24418 static const int MaskHi[] = { 1, 1, 3, 3 };
24419 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24420
24421 return DAG.getBitcast(VT, Result);
24422 }
24423
24424 // If the i64 elements are sign-extended enough to be representable as i32
24425 // then we can compare the lower i32 bits and splat.
24426 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
24427 DAG.ComputeNumSignBits(Op1) > 32) {
24428 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24429 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24430
24431 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24432 static const int MaskLo[] = {0, 0, 2, 2};
24433 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24434
24435 return DAG.getBitcast(VT, Result);
24436 }
24437
24438 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24439 // bits of the inputs before performing those operations. The lower
24440 // compare is always unsigned.
24441 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
24442 : 0x0000000080000000ULL,
24443 dl, MVT::v2i64);
24444
24445 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24446 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24447
24448 // Cast everything to the right type.
24449 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24450 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24451
24452 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
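// (Roughly: the high and low 32-bit halves are compared separately with
// PCMPGTD / PCMPEQD, then recombined with shuffles plus AND/OR to form the
// 64-bit result.)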
24453 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24454 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24455
24456 // Create masks for only the low parts/high parts of the 64 bit integers.
24457 static const int MaskHi[] = { 1, 1, 3, 3 };
24458 static const int MaskLo[] = { 0, 0, 2, 2 };
24459 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24460 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24461 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24462
24463 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24464 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24465
24466 if (Invert)
24467 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24468
24469 return DAG.getBitcast(VT, Result);
24470 }
24471
24472 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24473 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
24474 // pcmpeqd + pshufd + pand.
24475 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24476
24477 // First cast everything to the right type.
24478 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24479 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24480
24481 // Do the compare.
24482 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24483
24484 // Make sure the lower and upper halves are both all-ones.
24485 static const int Mask[] = { 1, 0, 3, 2 };
24486 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24487 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24488
24489 if (Invert)
24490 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24491
24492 return DAG.getBitcast(VT, Result);
24493 }
24494 }
24495
24496 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24497 // bits of the inputs before performing those operations.
24498 if (FlipSigns) {
24499 MVT EltVT = VT.getVectorElementType();
24500 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
24501 VT);
24502 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24503 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24504 }
24505
24506 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24507
24508 // If the logical-not of the result is required, perform that now.
24509 if (Invert)
24510 Result = DAG.getNOT(dl, Result, VT);
24511
24512 return Result;
24513}
24514
24515// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
24516 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
24517 const SDLoc &dl, SelectionDAG &DAG,
24518 const X86Subtarget &Subtarget,
24519 SDValue &X86CC) {
24520 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24521
24522 // Must be a bitcast from vXi1.
24523 if (Op0.getOpcode() != ISD::BITCAST)
24524 return SDValue();
24525
24526 Op0 = Op0.getOperand(0);
24527 MVT VT = Op0.getSimpleValueType();
24528 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24529 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24530 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24531 return SDValue();
24532
24533 X86::CondCode X86Cond;
24534 if (isNullConstant(Op1)) {
24535 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24536 } else if (isAllOnesConstant(Op1)) {
24537 // C flag is set for all ones.
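// (KORTEST sets ZF when the OR of the two mask operands is zero and CF when
// it is all ones, so both the ==0 and ==-1 forms reduce to one flag test.)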
24538 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24539 } else
24540 return SDValue();
24541
24542 // If the input is an AND, we can combine its operands into the KTEST.
24543 bool KTestable = false;
24544 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24545 KTestable = true;
24546 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24547 KTestable = true;
24548 if (!isNullConstant(Op1))
24549 KTestable = false;
24550 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24551 SDValue LHS = Op0.getOperand(0);
24552 SDValue RHS = Op0.getOperand(1);
24553 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24554 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24555 }
24556
24557 // If the input is an OR, we can combine its operands into the KORTEST.
24558 SDValue LHS = Op0;
24559 SDValue RHS = Op0;
24560 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24561 LHS = Op0.getOperand(0);
24562 RHS = Op0.getOperand(1);
24563 }
24564
24565 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24566 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24567}
24568
24569/// Emit flags for the given setcc condition and operands. Also returns the
24570/// corresponding X86 condition code constant in X86CC.
24571SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24572 ISD::CondCode CC, const SDLoc &dl,
24573 SelectionDAG &DAG,
24574 SDValue &X86CC) const {
24575 // Equality Combines.
24576 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
24577 X86::CondCode X86CondCode;
24578
24579 // Optimize to BT if possible.
24580 // Lower (X & (1 << N)) == 0 to BT(X, N).
24581 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24582 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
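// (e.g. "(X & (1 << N)) == 0" selects to "BT X, N": BT copies the tested bit
// into CF, so the result is read back with SETAE for ==0 and SETB for !=0.)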
24583 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
24584 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24585 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24586 return BT;
24587 }
24588 }
24589
24590 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
24591 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
24592 X86CondCode)) {
24593 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24594 return CmpZ;
24595 }
24596
24597 // Try to lower using KORTEST or KTEST.
24598 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24599 return Test;
24600
24601 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
24602 // of these.
24603 if (isOneConstant(Op1) || isNullConstant(Op1)) {
24604 // If the input is a setcc, then reuse the input setcc or use a new one
24605 // with the inverted condition.
24606 if (Op0.getOpcode() == X86ISD::SETCC) {
24607 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24608
24609 X86CC = Op0.getOperand(0);
24610 if (Invert) {
24611 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24612 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
24613 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24614 }
24615
24616 return Op0.getOperand(1);
24617 }
24618 }
24619
24620 // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
24621 // overflow.
24622 if (isMinSignedConstant(Op1)) {
24623 EVT VT = Op0.getValueType();
24624 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
24625 SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
24626 X86::CondCode CondCode = CC == ISD::SETEQ ? X86::COND_O : X86::COND_NO;
24627 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24628 SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
24629 DAG.getConstant(0, dl, VT), Op0);
24630 return SDValue(Neg.getNode(), 1);
24631 }
24632 }
24633
24634 // Try to use the carry flag from the add in place of a separate CMP for:
24635 // (seteq (add X, -1), -1). Similar for setne.
24636 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24637 Op0.getOperand(1) == Op1) {
24638 if (isProfitableToUseFlagOp(Op0)) {
24639 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24640
24641 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24642 Op0.getOperand(1));
24643 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24644 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24645 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24646 return SDValue(New.getNode(), 1);
24647 }
24648 }
24649 }
24650
24651 X86::CondCode CondCode =
24652 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24653 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24654
24655 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24656 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24657 return EFLAGS;
24658}
24659
24660SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24661
24662 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24663 Op.getOpcode() == ISD::STRICT_FSETCCS;
24664 MVT VT = Op->getSimpleValueType(0);
24665
24666 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24667
24668 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24669 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24670 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24671 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24672 SDLoc dl(Op);
24673 ISD::CondCode CC =
24674 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24675
24676 if (isSoftF16(Op0.getValueType(), Subtarget))
24677 return SDValue();
24678
24679 // Handle f128 first, since one possible outcome is a normal integer
24680 // comparison which gets handled by emitFlagsForSetcc.
24681 if (Op0.getValueType() == MVT::f128) {
24682 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24683 Op.getOpcode() == ISD::STRICT_FSETCCS);
24684
24685 // If softenSetCCOperands returned a scalar, use it.
24686 if (!Op1.getNode()) {
24687 assert(Op0.getValueType() == Op.getValueType() &&
24688 "Unexpected setcc expansion!");
24689 if (IsStrict)
24690 return DAG.getMergeValues({Op0, Chain}, dl);
24691 return Op0;
24692 }
24693 }
24694
24695 if (Op0.getSimpleValueType().isInteger()) {
24696 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
24697 // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF);
24698 // this may translate to fewer uops depending on the uarch implementation. The
24699 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24700 // canonicalize to that CondCode.
24701 // NOTE: Only do this if incrementing the constant doesn't increase the bit
24702 // encoding size - so it must either already be an i8 or i32 immediate, or it
24703 // shrinks down to that. We don't do this for any i64's to avoid additional
24704 // constant materializations.
24705 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
24706 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24707 const APInt &Op1Val = Op1C->getAPIntValue();
24708 if (!Op1Val.isZero()) {
24709 // Ensure the constant+1 doesn't overflow.
24710 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24711 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24712 APInt Op1ValPlusOne = Op1Val + 1;
24713 if (Op1ValPlusOne.isSignedIntN(32) &&
24714 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24715 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24716 CC = (CC == ISD::CondCode::SETGT) ? ISD::CondCode::SETGE
24717 : ISD::CondCode::SETUGE;
24718 }
24719 }
24720 }
24721 }
24722
24723 SDValue X86CC;
24724 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24725 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24726 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24727 }
24728
24729 if (Subtarget.hasAVX10_2()) {
24730 if (CC == ISD::SETOEQ || CC == ISD::SETUNE) {
24731 auto NewCC = (CC == ISD::SETOEQ) ? X86::COND_E : (X86::COND_NE);
24732 assert(Op0.getSimpleValueType() != MVT::bf16 && "Unsupported Type");
24733 if (Op0.getSimpleValueType() != MVT::f80) {
24734 SDValue Res = getSETCC(
24735 NewCC, DAG.getNode(X86ISD::UCOMX, dl, MVT::i32, Op0, Op1), dl, DAG);
24736 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24737 }
24738 }
24739 }
24740 // Handle floating point.
24741 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24742 if (CondCode == X86::COND_INVALID)
24743 return SDValue();
24744
24745 SDValue EFLAGS;
24746 if (IsStrict) {
24747 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24748 EFLAGS =
24749 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24750 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24751 Chain = EFLAGS.getValue(1);
24752 } else {
24753 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24754 }
24755
24756 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24757 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24758 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24759}
24760
24761SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24762 SDValue LHS = Op.getOperand(0);
24763 SDValue RHS = Op.getOperand(1);
24764 SDValue Carry = Op.getOperand(2);
24765 SDValue Cond = Op.getOperand(3);
24766 SDLoc DL(Op);
24767
24768 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24769 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24770
24771 // Recreate the carry if needed.
24772 EVT CarryVT = Carry.getValueType();
24773 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24774 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24775
24776 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24777 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24778 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24779}
24780
24781// This function returns three things: the arithmetic computation itself
24782// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24783// flag and the condition code define the case in which the arithmetic
24784// computation overflows.
24785static std::pair<SDValue, SDValue>
24787 assert(Op.getResNo() == 0 && "Unexpected result number!");
24788 SDValue Value, Overflow;
24789 SDValue LHS = Op.getOperand(0);
24790 SDValue RHS = Op.getOperand(1);
24791 unsigned BaseOp = 0;
24792 SDLoc DL(Op);
24793 switch (Op.getOpcode()) {
24794 default: llvm_unreachable("Unknown ovf instruction!");
24795 case ISD::SADDO:
24796 BaseOp = X86ISD::ADD;
24797 Cond = X86::COND_O;
24798 break;
24799 case ISD::UADDO:
24800 BaseOp = X86ISD::ADD;
24801 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
24802 break;
24803 case ISD::SSUBO:
24804 BaseOp = X86ISD::SUB;
24805 Cond = X86::COND_O;
24806 break;
24807 case ISD::USUBO:
24808 BaseOp = X86ISD::SUB;
24809 Cond = X86::COND_B;
24810 break;
24811 case ISD::SMULO:
24812 BaseOp = X86ISD::SMUL;
24813 Cond = X86::COND_O;
24814 break;
24815 case ISD::UMULO:
24816 BaseOp = X86ISD::UMUL;
24817 Cond = X86::COND_O;
24818 break;
24819 }
24820
24821 if (BaseOp) {
24822 // Also sets EFLAGS.
24823 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24824 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24825 Overflow = Value.getValue(1);
24826 }
24827
24828 return std::make_pair(Value, Overflow);
24829}
24830
24832 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24833 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24834 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24835 // has only one use.
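// (e.g. "uaddo i32 %a, %b" becomes an X86ISD::ADD that also produces EFLAGS,
// with the overflow result typically read back via SETB, since unsigned wrap
// sets the carry flag.)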
24836 SDLoc DL(Op);
24837 X86::CondCode Cond;
24838 SDValue Value, Overflow;
24839 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24840
24841 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24842 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24843 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24844}
24845
24846/// Return true if opcode is a X86 logical comparison.
24847 static bool isX86LogicalCmp(SDValue Op) {
24848 unsigned Opc = Op.getOpcode();
24849 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24850 Opc == X86ISD::FCMP)
24851 return true;
24852 if (Op.getResNo() == 1 &&
24853 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24854 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24855 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24856 return true;
24857
24858 return false;
24859}
24860
24861 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24862 if (V.getOpcode() != ISD::TRUNCATE)
24863 return false;
24864
24865 SDValue VOp0 = V.getOperand(0);
24866 unsigned InBits = VOp0.getValueSizeInBits();
24867 unsigned Bits = V.getValueSizeInBits();
24868 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24869}
24870
24871// Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
24872 static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS,
24873 unsigned X86CC, const SDLoc &DL,
24874 SelectionDAG &DAG,
24875 const X86Subtarget &Subtarget) {
24876 EVT CmpVT = CmpVal.getValueType();
24877 EVT VT = LHS.getValueType();
24878 if (!CmpVT.isScalarInteger() || !VT.isScalarInteger())
24879 return SDValue();
24880
24881 if (X86CC == X86::COND_E && CmpVal.getOpcode() == ISD::AND &&
24882 isOneConstant(CmpVal.getOperand(1))) {
24883 auto SplatLSB = [&](EVT SplatVT) {
24884 // We need a mask of all zeros or all ones with the same size as the
24885 // other operands.
24886 SDValue Neg = CmpVal;
24887 if (CmpVT.bitsGT(SplatVT))
24888 Neg = DAG.getNode(ISD::TRUNCATE, DL, SplatVT, CmpVal);
24889 else if (CmpVT.bitsLT(SplatVT))
24890 Neg = DAG.getNode(
24891 ISD::AND, DL, SplatVT,
24892 DAG.getNode(ISD::ANY_EXTEND, DL, SplatVT, CmpVal.getOperand(0)),
24893 DAG.getConstant(1, DL, SplatVT));
24894 return DAG.getNegative(Neg, DL, SplatVT); // -(and (x, 0x1))
24895 };
24896
24897 // SELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
24898 if (isNullConstant(LHS) && isAllOnesConstant(RHS))
24899 return SplatLSB(VT);
24900
24901 // SELECT (AND(X,1) == 0), C1, C2 -> XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2))
24902 if (!Subtarget.canUseCMOV() && isa<ConstantSDNode>(LHS) &&
24903 isa<ConstantSDNode>(RHS)) {
24904 SDValue Mask = SplatLSB(VT);
24905 SDValue Diff = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24906 SDValue Flip = DAG.getNode(ISD::AND, DL, VT, Mask, Diff);
24907 return DAG.getNode(ISD::XOR, DL, VT, LHS, Flip);
24908 }
24909
24910 SDValue Src1, Src2;
24911 auto isIdentityPatternZero = [&]() {
24912 switch (RHS.getOpcode()) {
24913 default:
24914 break;
24915 case ISD::OR:
24916 case ISD::XOR:
24917 case ISD::ADD:
24918 if (RHS.getOperand(0) == LHS || RHS.getOperand(1) == LHS) {
24919 Src1 = RHS.getOperand(RHS.getOperand(0) == LHS ? 1 : 0);
24920 Src2 = LHS;
24921 return true;
24922 }
24923 break;
24924 case ISD::SHL:
24925 case ISD::SRA:
24926 case ISD::SRL:
24927 case ISD::SUB:
24928 if (RHS.getOperand(0) == LHS) {
24929 Src1 = RHS.getOperand(1);
24930 Src2 = LHS;
24931 return true;
24932 }
24933 break;
24934 }
24935 return false;
24936 };
24937
24938 auto isIdentityPatternOnes = [&]() {
24939 switch (LHS.getOpcode()) {
24940 default:
24941 break;
24942 case ISD::AND:
24943 if (LHS.getOperand(0) == RHS || LHS.getOperand(1) == RHS) {
24944 Src1 = LHS.getOperand(LHS.getOperand(0) == RHS ? 1 : 0);
24945 Src2 = RHS;
24946 return true;
24947 }
24948 break;
24949 }
24950 return false;
24951 };
24952
24953 // Convert 'identity' patterns (iff X is 0 or 1):
24954 // SELECT (AND(X,1) == 0), Y, (OR Y, Z) -> (OR Y, (AND NEG(AND(X,1)), Z))
24955 // SELECT (AND(X,1) == 0), Y, (XOR Y, Z) -> (XOR Y, (AND NEG(AND(X,1)), Z))
24956 // SELECT (AND(X,1) == 0), Y, (ADD Y, Z) -> (ADD Y, (AND NEG(AND(X,1)), Z))
24957 // SELECT (AND(X,1) == 0), Y, (SUB Y, Z) -> (SUB Y, (AND NEG(AND(X,1)), Z))
24958 // SELECT (AND(X,1) == 0), Y, (SHL Y, Z) -> (SHL Y, (AND NEG(AND(X,1)), Z))
24959 // SELECT (AND(X,1) == 0), Y, (SRA Y, Z) -> (SRA Y, (AND NEG(AND(X,1)), Z))
24960 // SELECT (AND(X,1) == 0), Y, (SRL Y, Z) -> (SRL Y, (AND NEG(AND(X,1)), Z))
24961 if (!Subtarget.canUseCMOV() && isIdentityPatternZero()) {
24962 SDValue Mask = SplatLSB(Src1.getValueType());
24963 SDValue And = DAG.getNode(ISD::AND, DL, Src1.getValueType(), Mask,
24964 Src1); // Mask & z
24965 return DAG.getNode(RHS.getOpcode(), DL, VT, Src2, And); // y Op And
24966 }
24967 // SELECT (AND(X,1) == 0), (AND Y, Z), Y -> (AND Y, (OR NEG(AND(X, 1)), Z))
24968 if (!Subtarget.canUseCMOV() && isIdentityPatternOnes()) {
24969 SDValue Mask = SplatLSB(VT);
24970 SDValue Or = DAG.getNode(ISD::OR, DL, VT, Mask, Src1); // Mask | z
24971 return DAG.getNode(LHS.getOpcode(), DL, VT, Src2, Or); // y Op Or
24972 }
24973 }
24974
24975 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
24976 (isAllOnesConstant(LHS) || isAllOnesConstant(RHS))) {
24977 SDValue Y = isAllOnesConstant(RHS) ? LHS : RHS;
24978 SDVTList CmpVTs = DAG.getVTList(CmpVT, MVT::i32);
24979
24980 // 'X - 1' sets the carry flag if X == 0.
24981 // '0 - X' sets the carry flag if X != 0.
24982 // Convert the carry flag to a -1/0 mask with sbb:
24983 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24984 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24985 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24986 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
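// (In x86 terms this is typically a SUB/NEG to set CF, followed by
// "SBB reg, reg" to materialize the 0/-1 mask, then an OR with Y.)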
24987 SDValue Sub;
24988 if (isAllOnesConstant(LHS) == (X86CC == X86::COND_NE)) {
24989 SDValue Zero = DAG.getConstant(0, DL, CmpVT);
24990 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpVal);
24991 } else {
24992 SDValue One = DAG.getConstant(1, DL, CmpVT);
24993 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpVal, One);
24994 }
24995 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24996 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24997 Sub.getValue(1));
24998 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24999 }
25000
25001 return SDValue();
25002}
25003
25004SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
25005 bool AddTest = true;
25006 SDValue Cond = Op.getOperand(0);
25007 SDValue Op1 = Op.getOperand(1);
25008 SDValue Op2 = Op.getOperand(2);
25009 SDLoc DL(Op);
25010 MVT VT = Op1.getSimpleValueType();
25011 SDValue CC;
25012
25013 if (isSoftF16(VT, Subtarget)) {
25014 MVT NVT = VT.changeTypeToInteger();
25015 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
25016 DAG.getBitcast(NVT, Op1),
25017 DAG.getBitcast(NVT, Op2)));
25018 }
25019
25020 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
25021 // are available or VBLENDV if AVX is available.
25022 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
25023 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
25024 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
25025 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
25026 bool IsAlwaysSignaling;
25027 unsigned SSECC =
25028 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
25029 CondOp0, CondOp1, IsAlwaysSignaling);
25030
25031 if (Subtarget.hasAVX512()) {
25032 SDValue Cmp =
25033 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
25034 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25035 assert(!VT.isVector() && "Not a scalar type?");
25036 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25037 }
25038
25039 if (SSECC < 8 || Subtarget.hasAVX()) {
25040 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
25041 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25042
25043 // If we have SSE41/AVX, we can use a variable vector select (VBLENDV)
25044 // instead of 3 logic instructions for size savings and potentially speed.
25045 // Unfortunately, there is no scalar form of VBLENDV.
25046 //
25047 // If either operand is a +0.0 constant, don't try this. We can expect to
25048 // optimize away at least one of the logic instructions later in that
25049 // case, so that sequence would be faster than a variable blend.
25050 if (Subtarget.hasSSE41() && !isNullFPConstant(Op1) &&
25051 !isNullFPConstant(Op2)) {
25052 // Convert to vectors, do a VSELECT, and convert back to scalar.
25053 // All of the conversions should be optimized away.
25054 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
25055 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
25056 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
25057 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
25058
25059 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
25060 VCmp = DAG.getBitcast(VCmpVT, VCmp);
25061
25062 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
25063
25064 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
25065 DAG.getVectorIdxConstant(0, DL));
25066 }
25067 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
25068 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
25069 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
25070 }
25071 }
25072
25073 // AVX512 fallback is to lower selects of scalar floats to masked moves.
25074 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
25075 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
25076 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25077 }
25078
25079 if (Cond.getOpcode() == ISD::SETCC &&
25080 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
25081 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
25082 Cond = NewCond;
25083 // If the condition was updated, it's possible that the operands of the
25084 // select were also updated (for example, EmitTest has a RAUW). Refresh
25085 // the local references to the select operands in case they got stale.
25086 Op1 = Op.getOperand(1);
25087 Op2 = Op.getOperand(2);
25088 }
25089 }
25090
25091 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
25092 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
25093 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
25094 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
25095 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
25096 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
25097 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25098 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25099 if (Cond.getOpcode() == X86ISD::SETCC &&
25100 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
25101 isNullConstant(Cond.getOperand(1).getOperand(1))) {
25102 SDValue Cmp = Cond.getOperand(1);
25103 SDValue CmpOp0 = Cmp.getOperand(0);
25104 unsigned CondCode = Cond.getConstantOperandVal(0);
25105
25106 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
25107 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
25108 // handling to keep the CMP with 0. This should be removed by
25109 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
25110 // cttz_zero_undef.
25111 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
25112 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
25113 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
25114 };
25115 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
25116 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
25117 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
25118 // Keep Cmp.
25119 } else if (SDValue R = LowerSELECTWithCmpZero(CmpOp0, Op1, Op2, CondCode,
25120 DL, DAG, Subtarget)) {
25121 return R;
25122 } else if (VT.isScalarInteger() && isNullConstant(Op2) &&
25123 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
25124 ((CondCode == X86::COND_S) || // smin(x, 0)
25125 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
25126 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25127 //
25128 // If the comparison is testing for a positive value, we have to invert
25129 // the sign bit mask, so only do that transform if the target has a
25130 // bitwise 'and not' instruction (the invert is free).
25131 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25132 unsigned ShCt = VT.getSizeInBits() - 1;
25133 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
25134 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
25135 if (CondCode == X86::COND_G)
25136 Shift = DAG.getNOT(DL, Shift, VT);
25137 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
25138 }
25139 }
25140
25141 // Look past (and (setcc_carry (cmp ...)), 1).
25142 if (Cond.getOpcode() == ISD::AND &&
25143 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
25144 isOneConstant(Cond.getOperand(1)))
25145 Cond = Cond.getOperand(0);
25146
25147 // Attempt to fold "raw cond" cases by treating them as:
25148 // (select (and X, 1), Op1, Op2 --> (select (icmpeq (and X, 1), 0), Op2, Op1)
25149 if (Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))
25150 if (SDValue R = LowerSELECTWithCmpZero(Cond, Op2, Op1, X86::COND_E, DL, DAG,
25151 Subtarget))
25152 return R;
25153
25154 // If condition flag is set by a X86ISD::CMP, then use it as the condition
25155 // setting operand in place of the X86ISD::SETCC.
25156 unsigned CondOpcode = Cond.getOpcode();
25157 if (CondOpcode == X86ISD::SETCC ||
25158 CondOpcode == X86ISD::SETCC_CARRY) {
25159 CC = Cond.getOperand(0);
25160
25161 SDValue Cmp = Cond.getOperand(1);
25162 bool IllegalFPCMov = false;
25163 if (VT.isFloatingPoint() && !VT.isVector() &&
25164 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
25165 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
25166
25167 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
25168 Cmp.getOpcode() == X86ISD::BT) { // FIXME
25169 Cond = Cmp;
25170 AddTest = false;
25171 }
25172 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
25173 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
25174 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
25175 SDValue Value;
25176 X86::CondCode X86Cond;
25177 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25178
25179 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
25180 AddTest = false;
25181 }
25182
25183 if (AddTest) {
25184 // Look past the truncate if the high bits are known zero.
25185 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25186 Cond = Cond.getOperand(0);
25187
25188 // We know the result of AND is compared against zero. Try to match
25189 // it to BT.
25190 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
25191 X86::CondCode X86CondCode;
25192 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
25193 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
25194 Cond = BT;
25195 AddTest = false;
25196 }
25197 }
25198 }
25199
25200 if (AddTest) {
25201 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
25202 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
25203 }
25204
25205 // a < b ? -1 : 0 -> RES = ~setcc_carry
25206 // a < b ? 0 : -1 -> RES = setcc_carry
25207 // a >= b ? -1 : 0 -> RES = setcc_carry
25208 // a >= b ? 0 : -1 -> RES = ~setcc_carry
25209 if (Cond.getOpcode() == X86ISD::SUB) {
25210 unsigned CondCode = CC->getAsZExtVal();
25211
25212 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
25213 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25214 (isNullConstant(Op1) || isNullConstant(Op2))) {
25215 SDValue Res =
25216 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
25217 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
25218 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
25219 return DAG.getNOT(DL, Res, Res.getValueType());
25220 return Res;
25221 }
25222 }
25223
25224 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
25225 // widen the cmov and push the truncate through. This avoids introducing a new
25226 // branch during isel and doesn't add any extensions.
25227 if (Op.getValueType() == MVT::i8 &&
25228 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
25229 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
25230 if (T1.getValueType() == T2.getValueType() &&
25231 // Exclude CopyFromReg to avoid partial register stalls.
25232 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
25233 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
25234 CC, Cond);
25235 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25236 }
25237 }
25238
25239 // Or finally, promote i8 cmovs if we have CMOV,
25240 // or i16 cmovs if it won't prevent folding a load.
25241 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
25242 // legal, but EmitLoweredSelect() can not deal with these extensions
25243 // being inserted between two CMOV's. (in i16 case too TBN)
25244 // https://bugs.llvm.org/show_bug.cgi?id=40974
25245 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
25246 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
25247 !X86::mayFoldLoad(Op2, Subtarget))) {
25248 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
25249 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
25250 SDValue Ops[] = { Op2, Op1, CC, Cond };
25251 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
25252 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25253 }
25254
25255 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
25256 // condition is true.
25257 SDValue Ops[] = { Op2, Op1, CC, Cond };
25258 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
25259}
25260
25261 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
25262 const X86Subtarget &Subtarget,
25263 SelectionDAG &DAG) {
25264 MVT VT = Op->getSimpleValueType(0);
25265 SDValue In = Op->getOperand(0);
25266 MVT InVT = In.getSimpleValueType();
25267 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
25268 MVT VTElt = VT.getVectorElementType();
25269 unsigned NumElts = VT.getVectorNumElements();
25270
25271 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
25272 MVT ExtVT = VT;
25273 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
25274 // If v16i32 is to be avoided, we'll need to split and concatenate.
25275 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
25276 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
25277
25278 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
25279 }
25280
25281 // Widen to 512-bits if VLX is not supported.
25282 MVT WideVT = ExtVT;
25283 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
25284 NumElts *= 512 / ExtVT.getSizeInBits();
25285 InVT = MVT::getVectorVT(MVT::i1, NumElts);
25286 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In,
25287 DAG.getVectorIdxConstant(0, dl));
25288 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
25289 }
25290
25291 SDValue V;
25292 MVT WideEltVT = WideVT.getVectorElementType();
25293 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
25294 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
25295 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
25296 } else {
25297 SDValue NegOne = DAG.getAllOnesConstant(dl, WideVT);
25298 SDValue Zero = DAG.getConstant(0, dl, WideVT);
25299 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
25300 }
25301
25302 // Truncate if we had to extend i16/i8 above.
25303 if (VT != ExtVT) {
25304 WideVT = MVT::getVectorVT(VTElt, NumElts);
25305 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
25306 }
25307
25308 // Extract back to 128/256-bit if we widened.
25309 if (WideVT != VT)
25310 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
25311 DAG.getVectorIdxConstant(0, dl));
25312
25313 return V;
25314}
25315
25316 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25317 SelectionDAG &DAG) {
25318 SDValue In = Op->getOperand(0);
25319 MVT InVT = In.getSimpleValueType();
25320 SDLoc DL(Op);
25321
25322 if (InVT.getVectorElementType() == MVT::i1)
25323 return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
25324
25325 assert(Subtarget.hasAVX() && "Expected AVX support");
25326 return LowerAVXExtend(Op, DL, DAG, Subtarget);
25327}
25328
25329// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25330// For sign extend this needs to handle all vector sizes and SSE4.1 and
25331// non-SSE4.1 targets. For zero extend this should only handle inputs of
25332// MVT::v64i8 when BWI is not supported, but AVX512 is.
25333 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
25334 const X86Subtarget &Subtarget,
25335 SelectionDAG &DAG) {
25336 SDValue In = Op->getOperand(0);
25337 MVT VT = Op->getSimpleValueType(0);
25338 MVT InVT = In.getSimpleValueType();
25339
25340 MVT SVT = VT.getVectorElementType();
25341 MVT InSVT = InVT.getVectorElementType();
25342 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
25343
25344 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25345 return SDValue();
25346 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25347 return SDValue();
25348 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25349 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25350 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25351 return SDValue();
25352
25353 SDLoc dl(Op);
25354 unsigned Opc = Op.getOpcode();
25355 unsigned NumElts = VT.getVectorNumElements();
25356
25357 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25358 // For 512-bit vectors, we need 128-bits or 256-bits.
25359 if (InVT.getSizeInBits() > 128) {
25360 // Input needs to be at least the same number of elements as output, and
25361 // at least 128-bits.
25362 int InSize = InSVT.getSizeInBits() * NumElts;
25363 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25364 InVT = In.getSimpleValueType();
25365 }
25366
25367 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
25368 // so they are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
25369 // need to be handled here for 256/512-bit results.
25370 if (Subtarget.hasInt256()) {
25371 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25372
25373 if (InVT.getVectorNumElements() != NumElts)
25374 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25375
25376 // FIXME: Apparently we create inreg operations that could be regular
25377 // extends.
25378 unsigned ExtOpc =
25379 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
25380 : ISD::ZERO_EXTEND;
25381 return DAG.getNode(ExtOpc, dl, VT, In);
25382 }
25383
25384 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25385 if (Subtarget.hasAVX()) {
25386 assert(VT.is256BitVector() && "256-bit vector expected");
25387 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25388 int HalfNumElts = HalfVT.getVectorNumElements();
25389
25390 unsigned NumSrcElts = InVT.getVectorNumElements();
25391 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25392 for (int i = 0; i != HalfNumElts; ++i)
25393 HiMask[i] = HalfNumElts + i;
25394
25395 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25396 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25397 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25398 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25399 }
25400
25401 // We should only get here for sign extend.
25402 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25403 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25404 unsigned InNumElts = InVT.getVectorNumElements();
25405
25406 // If the source elements are already all-signbits, we don't need to extend,
25407 // just splat the elements.
25408 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
25409 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
25410 unsigned Scale = InNumElts / NumElts;
25411 SmallVector<int, 16> ShuffleMask;
25412 for (unsigned I = 0; I != NumElts; ++I)
25413 ShuffleMask.append(Scale, I);
25414 return DAG.getBitcast(VT,
25415 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
25416 }
25417
25418 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
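// (e.g. for v8i16 -> v4i32 the shuffle below places each i16 in the high half
// of a 32-bit lane, and the arithmetic shift right by 16 then replicates its
// sign bit across the lane.)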
25419 SDValue Curr = In;
25420 SDValue SignExt = Curr;
25421
25422 // As SRAI is only available on i16/i32 types, we expand only up to i32
25423 // and handle i64 separately.
25424 if (InVT != MVT::v4i32) {
25425 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25426
25427 unsigned DestWidth = DestVT.getScalarSizeInBits();
25428 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25429 unsigned DestElts = DestVT.getVectorNumElements();
25430
25431 // Build a shuffle mask that takes each input element and places it in the
25432 // MSBs of the new element size.
25433 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25434 for (unsigned i = 0; i != DestElts; ++i)
25435 Mask[i * Scale + (Scale - 1)] = i;
25436
25437 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25438 Curr = DAG.getBitcast(DestVT, Curr);
25439
25440 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25441 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25442 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25443 }
25444
25445 if (VT == MVT::v2i64) {
25446 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25447 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25448 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25449 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25450 SignExt = DAG.getBitcast(VT, SignExt);
25451 }
25452
25453 return SignExt;
25454}
25455
25456 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25457 SelectionDAG &DAG) {
25458 MVT VT = Op->getSimpleValueType(0);
25459 SDValue In = Op->getOperand(0);
25460 MVT InVT = In.getSimpleValueType();
25461 SDLoc dl(Op);
25462
25463 if (InVT.getVectorElementType() == MVT::i1)
25464 return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
25465
25466 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25468 "Expected same number of elements");
25469 assert((VT.getVectorElementType() == MVT::i16 ||
25470 VT.getVectorElementType() == MVT::i32 ||
25471 VT.getVectorElementType() == MVT::i64) &&
25472 "Unexpected element type");
25473 assert((InVT.getVectorElementType() == MVT::i8 ||
25474 InVT.getVectorElementType() == MVT::i16 ||
25475 InVT.getVectorElementType() == MVT::i32) &&
25476 "Unexpected element type");
25477
25478 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25479 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25480 return splitVectorIntUnary(Op, DAG, dl);
25481 }
25482
25483 if (Subtarget.hasInt256())
25484 return Op;
25485
25486 // Optimize vectors in AVX mode
25487 // Sign extend v8i16 to v8i32 and
25488 // v4i32 to v4i64
25489 //
25490 // Divide input vector into two parts
25491 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
25492 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
25493 // concat the vectors to original VT
25494 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25495 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25496
25497 unsigned NumElems = InVT.getVectorNumElements();
25498 SmallVector<int,8> ShufMask(NumElems, -1);
25499 for (unsigned i = 0; i != NumElems/2; ++i)
25500 ShufMask[i] = i + NumElems/2;
25501
25502 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25503 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25504
25505 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25506}
25507
25508/// Change a vector store into a pair of half-size vector stores.
25509 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
25510 SDValue StoredVal = Store->getValue();
25511 assert((StoredVal.getValueType().is256BitVector() ||
25512 StoredVal.getValueType().is512BitVector()) &&
25513 "Expecting 256/512-bit op");
25514
25515 // Splitting volatile memory ops is not allowed unless the operation was not
25516 // legal to begin with. Assume the input store is legal (this transform is
25517 // only used for targets with AVX). Note: It is possible that we have an
25518 // illegal type like v2i128, and so we could allow splitting a volatile store
25519 // in that case if that is important.
25520 if (!Store->isSimple())
25521 return SDValue();
25522
25523 SDLoc DL(Store);
25524 SDValue Value0, Value1;
25525 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25526 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25527 SDValue Ptr0 = Store->getBasePtr();
25528 SDValue Ptr1 =
25529 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
25530 SDValue Ch0 =
25531 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25532 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25533 SDValue Ch1 =
25534 DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25535 Store->getPointerInfo().getWithOffset(HalfOffset),
25536 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25537 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25538}
25539
25540/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
25541/// type.
25542 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
25543 SelectionDAG &DAG) {
25544 SDValue StoredVal = Store->getValue();
25545 assert(StoreVT.is128BitVector() &&
25546 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25547 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25548
25549 // Splitting volatile memory ops is not allowed unless the operation was not
25550 // legal to begin with. We are assuming the input op is legal (this transform
25551 // is only used for targets with AVX).
25552 if (!Store->isSimple())
25553 return SDValue();
25554
25555 MVT StoreSVT = StoreVT.getScalarType();
25556 unsigned NumElems = StoreVT.getVectorNumElements();
25557 unsigned ScalarSize = StoreSVT.getStoreSize();
25558
25559 SDLoc DL(Store);
25560 SmallVector<SDValue, 4> Stores;
25561 for (unsigned i = 0; i != NumElems; ++i) {
25562 unsigned Offset = i * ScalarSize;
25563 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25564 TypeSize::getFixed(Offset), DL);
25565 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25566 DAG.getVectorIdxConstant(i, DL));
25567 SDValue Ch =
25568 DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25569 Store->getPointerInfo().getWithOffset(Offset),
25570 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25571 Stores.push_back(Ch);
25572 }
25573 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25574}
25575
25576static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25577 SelectionDAG &DAG) {
25578 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25579 SDLoc dl(St);
25580 SDValue StoredVal = St->getValue();
25581
25582 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25583 if (StoredVal.getValueType().isVector() &&
25584 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25585 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25586 assert(NumElts <= 8 && "Unexpected VT");
25587 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25588 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25589 "Expected AVX512F without AVX512DQI");
25590
25591 // We must pad with zeros to ensure we store zeroes to any unused bits.
25592 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25593 DAG.getUNDEF(MVT::v16i1), StoredVal,
25594 DAG.getVectorIdxConstant(0, dl));
25595 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25596 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25597 // Make sure we store zeros in the extra bits.
25598 if (NumElts < 8)
25599 StoredVal = DAG.getZeroExtendInReg(
25600 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25601
25602 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25603 St->getPointerInfo(), St->getBaseAlign(),
25604 St->getMemOperand()->getFlags());
25605 }
25606
25607 if (St->isTruncatingStore())
25608 return SDValue();
25609
25610 // If this is a 256/512-bit store of concatenated ops, we are better off
25611 // splitting that store into two half-size stores. This avoids spurious use of
25612 // concatenated ops and each half can execute independently. Some cores would
25613 // split the op into halves anyway, so the concat is purely an extra op.
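// (For example, a store of (concat_vectors v4i32 A, B) as v8i32 becomes two
// independent 128-bit stores at offsets 0 and 16 rather than materializing
// the 256-bit value first.)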
25614 MVT StoreVT = StoredVal.getSimpleValueType();
25615 if (StoreVT.is256BitVector() || StoreVT.is512BitVector()) {
25616 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal, DAG))
25617 return splitVectorStore(St, DAG);
25618 return SDValue();
25619 }
25620
25621 if (StoreVT.is32BitVector())
25622 return SDValue();
25623
25624 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25625 assert(StoreVT.is64BitVector() && "Unexpected VT");
25626 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25628 "Unexpected type action!");
25629
25630 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25631 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25632 DAG.getUNDEF(StoreVT));
25633
25634 if (Subtarget.hasSSE2()) {
25635 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25636 // and store it.
25637 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25638 MVT CastVT = MVT::getVectorVT(StVT, 2);
25639 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25640 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25641 DAG.getVectorIdxConstant(0, dl));
25642
25643 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25644 St->getPointerInfo(), St->getBaseAlign(),
25645 St->getMemOperand()->getFlags());
25646 }
25647 assert(Subtarget.hasSSE1() && "Expected SSE");
25648 SDVTList Tys = DAG.getVTList(MVT::Other);
25649 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25650 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25651 St->getMemOperand());
25652}
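// Editor's note (illustrative sketch, not part of the upstream source): for a
// plain 'store <8 x i1> %k, ptr %p' on an AVX512F-only (no DQI) target the
// code above builds roughly:
//   t0 = insert_subvector undef:v16i1, %k, 0
//   t1 = bitcast t0 to i16
//   t2 = truncate t1 to i8        ; plus a zext-in-reg for v2i1/v4i1 masks
//   store<(store (s8))> t2, %p
// i.e. the mask is spilled as a single byte rather than as a vector.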
25653
25654// Lower vector extended loads using a shuffle. If SSSE3 is not available we
25655// may emit an illegal shuffle but the expansion is still better than scalar
25656// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
25657 // we'll emit a shuffle and an arithmetic shift.
25658// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25659// TODO: It is possible to support ZExt by zeroing the undef values during
25660// the shuffle phase or after the shuffle.
25661static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25662 SelectionDAG &DAG) {
25663 MVT RegVT = Op.getSimpleValueType();
25664 assert(RegVT.isVector() && "We only custom lower vector loads.");
25665 assert(RegVT.isInteger() &&
25666 "We only custom lower integer vector loads.");
25667
25668 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25669 SDLoc dl(Ld);
25670
25671 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25672 if (RegVT.getVectorElementType() == MVT::i1) {
25673 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25674 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25675 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25676 "Expected AVX512F without AVX512DQI");
25677
25678 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25679 Ld->getPointerInfo(), Ld->getBaseAlign(),
25680 Ld->getMemOperand()->getFlags());
25681
25682 // Replace chain users with the new chain.
25683 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25684
25685 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25686 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25687 DAG.getBitcast(MVT::v16i1, Val),
25688 DAG.getVectorIdxConstant(0, dl));
25689 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25690 }
25691
25692 return SDValue();
25693}
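// Editor's note (illustrative sketch, not part of the upstream source): the
// i1-vector path above turns 'load <8 x i1>, ptr %p' into roughly:
//   t0 = load<(load (s8))> %p
//   t1 = any_extend t0 to i16
//   t2 = extract_subvector (bitcast t1 to v16i1), 0
// so the mask register is reloaded from a single byte in memory.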
25694
25695/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25696/// each of which has no other use apart from the AND / OR.
25697static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25698 Opc = Op.getOpcode();
25699 if (Opc != ISD::OR && Opc != ISD::AND)
25700 return false;
25701 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25702 Op.getOperand(0).hasOneUse() &&
25703 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25704 Op.getOperand(1).hasOneUse());
25705}
25706
25707SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25708 SDValue Chain = Op.getOperand(0);
25709 SDValue Cond = Op.getOperand(1);
25710 SDValue Dest = Op.getOperand(2);
25711 SDLoc dl(Op);
25712
25713 // Bail out when we don't have native compare instructions.
25714 if (Cond.getOpcode() == ISD::SETCC &&
25715 Cond.getOperand(0).getValueType() != MVT::f128 &&
25716 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
25717 SDValue LHS = Cond.getOperand(0);
25718 SDValue RHS = Cond.getOperand(1);
25719 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25720
25721 // Special case for
25722 // setcc([su]{add,sub,mul}o == 0)
25723 // setcc([su]{add,sub,mul}o != 1)
25724 if (ISD::isOverflowIntrOpRes(LHS) &&
25725 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25726 (isNullConstant(RHS) || isOneConstant(RHS))) {
25727 SDValue Value, Overflow;
25728 X86::CondCode X86Cond;
25729 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25730
25731 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25732 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25733
25734 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25735 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25736 Overflow, Op->getFlags());
25737 }
25738
25739 if (LHS.getSimpleValueType().isInteger()) {
25740 SDValue CCVal;
25741 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25742 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25743 EFLAGS, Op->getFlags());
25744 }
25745
25746 if (CC == ISD::SETOEQ) {
25747 // For FCMP_OEQ, we can emit
25748 // two branches instead of an explicit AND instruction with a
25749 // separate test. However, we only do this if this block doesn't
25750 // have a fall-through edge, because this requires an explicit
25751 // jmp when the condition is false.
25752 if (Op.getNode()->hasOneUse()) {
25753 SDNode *User = *Op.getNode()->user_begin();
25754 // Look for an unconditional branch following this conditional branch.
25755 // We need this because we need to reverse the successors in order
25756 // to implement FCMP_OEQ.
25757 if (User->getOpcode() == ISD::BR) {
25758 SDValue FalseBB = User->getOperand(1);
25759 SDNode *NewBR =
25760 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25761 assert(NewBR == User);
25762 (void)NewBR;
25763 Dest = FalseBB;
25764
25765 SDValue Cmp =
25766 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25767 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25768 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25769 CCVal, Cmp, Op->getFlags());
25770 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25771 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25772 Cmp, Op->getFlags());
25773 }
25774 }
25775 } else if (CC == ISD::SETUNE) {
25776 // For FCMP_UNE, we can emit
25777 // two branches instead of an explicit OR instruction with a
25778 // separate test.
25779 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25780 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25781 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25782 Cmp, Op->getFlags());
25783 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25784 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25785 Cmp, Op->getFlags());
25786 } else {
25787 X86::CondCode X86Cond =
25788 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25789 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25790 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25791 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25792 Cmp, Op->getFlags());
25793 }
25794 }
25795
25796 if (ISD::isOverflowIntrOpRes(Cond)) {
25797 SDValue Value, Overflow;
25798 X86::CondCode X86Cond;
25799 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25800
25801 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25802 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25803 Overflow, Op->getFlags());
25804 }
25805
25806 // Look past the truncate if the high bits are known zero.
25807 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25808 Cond = Cond.getOperand(0);
25809
25810 EVT CondVT = Cond.getValueType();
25811
25812 // Add an AND with 1 if we don't already have one.
25813 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25814 Cond =
25815 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25816
25817 SDValue LHS = Cond;
25818 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25819
25820 SDValue CCVal;
25821 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25822 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, EFLAGS,
25823 Op->getFlags());
25824}
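// Editor's note (illustrative sketch, not part of the upstream source): the
// SETOEQ path above, which reverses the branch successors, corresponds to
// machine code of the form
//   ucomiss %xmm1, %xmm0
//   jne  .LBB_false          ; ZF == 0  -> not equal
//   jp   .LBB_false          ; PF == 1  -> unordered (NaN operand)
//   jmp  .LBB_true
// which avoids materializing the OEQ predicate with SETcc plus an AND.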
25825
25826// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25827// Calls to _alloca are needed to probe the stack when allocating more than 4k
25828// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25829// that the guard pages used by the OS virtual memory manager are allocated in
25830// correct sequence.
25831SDValue
25832X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25833 SelectionDAG &DAG) const {
25834 MachineFunction &MF = DAG.getMachineFunction();
25835 bool SplitStack = MF.shouldSplitStack();
25836 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25837 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25838 SplitStack || EmitStackProbeCall;
25839 SDLoc dl(Op);
25840
25841 // Get the inputs.
25842 SDNode *Node = Op.getNode();
25843 SDValue Chain = Op.getOperand(0);
25844 SDValue Size = Op.getOperand(1);
25845 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25846 EVT VT = Node->getValueType(0);
25847
25848 // Chain the dynamic stack allocation so that it doesn't modify the stack
25849 // pointer when other instructions are using the stack.
25850 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25851
25852 bool Is64Bit = Subtarget.is64Bit();
25853 MVT SPTy = Op.getValueType().getSimpleVT();
25854
25855 SDValue Result;
25856 if (!Lower) {
25857 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25858 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25859 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25860 " not tell us which reg is the stack pointer!");
25861
25862 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25863 const Align StackAlign = TFI.getStackAlign();
25864 if (hasInlineStackProbe(MF)) {
25865 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, {SPTy, MVT::Other},
25866 {Chain, Size});
25867 Chain = Result.getValue(1);
25868 } else {
25869 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25870 Chain = SP.getValue(1);
25871 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25872 }
25873 if (Alignment && *Alignment > StackAlign)
25874 Result = DAG.getNode(
25875 ISD::AND, dl, VT, Result,
25876 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25877 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25878 } else if (SplitStack) {
25879 if (Is64Bit) {
25880 // The 64-bit implementation of segmented stacks needs to clobber both r10
25881 // and r11. This makes it impossible to use it along with nested parameters.
25882 const Function &F = MF.getFunction();
25883 for (const auto &A : F.args()) {
25884 if (A.hasNestAttr())
25885 report_fatal_error("Cannot use segmented stacks with functions that "
25886 "have nested arguments.");
25887 }
25888 }
25889
25890 Result =
25891 DAG.getNode(X86ISD::SEG_ALLOCA, dl, {SPTy, MVT::Other}, {Chain, Size});
25892 Chain = Result.getValue(1);
25893 } else {
25894 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25895 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25896 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25897
25898 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25899 Register SPReg = RegInfo->getStackRegister();
25900 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25901 Chain = SP.getValue(1);
25902
25903 if (Alignment) {
25904 SP = DAG.getNode(
25905 ISD::AND, dl, VT, SP.getValue(0),
25906 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25907 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25908 }
25909
25910 Result = SP;
25911 }
25912
25913 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
25914
25915 SDValue Ops[2] = {Result, Chain};
25916 return DAG.getMergeValues(Ops, dl);
25917}
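// Editor's note (illustrative sketch, not part of the upstream source): the
// plain (!Lower) path above amounts to
//   NewSP = SP - Size;
//   if (Alignment > StackAlign) NewSP &= ~(Alignment - 1);
//   SP = NewSP;                      // NewSP is also the returned pointer
// with the sequence bracketed by CALLSEQ_START/CALLSEQ_END so nothing else
// touches the stack pointer in between.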
25918
25919SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25920 MachineFunction &MF = DAG.getMachineFunction();
25921 SDValue Ptr = Op.getOperand(1);
25922 EVT PtrVT = Ptr.getValueType();
25923
25924 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25925
25926 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25927 SDLoc DL(Op);
25928
25929 if (!Subtarget.is64Bit() ||
25930 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25931 // vastart just stores the address of the VarArgsFrameIndex slot into the
25932 // memory location argument.
25933 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25934 return DAG.getStore(Op.getOperand(0), DL, FR, Ptr, MachinePointerInfo(SV));
25935 }
25936
25937 // __va_list_tag:
25938 // gp_offset (0 - 6 * 8)
25939 // fp_offset (48 - 48 + 8 * 16)
25940 // overflow_arg_area (point to parameters coming in memory).
25941 // reg_save_area
25942 SmallVector<SDValue, 8> MemOps;
25943 SDValue FIN = Op.getOperand(1);
25944 // Store gp_offset
25945 SDValue Store = DAG.getStore(
25946 Op.getOperand(0), DL,
25947 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25948 MachinePointerInfo(SV));
25949 MemOps.push_back(Store);
25950
25951 // Store fp_offset
25952 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25953 Store = DAG.getStore(
25954 Op.getOperand(0), DL,
25955 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25956 MachinePointerInfo(SV, 4));
25957 MemOps.push_back(Store);
25958
25959 // Store ptr to overflow_arg_area
25960 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25961 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25962 Store =
25963 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25964 MemOps.push_back(Store);
25965
25966 // Store ptr to reg_save_area.
25967 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25968 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25969 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25970 Store = DAG.getStore(
25971 Op.getOperand(0), DL, RSFIN, FIN,
25972 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25973 MemOps.push_back(Store);
25974 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25975}
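// Editor's note (illustrative, not part of the upstream source): for the SysV
// x86-64 va_list the four stores above fill in
//   +0   gp_offset          (i32)
//   +4   fp_offset          (i32)
//   +8   overflow_arg_area  (ptr)
//   +16  reg_save_area      (ptr; at +12 on x32, where pointers are 4 bytes)
// matching the struct layout sketched in the comment above.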
25976
25977SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25978 assert(Subtarget.is64Bit() &&
25979 "LowerVAARG only handles 64-bit va_arg!");
25980 assert(Op.getNumOperands() == 4);
25981
25982 MachineFunction &MF = DAG.getMachineFunction();
25983 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25984 // The Win64 ABI uses char* instead of a structure.
25985 return DAG.expandVAArg(Op.getNode());
25986
25987 SDValue Chain = Op.getOperand(0);
25988 SDValue SrcPtr = Op.getOperand(1);
25989 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25990 unsigned Align = Op.getConstantOperandVal(3);
25991 SDLoc dl(Op);
25992
25993 EVT ArgVT = Op.getNode()->getValueType(0);
25994 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25995 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25996 uint8_t ArgMode;
25997
25998 // Decide which area this value should be read from.
25999 // TODO: Implement the AMD64 ABI in its entirety. This simple
26000 // selection mechanism works only for the basic types.
26001 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
26002 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
26003 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
26004 } else {
26005 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
26006 "Unhandled argument type in LowerVAARG");
26007 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
26008 }
26009
26010 if (ArgMode == 2) {
26011 // Make sure using fp_offset makes sense.
26012 assert(!Subtarget.useSoftFloat() &&
26013 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
26014 Subtarget.hasSSE1());
26015 }
26016
26017 // Insert VAARG node into the DAG
26018 // VAARG returns two values: Variable Argument Address, Chain
26019 SDValue InstOps[] = {Chain, SrcPtr,
26020 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
26021 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
26022 DAG.getTargetConstant(Align, dl, MVT::i32)};
26023 SDVTList VTs = DAG.getVTList(SrcPtr.getValueType(), MVT::Other);
26024 SDValue VAARG = DAG.getMemIntrinsicNode(
26025 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
26026 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
26027 /*Alignment=*/std::nullopt,
26028 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
26029 Chain = VAARG.getValue(1);
26030
26031 // Load the next argument and return it
26032 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
26033}
26034
26035static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
26036 SelectionDAG &DAG) {
26037 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
26038 // where a va_list is still an i8*.
26039 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
26040 if (Subtarget.isCallingConvWin64(
26041 DAG.getMachineFunction().getFunction().getCallingConv()))
26042 // Probably a Win64 va_copy.
26043 return DAG.expandVACopy(Op.getNode());
26044
26045 SDValue Chain = Op.getOperand(0);
26046 SDValue DstPtr = Op.getOperand(1);
26047 SDValue SrcPtr = Op.getOperand(2);
26048 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
26049 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26050 SDLoc DL(Op);
26051
26052 return DAG.getMemcpy(
26053 Chain, DL, DstPtr, SrcPtr,
26054 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
26055 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
26056 /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV),
26057 MachinePointerInfo(SrcSV));
26058}
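// Editor's note (illustrative, not part of the upstream source): the memcpy
// sizes above come straight from the va_list layout:
//   LP64: { i32, i32, ptr, ptr } = 4 + 4 + 8 + 8 = 24 bytes, 8-byte aligned
//   x32:  { i32, i32, ptr, ptr } = 4 + 4 + 4 + 4 = 16 bytes, 4-byte aligned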
26059
26060// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
26061static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
26062 switch (Opc) {
26063 case ISD::SHL:
26064 case X86ISD::VSHL:
26065 case X86ISD::VSHLI:
26066 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
26067 case ISD::SRL:
26068 case X86ISD::VSRL:
26069 case X86ISD::VSRLI:
26070 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
26071 case ISD::SRA:
26072 case X86ISD::VSRA:
26073 case X86ISD::VSRAI:
26074 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
26075 }
26076 llvm_unreachable("Unknown target vector shift node");
26077}
26078
26079/// Handle vector element shifts where the shift amount is a constant.
26080/// Takes immediate version of shift as input.
26081static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
26082 SDValue SrcOp, uint64_t ShiftAmt,
26083 SelectionDAG &DAG) {
26084 MVT ElementType = VT.getVectorElementType();
26085
26086 // Bitcast the source vector to the output type, this is mainly necessary for
26087 // vXi8/vXi64 shifts.
26088 if (VT != SrcOp.getSimpleValueType())
26089 SrcOp = DAG.getBitcast(VT, SrcOp);
26090
26091 // Fold this packed shift into its first operand if ShiftAmt is 0.
26092 if (ShiftAmt == 0)
26093 return SrcOp;
26094
26095 // Check for ShiftAmt >= element width
26096 if (ShiftAmt >= ElementType.getSizeInBits()) {
26097 if (Opc == X86ISD::VSRAI)
26098 ShiftAmt = ElementType.getSizeInBits() - 1;
26099 else
26100 return DAG.getConstant(0, dl, VT);
26101 }
26102
26104 && "Unknown target vector shift-by-constant node");
26105
26106 // Fold this packed vector shift into a build vector if SrcOp is a
26107 // vector of Constants or UNDEFs.
26108 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
26109 unsigned ShiftOpc;
26110 switch (Opc) {
26111 default: llvm_unreachable("Unknown opcode!");
26112 case X86ISD::VSHLI:
26113 ShiftOpc = ISD::SHL;
26114 break;
26115 case X86ISD::VSRLI:
26116 ShiftOpc = ISD::SRL;
26117 break;
26118 case X86ISD::VSRAI:
26119 ShiftOpc = ISD::SRA;
26120 break;
26121 }
26122
26123 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
26124 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
26125 return C;
26126 }
26127
26128 return DAG.getNode(Opc, dl, VT, SrcOp,
26129 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
26130}
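// Editor's note (hypothetical call site, not part of the upstream source): a
// caller lowering a uniform 'shl <4 x i32> %v, 5' would typically do
//   SDValue Res = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v4i32,
//                                            V, /*ShiftAmt=*/5, DAG);
// yielding (VSHLI %v, TargetConstant:i8<5>) unless %v is a constant build
// vector, in which case the shift is constant-folded away above.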
26131
26132/// Handle vector element shifts by a splat shift amount
26133static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
26134 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
26135 const X86Subtarget &Subtarget,
26136 SelectionDAG &DAG) {
26137 MVT AmtVT = ShAmt.getSimpleValueType();
26138 assert(AmtVT.isVector() && "Vector shift type mismatch");
26139 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
26140 "Illegal vector splat index");
26141
26142 // Move the splat element to the bottom element.
26143 if (ShAmtIdx != 0) {
26144 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
26145 Mask[0] = ShAmtIdx;
26146 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
26147 }
26148
26149 // Peek through any zext node if we can get back to a 128-bit source.
26150 if (AmtVT.getScalarSizeInBits() == 64 &&
26151 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
26152 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
26153 ShAmt.getOperand(0).getValueType().isSimple() &&
26154 ShAmt.getOperand(0).getValueType().is128BitVector()) {
26155 ShAmt = ShAmt.getOperand(0);
26156 AmtVT = ShAmt.getSimpleValueType();
26157 }
26158
26159 // See if we can mask off the upper elements using the existing source node.
26160 // The shift uses the entire lower 64-bits of the amount vector, so no need to
26161 // do this for vXi64 types.
26162 bool IsMasked = false;
26163 if (AmtVT.getScalarSizeInBits() < 64) {
26164 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
26165 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
26166 // If the shift amount has come from a scalar, then zero-extend the scalar
26167 // before moving to the vector.
26168 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
26169 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26170 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
26171 AmtVT = MVT::v4i32;
26172 IsMasked = true;
26173 } else if (ShAmt.getOpcode() == ISD::AND) {
26174 // See if the shift amount is already masked (e.g. for rotation modulo),
26175 // then we can zero-extend it by setting all the other mask elements to
26176 // zero.
26177 SmallVector<SDValue> MaskElts(
26178 AmtVT.getVectorNumElements(),
26179 DAG.getConstant(0, dl, AmtVT.getScalarType()));
26180 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
26181 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
26182 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
26183 {ShAmt.getOperand(1), Mask}))) {
26184 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
26185 IsMasked = true;
26186 }
26187 }
26188 }
26189
26190 // Extract if the shift amount vector is larger than 128-bits.
26191 if (AmtVT.getSizeInBits() > 128) {
26192 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
26193 AmtVT = ShAmt.getSimpleValueType();
26194 }
26195
26196 // Zero-extend bottom element to v2i64 vector type, either by extension or
26197 // shuffle masking.
26198 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
26199 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
26200 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
26201 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
26202 } else if (Subtarget.hasSSE41()) {
26203 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
26204 MVT::v2i64, ShAmt);
26205 } else {
26206 SDValue ByteShift = DAG.getTargetConstant(
26207 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
26208 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
26209 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26210 ByteShift);
26211 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26212 ByteShift);
26213 }
26214 }
26215
26216 // Change opcode to non-immediate version.
26217 Opc = getTargetVShiftUniformOpcode(Opc, true);
26218
26219 // The return type has to be a 128-bit type with the same element
26220 // type as the input type.
26221 MVT EltVT = VT.getVectorElementType();
26222 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
26223
26224 ShAmt = DAG.getBitcast(ShVT, ShAmt);
26225 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
26226}
26227
26228/// Return Mask with the necessary casting or extending
26229/// for \p Mask according to \p MaskVT when lowering masking intrinsics
26230static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
26231 const X86Subtarget &Subtarget, SelectionDAG &DAG,
26232 const SDLoc &dl) {
26233
26234 if (isAllOnesConstant(Mask))
26235 return DAG.getConstant(1, dl, MaskVT);
26236 if (X86::isZeroNode(Mask))
26237 return DAG.getConstant(0, dl, MaskVT);
26238
26239 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
26240
26241 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
26242 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
26243 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
26244 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
26245 SDValue Lo, Hi;
26246 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
26247 Lo = DAG.getBitcast(MVT::v32i1, Lo);
26248 Hi = DAG.getBitcast(MVT::v32i1, Hi);
26249 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
26250 } else {
26251 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
26252 Mask.getSimpleValueType().getSizeInBits());
26253 // In the case when MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
26254 // are extracted by EXTRACT_SUBVECTOR.
26255 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
26256 DAG.getBitcast(BitcastVT, Mask),
26257 DAG.getVectorIdxConstant(0, dl));
26258 }
26259}
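// Editor's note (illustrative, not part of the upstream source): for the
// common case of an i8 mask driving a v8i1 predicate, the generic path above
// reduces to a plain bitcast (the index-0 extract of the full vector is a
// no-op):
//   VMask = bitcast i8 %mask to v8i1
// and for a narrower MaskVT such as v2i1/v4i1 only the low lanes survive:
//   VMask = extract_subvector (bitcast i8 %mask to v8i1), 0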
26260
26261/// Return (and \p Op, \p Mask) for compare instructions or
26262/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
26263/// necessary casting or extending for \p Mask when lowering masking intrinsics
26264 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
26265 SDValue PreservedSrc,
26266 const X86Subtarget &Subtarget,
26267 SelectionDAG &DAG) {
26268 MVT VT = Op.getSimpleValueType();
26269 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
26270 unsigned OpcodeSelect = ISD::VSELECT;
26271 SDLoc dl(Op);
26272
26273 if (isAllOnesConstant(Mask))
26274 return Op;
26275
26276 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26277
26278 if (PreservedSrc.isUndef())
26279 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26280 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
26281}
26282
26283/// Creates an SDNode for a predicated scalar operation.
26284/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
26285/// The mask is coming as MVT::i8 and it should be transformed
26286/// to MVT::v1i1 while lowering masking intrinsics.
26287/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
26288/// "X86select" instead of "vselect". We just can't create the "vselect" node
26289/// for a scalar instruction.
26290 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
26291 SDValue PreservedSrc,
26292 const X86Subtarget &Subtarget,
26293 SelectionDAG &DAG) {
26294 auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
26295 if (MaskConst && (MaskConst->getZExtValue() & 0x1))
26296 return Op;
26297
26298 MVT VT = Op.getSimpleValueType();
26299 SDLoc dl(Op);
26300
26301 assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
26302 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
26303 DAG.getBitcast(MVT::v8i1, Mask),
26304 DAG.getVectorIdxConstant(0, dl));
26305 if (Op.getOpcode() == X86ISD::FSETCCM ||
26306 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
26307 Op.getOpcode() == X86ISD::VFPCLASSS)
26308 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
26309
26310 if (PreservedSrc.isUndef())
26311 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26312
26313 if (MaskConst) {
26314 assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask");
26315 // Discard op and blend passthrough with scalar op src/dst.
26316 SmallVector<int> ShuffleMask(VT.getVectorNumElements());
26317 std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
26318 ShuffleMask[0] = VT.getVectorNumElements();
26319 return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc,
26320 ShuffleMask);
26321 }
26322
26323 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
26324}
26325
26326 static int getSEHRegistrationNodeSize(const Function *Fn) {
26327 if (!Fn->hasPersonalityFn())
26328 report_fatal_error(
26329 "querying registration node size for function without personality");
26330 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
26331 // WinEHStatePass for the full struct definition.
26332 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
26333 case EHPersonality::MSVC_X86SEH: return 24;
26334 case EHPersonality::MSVC_CXX: return 16;
26335 default: break;
26336 }
26338 "can only recover FP for 32-bit MSVC EH personality functions");
26339}
26340
26341/// When the MSVC runtime transfers control to us, either to an outlined
26342/// function or when returning to a parent frame after catching an exception, we
26343/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
26344/// Here's the math:
26345/// RegNodeBase = EntryEBP - RegNodeSize
26346/// ParentFP = RegNodeBase - ParentFrameOffset
26347/// Subtracting RegNodeSize takes us to the offset of the registration node, and
26348/// subtracting the offset (negative on x86) takes us back to the parent FP.
26349 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
26350 SDValue EntryEBP) {
26351 MachineFunction &MF = DAG.getMachineFunction();
26352 SDLoc dl;
26353
26354 // It's possible that the parent function no longer has a personality function
26355 // if the exceptional code was optimized away, in which case we just return
26356 // the incoming EBP.
26357 if (!Fn->hasPersonalityFn())
26358 return EntryEBP;
26359
26360 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
26361 // registration, or the .set_setframe offset.
26362 MCSymbol *OffsetSym = MF.getContext().getOrCreateParentFrameOffsetSymbol(
26363 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26364 MVT PtrVT = EntryEBP.getValueType().getSimpleVT();
26365 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
26366 SDValue ParentFrameOffset =
26367 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
26368
26369 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
26370 // prologue to RBP in the parent function.
26371 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
26372 if (Subtarget.is64Bit())
26373 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
26374
26375 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
26376 // RegNodeBase = EntryEBP - RegNodeSize
26377 // ParentFP = RegNodeBase - ParentFrameOffset
26378 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
26379 DAG.getConstant(RegNodeSize, dl, PtrVT));
26380 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
26381}
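// Editor's note (illustrative numbers, not part of the upstream source): on
// 32-bit MSVC C++ EH with RegNodeSize = 16 and a recorded ParentFrameOffset
// of -32, the formula above gives
//   ParentFP = (EntryEBP - 16) - (-32) = EntryEBP + 16
// i.e. the negative frame offset moves us back up to the parent's frame pointer.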
26382
26383SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26384 SelectionDAG &DAG) const {
26385 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26386 auto isRoundModeCurDirection = [](SDValue Rnd) {
26387 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26388 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26389
26390 return false;
26391 };
26392 auto isRoundModeSAE = [](SDValue Rnd) {
26393 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26394 unsigned RC = C->getZExtValue();
26395 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26396 // Clear the NO_EXC bit and check remaining bits.
26397 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26398 // As a convenience we allow no other bits or explicitly
26399 // current direction.
26400 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26401 }
26402 }
26403
26404 return false;
26405 };
26406 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26407 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26408 RC = C->getZExtValue();
26409 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26410 // Clear the NO_EXC bit and check remaining bits.
26411 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26412 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
26413 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
26414 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
26415 RC == X86::STATIC_ROUNDING::TO_ZERO;
26416 }
26417 }
26418
26419 return false;
26420 };
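// Editor's note (illustrative, not part of the upstream source): the rounding
// immediates tested by the lambdas above follow the X86::STATIC_ROUNDING
// encoding from X86BaseInfo.h, e.g.
//   CUR_DIRECTION (4)          -> isRoundModeCurDirection
//   NO_EXC (8)                 -> isRoundModeSAE (SAE only)
//   TO_ZERO (3) | NO_EXC (8)   -> isRoundModeSAEToX with RC = TO_ZERO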
26421
26422 SDLoc dl(Op);
26423 unsigned IntNo = Op.getConstantOperandVal(0);
26424 MVT VT = Op.getSimpleValueType();
26425 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26426
26427 // Propagate flags from original node to transformed node(s).
26428 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26429
26430 if (IntrData) {
26431 switch(IntrData->Type) {
26432 case INTR_TYPE_1OP: {
26433 // We specify 2 possible opcodes for intrinsics with rounding modes.
26434 // First, we check if the intrinsic may have non-default rounding mode,
26435 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26436 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26437 if (IntrWithRoundingModeOpcode != 0) {
26438 SDValue Rnd = Op.getOperand(2);
26439 unsigned RC = 0;
26440 if (isRoundModeSAEToX(Rnd, RC))
26441 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26442 Op.getOperand(1),
26443 DAG.getTargetConstant(RC, dl, MVT::i32));
26444 if (!isRoundModeCurDirection(Rnd))
26445 return SDValue();
26446 }
26447 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26448 Op.getOperand(1));
26449 }
26450 case INTR_TYPE_1OP_SAE: {
26451 SDValue Sae = Op.getOperand(2);
26452
26453 unsigned Opc;
26454 if (isRoundModeCurDirection(Sae))
26455 Opc = IntrData->Opc0;
26456 else if (isRoundModeSAE(Sae))
26457 Opc = IntrData->Opc1;
26458 else
26459 return SDValue();
26460
26461 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26462 }
26463 case INTR_TYPE_2OP: {
26464 SDValue Src2 = Op.getOperand(2);
26465
26466 // We specify 2 possible opcodes for intrinsics with rounding modes.
26467 // First, we check if the intrinsic may have non-default rounding mode,
26468 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26469 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26470 if (IntrWithRoundingModeOpcode != 0) {
26471 SDValue Rnd = Op.getOperand(3);
26472 unsigned RC = 0;
26473 if (isRoundModeSAEToX(Rnd, RC))
26474 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26475 Op.getOperand(1), Src2,
26476 DAG.getTargetConstant(RC, dl, MVT::i32));
26477 if (!isRoundModeCurDirection(Rnd))
26478 return SDValue();
26479 }
26480
26481 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26482 Op.getOperand(1), Src2);
26483 }
26484 case INTR_TYPE_2OP_SAE: {
26485 SDValue Sae = Op.getOperand(3);
26486
26487 unsigned Opc;
26488 if (isRoundModeCurDirection(Sae))
26489 Opc = IntrData->Opc0;
26490 else if (isRoundModeSAE(Sae))
26491 Opc = IntrData->Opc1;
26492 else
26493 return SDValue();
26494
26495 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26496 Op.getOperand(2));
26497 }
26498 case INTR_TYPE_3OP:
26499 case INTR_TYPE_3OP_IMM8: {
26500 SDValue Src1 = Op.getOperand(1);
26501 SDValue Src2 = Op.getOperand(2);
26502 SDValue Src3 = Op.getOperand(3);
26503
26504 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26505 Src3.getValueType() != MVT::i8) {
26506 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
26507 }
26508
26509 // We specify 2 possible opcodes for intrinsics with rounding modes.
26510 // First, we check if the intrinsic may have non-default rounding mode,
26511 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26512 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26513 if (IntrWithRoundingModeOpcode != 0) {
26514 SDValue Rnd = Op.getOperand(4);
26515 unsigned RC = 0;
26516 if (isRoundModeSAEToX(Rnd, RC))
26517 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26518 Src1, Src2, Src3,
26519 DAG.getTargetConstant(RC, dl, MVT::i32));
26520 if (!isRoundModeCurDirection(Rnd))
26521 return SDValue();
26522 }
26523
26524 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26525 {Src1, Src2, Src3});
26526 }
26527 case INTR_TYPE_4OP_IMM8: {
26528 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26529 SDValue Src4 = Op.getOperand(4);
26530 if (Src4.getValueType() != MVT::i8) {
26531 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
26532 }
26533
26534 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26535 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26536 Src4);
26537 }
26538 case INTR_TYPE_1OP_MASK: {
26539 SDValue Src = Op.getOperand(1);
26540 SDValue PassThru = Op.getOperand(2);
26541 SDValue Mask = Op.getOperand(3);
26542 // We add rounding mode to the Node when
26543 // - RC Opcode is specified and
26544 // - RC is not "current direction".
26545 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26546 if (IntrWithRoundingModeOpcode != 0) {
26547 SDValue Rnd = Op.getOperand(4);
26548 unsigned RC = 0;
26549 if (isRoundModeSAEToX(Rnd, RC))
26550 return getVectorMaskingNode(
26551 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26552 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26553 Mask, PassThru, Subtarget, DAG);
26554 if (!isRoundModeCurDirection(Rnd))
26555 return SDValue();
26556 }
26557 return getVectorMaskingNode(
26558 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26559 Subtarget, DAG);
26560 }
26561 case INTR_TYPE_1OP_MASK_SAE: {
26562 SDValue Src = Op.getOperand(1);
26563 SDValue PassThru = Op.getOperand(2);
26564 SDValue Mask = Op.getOperand(3);
26565 SDValue Rnd = Op.getOperand(4);
26566
26567 unsigned Opc;
26568 if (isRoundModeCurDirection(Rnd))
26569 Opc = IntrData->Opc0;
26570 else if (isRoundModeSAE(Rnd))
26571 Opc = IntrData->Opc1;
26572 else
26573 return SDValue();
26574
26575 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26576 Subtarget, DAG);
26577 }
26578 case INTR_TYPE_SCALAR_MASK: {
26579 SDValue Src1 = Op.getOperand(1);
26580 SDValue Src2 = Op.getOperand(2);
26581 SDValue passThru = Op.getOperand(3);
26582 SDValue Mask = Op.getOperand(4);
26583 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26584 // There are 2 kinds of intrinsics in this group:
26585 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
26586 // (2) With rounding mode and sae - 7 operands.
26587 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26588 if (Op.getNumOperands() == (5U + HasRounding)) {
26589 if (HasRounding) {
26590 SDValue Rnd = Op.getOperand(5);
26591 unsigned RC = 0;
26592 if (isRoundModeSAEToX(Rnd, RC))
26593 return getScalarMaskingNode(
26594 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26595 DAG.getTargetConstant(RC, dl, MVT::i32)),
26596 Mask, passThru, Subtarget, DAG);
26597 if (!isRoundModeCurDirection(Rnd))
26598 return SDValue();
26599 }
26600 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26601 Src2),
26602 Mask, passThru, Subtarget, DAG);
26603 }
26604
26605 assert(Op.getNumOperands() == (6U + HasRounding) &&
26606 "Unexpected intrinsic form");
26607 SDValue RoundingMode = Op.getOperand(5);
26608 unsigned Opc = IntrData->Opc0;
26609 if (HasRounding) {
26610 SDValue Sae = Op.getOperand(6);
26611 if (isRoundModeSAE(Sae))
26612 Opc = IntrWithRoundingModeOpcode;
26613 else if (!isRoundModeCurDirection(Sae))
26614 return SDValue();
26615 }
26616 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26617 Src2, RoundingMode),
26618 Mask, passThru, Subtarget, DAG);
26619 }
26620 case INTR_TYPE_SCALAR_MASK_RND: {
26621 SDValue Src1 = Op.getOperand(1);
26622 SDValue Src2 = Op.getOperand(2);
26623 SDValue passThru = Op.getOperand(3);
26624 SDValue Mask = Op.getOperand(4);
26625 SDValue Rnd = Op.getOperand(5);
26626
26627 SDValue NewOp;
26628 unsigned RC = 0;
26629 if (isRoundModeCurDirection(Rnd))
26630 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26631 else if (isRoundModeSAEToX(Rnd, RC))
26632 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26633 DAG.getTargetConstant(RC, dl, MVT::i32));
26634 else
26635 return SDValue();
26636
26637 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26638 }
26639 case INTR_TYPE_SCALAR_MASK_SAE: {
26640 SDValue Src1 = Op.getOperand(1);
26641 SDValue Src2 = Op.getOperand(2);
26642 SDValue passThru = Op.getOperand(3);
26643 SDValue Mask = Op.getOperand(4);
26644 SDValue Sae = Op.getOperand(5);
26645 unsigned Opc;
26646 if (isRoundModeCurDirection(Sae))
26647 Opc = IntrData->Opc0;
26648 else if (isRoundModeSAE(Sae))
26649 Opc = IntrData->Opc1;
26650 else
26651 return SDValue();
26652
26653 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26654 Mask, passThru, Subtarget, DAG);
26655 }
26656 case INTR_TYPE_2OP_MASK: {
26657 SDValue Src1 = Op.getOperand(1);
26658 SDValue Src2 = Op.getOperand(2);
26659 SDValue PassThru = Op.getOperand(3);
26660 SDValue Mask = Op.getOperand(4);
26661 SDValue NewOp;
26662 if (IntrData->Opc1 != 0) {
26663 SDValue Rnd = Op.getOperand(5);
26664 unsigned RC = 0;
26665 if (isRoundModeSAEToX(Rnd, RC))
26666 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26667 DAG.getTargetConstant(RC, dl, MVT::i32));
26668 else if (!isRoundModeCurDirection(Rnd))
26669 return SDValue();
26670 }
26671 if (!NewOp)
26672 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26673 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26674 }
26675 case INTR_TYPE_2OP_MASK_SAE: {
26676 SDValue Src1 = Op.getOperand(1);
26677 SDValue Src2 = Op.getOperand(2);
26678 SDValue PassThru = Op.getOperand(3);
26679 SDValue Mask = Op.getOperand(4);
26680
26681 unsigned Opc = IntrData->Opc0;
26682 if (IntrData->Opc1 != 0) {
26683 SDValue Sae = Op.getOperand(5);
26684 if (isRoundModeSAE(Sae))
26685 Opc = IntrData->Opc1;
26686 else if (!isRoundModeCurDirection(Sae))
26687 return SDValue();
26688 }
26689
26690 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26691 Mask, PassThru, Subtarget, DAG);
26692 }
26693 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26694 SDValue Src1 = Op.getOperand(1);
26695 SDValue Src2 = Op.getOperand(2);
26696 SDValue Src3 = Op.getOperand(3);
26697 SDValue PassThru = Op.getOperand(4);
26698 SDValue Mask = Op.getOperand(5);
26699 SDValue Sae = Op.getOperand(6);
26700 unsigned Opc;
26701 if (isRoundModeCurDirection(Sae))
26702 Opc = IntrData->Opc0;
26703 else if (isRoundModeSAE(Sae))
26704 Opc = IntrData->Opc1;
26705 else
26706 return SDValue();
26707
26708 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26709 Mask, PassThru, Subtarget, DAG);
26710 }
26711 case INTR_TYPE_3OP_MASK_SAE: {
26712 SDValue Src1 = Op.getOperand(1);
26713 SDValue Src2 = Op.getOperand(2);
26714 SDValue Src3 = Op.getOperand(3);
26715 SDValue PassThru = Op.getOperand(4);
26716 SDValue Mask = Op.getOperand(5);
26717
26718 unsigned Opc = IntrData->Opc0;
26719 if (IntrData->Opc1 != 0) {
26720 SDValue Sae = Op.getOperand(6);
26721 if (isRoundModeSAE(Sae))
26722 Opc = IntrData->Opc1;
26723 else if (!isRoundModeCurDirection(Sae))
26724 return SDValue();
26725 }
26726 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26727 Mask, PassThru, Subtarget, DAG);
26728 }
26729 case BLENDV: {
26730 SDValue Src1 = Op.getOperand(1);
26731 SDValue Src2 = Op.getOperand(2);
26732 SDValue Src3 = Op.getOperand(3);
26733
26734 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26735 Src3 = DAG.getBitcast(MaskVT, Src3);
26736
26737 // Reverse the operands to match VSELECT order.
26738 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26739 }
26740 case VPERM_2OP : {
26741 SDValue Src1 = Op.getOperand(1);
26742 SDValue Src2 = Op.getOperand(2);
26743
26744 // Swap Src1 and Src2 in the node creation
26745 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26746 }
26747 case CFMA_OP_MASKZ:
26748 case CFMA_OP_MASK: {
26749 SDValue Src1 = Op.getOperand(1);
26750 SDValue Src2 = Op.getOperand(2);
26751 SDValue Src3 = Op.getOperand(3);
26752 SDValue Mask = Op.getOperand(4);
26753 MVT VT = Op.getSimpleValueType();
26754
26755 SDValue PassThru = Src3;
26756 if (IntrData->Type == CFMA_OP_MASKZ)
26757 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26758
26759 // We add rounding mode to the Node when
26760 // - RC Opcode is specified and
26761 // - RC is not "current direction".
26762 SDValue NewOp;
26763 if (IntrData->Opc1 != 0) {
26764 SDValue Rnd = Op.getOperand(5);
26765 unsigned RC = 0;
26766 if (isRoundModeSAEToX(Rnd, RC))
26767 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26768 DAG.getTargetConstant(RC, dl, MVT::i32));
26769 else if (!isRoundModeCurDirection(Rnd))
26770 return SDValue();
26771 }
26772 if (!NewOp)
26773 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26774 if (IntrData->Opc0 == X86ISD::VFMADDCSH ||
26775 IntrData->Opc0 == X86ISD::VFCMADDCSH)
26776 return getScalarMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26777 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26778 }
26779 case IFMA_OP:
26780 // NOTE: We need to swizzle the operands to pass the multiply operands
26781 // first.
26782 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26783 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26784 case FPCLASSS: {
26785 SDValue Src1 = Op.getOperand(1);
26786 SDValue Imm = Op.getOperand(2);
26787 SDValue Mask = Op.getOperand(3);
26788 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26789 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26790 Subtarget, DAG);
26791 // Need to fill with zeros to ensure the bitcast will produce zeroes
26792 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26793 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26794 DAG.getConstant(0, dl, MVT::v8i1), FPclassMask,
26795 DAG.getVectorIdxConstant(0, dl));
26796 return DAG.getBitcast(MVT::i8, Ins);
26797 }
26798
26799 case CMP_MASK_CC: {
26800 MVT MaskVT = Op.getSimpleValueType();
26801 SDValue CC = Op.getOperand(3);
26802 SDValue Mask = Op.getOperand(4);
26803 // We specify 2 possible opcodes for intrinsics with rounding modes.
26804 // First, we check if the intrinsic may have non-default rounding mode,
26805 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26806 if (IntrData->Opc1 != 0) {
26807 SDValue Sae = Op.getOperand(5);
26808 if (isRoundModeSAE(Sae))
26809 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26810 Op.getOperand(2), CC, Mask, Sae);
26811 if (!isRoundModeCurDirection(Sae))
26812 return SDValue();
26813 }
26814 // Default rounding mode.
26815 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26816 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26817 }
26818 case CMP_MASK_SCALAR_CC: {
26819 SDValue Src1 = Op.getOperand(1);
26820 SDValue Src2 = Op.getOperand(2);
26821 SDValue CC = Op.getOperand(3);
26822 SDValue Mask = Op.getOperand(4);
26823
26824 SDValue Cmp;
26825 if (IntrData->Opc1 != 0) {
26826 SDValue Sae = Op.getOperand(5);
26827 if (isRoundModeSAE(Sae))
26828 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26829 else if (!isRoundModeCurDirection(Sae))
26830 return SDValue();
26831 }
26832 // Default rounding mode.
26833 if (!Cmp.getNode())
26834 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26835
26836 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26837 Subtarget, DAG);
26838 // Need to fill with zeros to ensure the bitcast will produce zeroes
26839 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26840 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26841 DAG.getConstant(0, dl, MVT::v8i1), CmpMask,
26842 DAG.getVectorIdxConstant(0, dl));
26843 return DAG.getBitcast(MVT::i8, Ins);
26844 }
26845 case COMI: { // Comparison intrinsics
26846 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26847 SDValue LHS = Op.getOperand(1);
26848 SDValue RHS = Op.getOperand(2);
26849 // Some conditions require the operands to be swapped.
26850 if (CC == ISD::SETLT || CC == ISD::SETLE)
26851 std::swap(LHS, RHS);
26852
26853 // For AVX10.2, support EQ and NE.
26854 bool HasAVX10_2_COMX =
26855 Subtarget.hasAVX10_2() && (CC == ISD::SETEQ || CC == ISD::SETNE);
26856
26857 // AVX10.2 COMPARE supports only v2f64, v4f32 or v8f16.
26858 // For BF type we need to fall back.
26859 bool HasAVX10_2_COMX_Ty = (LHS.getSimpleValueType() != MVT::v8bf16);
26860
26861 auto ComiOpCode = IntrData->Opc0;
26862 auto isUnordered = (ComiOpCode == X86ISD::UCOMI);
26863
26864 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty)
26865 ComiOpCode = isUnordered ? X86ISD::UCOMX : X86ISD::COMX;
26866
26867 SDValue Comi = DAG.getNode(ComiOpCode, dl, MVT::i32, LHS, RHS);
26868
26869 SDValue SetCC;
26870 switch (CC) {
26871 case ISD::SETEQ: {
26872 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26873 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1
26874 break;
26875 // (ZF = 1 and PF = 0)
26876 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26877 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26878 break;
26879 }
26880 case ISD::SETNE: {
26881 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26882 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0
26883 break;
26884 // (ZF = 0 or PF = 1)
26885 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26886 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26887 break;
26888 }
26889 case ISD::SETGT: // (CF = 0 and ZF = 0)
26890 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26891 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26892 break;
26893 }
26894 case ISD::SETGE: // CF = 0
26895 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26896 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26897 break;
26898 default:
26899 llvm_unreachable("Unexpected illegal condition!");
26900 }
26901 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26902 }
26903 case COMI_RM: { // Comparison intrinsics with Sae
26904 SDValue LHS = Op.getOperand(1);
26905 SDValue RHS = Op.getOperand(2);
26906 unsigned CondVal = Op.getConstantOperandVal(3);
26907 SDValue Sae = Op.getOperand(4);
26908
26909 SDValue FCmp;
26910 if (isRoundModeCurDirection(Sae))
26911 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26912 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26913 else if (isRoundModeSAE(Sae))
26914 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26915 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26916 else
26917 return SDValue();
26918 // Need to fill with zeros to ensure the bitcast will produce zeroes
26919 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26920 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26921 DAG.getConstant(0, dl, MVT::v16i1), FCmp,
26922 DAG.getVectorIdxConstant(0, dl));
26923 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26924 DAG.getBitcast(MVT::i16, Ins));
26925 }
26926 case VSHIFT: {
26927 SDValue SrcOp = Op.getOperand(1);
26928 SDValue ShAmt = Op.getOperand(2);
26929 assert(ShAmt.getValueType() == MVT::i32 &&
26930 "Unexpected VSHIFT amount type");
26931
26932 // Catch shift-by-constant.
26933 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26934 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26935 Op.getSimpleValueType(), SrcOp,
26936 CShAmt->getZExtValue(), DAG);
26937
26938 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26939 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26940 SrcOp, ShAmt, 0, Subtarget, DAG);
26941 }
26942 case COMPRESS_EXPAND_IN_REG: {
26943 SDValue Mask = Op.getOperand(3);
26944 SDValue DataToCompress = Op.getOperand(1);
26945 SDValue PassThru = Op.getOperand(2);
26946 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26947 return Op.getOperand(1);
26948
26949 // Avoid false dependency.
26950 if (PassThru.isUndef())
26951 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26952
26953 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26954 Mask);
26955 }
26956 case FIXUPIMM:
26957 case FIXUPIMM_MASKZ: {
26958 SDValue Src1 = Op.getOperand(1);
26959 SDValue Src2 = Op.getOperand(2);
26960 SDValue Src3 = Op.getOperand(3);
26961 SDValue Imm = Op.getOperand(4);
26962 SDValue Mask = Op.getOperand(5);
26963 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26964 ? Src1
26965 : getZeroVector(VT, Subtarget, DAG, dl);
26966
26967 unsigned Opc = IntrData->Opc0;
26968 if (IntrData->Opc1 != 0) {
26969 SDValue Sae = Op.getOperand(6);
26970 if (isRoundModeSAE(Sae))
26971 Opc = IntrData->Opc1;
26972 else if (!isRoundModeCurDirection(Sae))
26973 return SDValue();
26974 }
26975
26976 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26977
26978 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
26979 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26980
26981 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26982 }
26983 case ROUNDP: {
26984 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26985 // Clear the upper bits of the rounding immediate so that the legacy
26986 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26987 uint64_t Round = Op.getConstantOperandVal(2);
26988 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26989 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26990 Op.getOperand(1), RoundingMode);
26991 }
26992 case ROUNDS: {
26993 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26994 // Clear the upper bits of the rounding immediate so that the legacy
26995 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26996 uint64_t Round = Op.getConstantOperandVal(3);
26997 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26998 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26999 Op.getOperand(1), Op.getOperand(2), RoundingMode);
27000 }
27001 case BEXTRI: {
27002 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
27003
27004 uint64_t Imm = Op.getConstantOperandVal(2);
27005 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
27006 Op.getValueType());
27007 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27008 Op.getOperand(1), Control);
27009 }
27010 // ADC/SBB
27011 case ADX: {
27012 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
27013 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
27014
27015 SDValue Res;
27016 // If the carry in is zero, then we should just use ADD/SUB instead of
27017 // ADC/SBB.
27018 if (isNullConstant(Op.getOperand(1))) {
27019 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
27020 Op.getOperand(3));
27021 } else {
27022 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
27023 DAG.getAllOnesConstant(dl, MVT::i8));
27024 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
27025 Op.getOperand(3), GenCF.getValue(1));
27026 }
27027 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
27028 SDValue Results[] = { SetCC, Res };
27029 return DAG.getMergeValues(Results, dl);
27030 }
27031 case CVTPD2PS_MASK:
27032 case CVTPD2DQ_MASK:
27033 case CVTQQ2PS_MASK:
27034 case TRUNCATE_TO_REG: {
27035 SDValue Src = Op.getOperand(1);
27036 SDValue PassThru = Op.getOperand(2);
27037 SDValue Mask = Op.getOperand(3);
27038
27039 if (isAllOnesConstant(Mask))
27040 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27041
27042 MVT SrcVT = Src.getSimpleValueType();
27043 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27044 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27045 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27046 {Src, PassThru, Mask});
27047 }
27048 case TRUNCATE2_TO_REG: {
27049 SDValue Src = Op.getOperand(1);
27050 SDValue Src2 = Op.getOperand(2);
27051 SDValue PassThru = Op.getOperand(3);
27052 SDValue Mask = Op.getOperand(4);
27053
27054 if (isAllOnesConstant(Mask))
27055 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2});
27056
27057 MVT Src2VT = Src2.getSimpleValueType();
27058 MVT MaskVT = MVT::getVectorVT(MVT::i1, Src2VT.getVectorNumElements());
27059 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27060 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27061 {Src, Src2, PassThru, Mask});
27062 }
27063 case CVTPS2PH_MASK: {
27064 SDValue Src = Op.getOperand(1);
27065 SDValue Rnd = Op.getOperand(2);
27066 SDValue PassThru = Op.getOperand(3);
27067 SDValue Mask = Op.getOperand(4);
27068
27069 unsigned RC = 0;
27070 unsigned Opc = IntrData->Opc0;
27071 bool SAE = Src.getValueType().is512BitVector() &&
27072 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
27073 if (SAE) {
27074 Opc = X86ISD::CVTPS2PH_SAE;
27075 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
27076 }
27077
27078 if (isAllOnesConstant(Mask))
27079 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
27080
27081 if (SAE)
27082 Opc = X86ISD::MCVTPS2PH_SAE;
27083 else
27084 Opc = IntrData->Opc1;
27085 MVT SrcVT = Src.getSimpleValueType();
27086 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27087 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27088 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
27089 }
27090 case CVTNEPS2BF16_MASK: {
27091 SDValue Src = Op.getOperand(1);
27092 SDValue PassThru = Op.getOperand(2);
27093 SDValue Mask = Op.getOperand(3);
27094
27095 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
27096 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27097
27098 // Break false dependency.
27099 if (PassThru.isUndef())
27100 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
27101
27102 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
27103 Mask);
27104 }
27105 default:
27106 break;
27107 }
27108 }
27109
27110 switch (IntNo) {
27111 default: return SDValue(); // Don't custom lower most intrinsics.
27112
27113 // ptest and testp intrinsics. The intrinsic these come from are designed to
27114 // return an integer value, not just an instruction so lower it to the ptest
27115 // or testp pattern and a setcc for the result.
27116 case Intrinsic::x86_avx512_ktestc_b:
27117 case Intrinsic::x86_avx512_ktestc_w:
27118 case Intrinsic::x86_avx512_ktestc_d:
27119 case Intrinsic::x86_avx512_ktestc_q:
27120 case Intrinsic::x86_avx512_ktestz_b:
27121 case Intrinsic::x86_avx512_ktestz_w:
27122 case Intrinsic::x86_avx512_ktestz_d:
27123 case Intrinsic::x86_avx512_ktestz_q:
27124 case Intrinsic::x86_sse41_ptestz:
27125 case Intrinsic::x86_sse41_ptestc:
27126 case Intrinsic::x86_sse41_ptestnzc:
27127 case Intrinsic::x86_avx_ptestz_256:
27128 case Intrinsic::x86_avx_ptestc_256:
27129 case Intrinsic::x86_avx_ptestnzc_256:
27130 case Intrinsic::x86_avx_vtestz_ps:
27131 case Intrinsic::x86_avx_vtestc_ps:
27132 case Intrinsic::x86_avx_vtestnzc_ps:
27133 case Intrinsic::x86_avx_vtestz_pd:
27134 case Intrinsic::x86_avx_vtestc_pd:
27135 case Intrinsic::x86_avx_vtestnzc_pd:
27136 case Intrinsic::x86_avx_vtestz_ps_256:
27137 case Intrinsic::x86_avx_vtestc_ps_256:
27138 case Intrinsic::x86_avx_vtestnzc_ps_256:
27139 case Intrinsic::x86_avx_vtestz_pd_256:
27140 case Intrinsic::x86_avx_vtestc_pd_256:
27141 case Intrinsic::x86_avx_vtestnzc_pd_256: {
27142 unsigned TestOpc = X86ISD::PTEST;
27143 X86::CondCode X86CC;
27144 switch (IntNo) {
27145 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
27146 case Intrinsic::x86_avx512_ktestc_b:
27147 case Intrinsic::x86_avx512_ktestc_w:
27148 case Intrinsic::x86_avx512_ktestc_d:
27149 case Intrinsic::x86_avx512_ktestc_q:
27150 // CF = 1
27151 TestOpc = X86ISD::KTEST;
27152 X86CC = X86::COND_B;
27153 break;
27154 case Intrinsic::x86_avx512_ktestz_b:
27155 case Intrinsic::x86_avx512_ktestz_w:
27156 case Intrinsic::x86_avx512_ktestz_d:
27157 case Intrinsic::x86_avx512_ktestz_q:
27158 TestOpc = X86ISD::KTEST;
27159 X86CC = X86::COND_E;
27160 break;
27161 case Intrinsic::x86_avx_vtestz_ps:
27162 case Intrinsic::x86_avx_vtestz_pd:
27163 case Intrinsic::x86_avx_vtestz_ps_256:
27164 case Intrinsic::x86_avx_vtestz_pd_256:
27165 TestOpc = X86ISD::TESTP;
27166 [[fallthrough]];
27167 case Intrinsic::x86_sse41_ptestz:
27168 case Intrinsic::x86_avx_ptestz_256:
27169 // ZF = 1
27170 X86CC = X86::COND_E;
27171 break;
27172 case Intrinsic::x86_avx_vtestc_ps:
27173 case Intrinsic::x86_avx_vtestc_pd:
27174 case Intrinsic::x86_avx_vtestc_ps_256:
27175 case Intrinsic::x86_avx_vtestc_pd_256:
27176 TestOpc = X86ISD::TESTP;
27177 [[fallthrough]];
27178 case Intrinsic::x86_sse41_ptestc:
27179 case Intrinsic::x86_avx_ptestc_256:
27180 // CF = 1
27181 X86CC = X86::COND_B;
27182 break;
27183 case Intrinsic::x86_avx_vtestnzc_ps:
27184 case Intrinsic::x86_avx_vtestnzc_pd:
27185 case Intrinsic::x86_avx_vtestnzc_ps_256:
27186 case Intrinsic::x86_avx_vtestnzc_pd_256:
27187 TestOpc = X86ISD::TESTP;
27188 [[fallthrough]];
27189 case Intrinsic::x86_sse41_ptestnzc:
27190 case Intrinsic::x86_avx_ptestnzc_256:
27191 // ZF and CF = 0
27192 X86CC = X86::COND_A;
27193 break;
27194 }
27195
27196 SDValue LHS = Op.getOperand(1);
27197 SDValue RHS = Op.getOperand(2);
27198 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
27199 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
27200 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27201 }
27202
27203 case Intrinsic::x86_sse42_pcmpistria128:
27204 case Intrinsic::x86_sse42_pcmpestria128:
27205 case Intrinsic::x86_sse42_pcmpistric128:
27206 case Intrinsic::x86_sse42_pcmpestric128:
27207 case Intrinsic::x86_sse42_pcmpistrio128:
27208 case Intrinsic::x86_sse42_pcmpestrio128:
27209 case Intrinsic::x86_sse42_pcmpistris128:
27210 case Intrinsic::x86_sse42_pcmpestris128:
27211 case Intrinsic::x86_sse42_pcmpistriz128:
27212 case Intrinsic::x86_sse42_pcmpestriz128: {
27213 unsigned Opcode;
27214 X86::CondCode X86CC;
27215 switch (IntNo) {
27216 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27217 case Intrinsic::x86_sse42_pcmpistria128:
27218 Opcode = X86ISD::PCMPISTR;
27219 X86CC = X86::COND_A;
27220 break;
27221 case Intrinsic::x86_sse42_pcmpestria128:
27222 Opcode = X86ISD::PCMPESTR;
27223 X86CC = X86::COND_A;
27224 break;
27225 case Intrinsic::x86_sse42_pcmpistric128:
27226 Opcode = X86ISD::PCMPISTR;
27227 X86CC = X86::COND_B;
27228 break;
27229 case Intrinsic::x86_sse42_pcmpestric128:
27230 Opcode = X86ISD::PCMPESTR;
27231 X86CC = X86::COND_B;
27232 break;
27233 case Intrinsic::x86_sse42_pcmpistrio128:
27234 Opcode = X86ISD::PCMPISTR;
27235 X86CC = X86::COND_O;
27236 break;
27237 case Intrinsic::x86_sse42_pcmpestrio128:
27238 Opcode = X86ISD::PCMPESTR;
27239 X86CC = X86::COND_O;
27240 break;
27241 case Intrinsic::x86_sse42_pcmpistris128:
27242 Opcode = X86ISD::PCMPISTR;
27243 X86CC = X86::COND_S;
27244 break;
27245 case Intrinsic::x86_sse42_pcmpestris128:
27246 Opcode = X86ISD::PCMPESTR;
27247 X86CC = X86::COND_S;
27248 break;
27249 case Intrinsic::x86_sse42_pcmpistriz128:
27250 Opcode = X86ISD::PCMPISTR;
27251 X86CC = X86::COND_E;
27252 break;
27253 case Intrinsic::x86_sse42_pcmpestriz128:
27254 Opcode = X86ISD::PCMPESTR;
27255 X86CC = X86::COND_E;
27256 break;
27257 }
27259 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27260 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
27261 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
27262 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27263 }
27264
27265 case Intrinsic::x86_sse42_pcmpistri128:
27266 case Intrinsic::x86_sse42_pcmpestri128: {
27267 unsigned Opcode;
27268 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
27269 Opcode = X86ISD::PCMPISTR;
27270 else
27271 Opcode = X86ISD::PCMPESTR;
27272
27274 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27275 return DAG.getNode(Opcode, dl, VTs, NewOps);
27276 }
27277
27278 case Intrinsic::x86_sse42_pcmpistrm128:
27279 case Intrinsic::x86_sse42_pcmpestrm128: {
27280 unsigned Opcode;
27281 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
27282 Opcode = X86ISD::PCMPISTR;
27283 else
27284 Opcode = X86ISD::PCMPESTR;
27285
27287 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27288 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
27289 }
27290
27291 case Intrinsic::eh_sjlj_lsda: {
27292 MachineFunction &MF = DAG.getMachineFunction();
27293 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27294 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27295 auto &Context = MF.getContext();
27296 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
27297 Twine(MF.getFunctionNumber()));
27298 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
27299 DAG.getMCSymbol(S, PtrVT));
27300 }
27301
27302 case Intrinsic::x86_seh_lsda: {
27303 // Compute the symbol for the LSDA. We know it'll get emitted later.
27304 MachineFunction &MF = DAG.getMachineFunction();
27305 SDValue Op1 = Op.getOperand(1);
27306 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
27309
27310 // Generate a simple absolute symbol reference. This intrinsic is only
27311 // supported on 32-bit Windows, which isn't PIC.
27312 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
27313 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
27314 }
27315
27316 case Intrinsic::eh_recoverfp: {
27317 SDValue FnOp = Op.getOperand(1);
27318 SDValue IncomingFPOp = Op.getOperand(2);
27319 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
27320 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
27321 if (!Fn)
27322 report_fatal_error(
27323 "llvm.eh.recoverfp must take a function as the first argument");
27324 return recoverFramePointer(DAG, Fn, IncomingFPOp);
27325 }
27326
27327 case Intrinsic::localaddress: {
27328 // Returns one of the stack, base, or frame pointer registers, depending on
27329 // which is used to reference local variables.
27330 MachineFunction &MF = DAG.getMachineFunction();
27331 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27332 Register Reg;
27333 if (RegInfo->hasBasePointer(MF))
27334 Reg = RegInfo->getBaseRegister();
27335 else { // Handles the SP or FP case.
27336 bool CantUseFP = RegInfo->hasStackRealignment(MF);
27337 if (CantUseFP)
27338 Reg = RegInfo->getPtrSizedStackRegister(MF);
27339 else
27340 Reg = RegInfo->getPtrSizedFrameRegister(MF);
27341 }
27342 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
27343 }
27344 case Intrinsic::x86_avx512_vp2intersect_q_512:
27345 case Intrinsic::x86_avx512_vp2intersect_q_256:
27346 case Intrinsic::x86_avx512_vp2intersect_q_128:
27347 case Intrinsic::x86_avx512_vp2intersect_d_512:
27348 case Intrinsic::x86_avx512_vp2intersect_d_256:
27349 case Intrinsic::x86_avx512_vp2intersect_d_128: {
27350 SDLoc DL(Op);
27351 MVT MaskVT = Op.getSimpleValueType();
27352 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27354 Op.getOperand(1), Op.getOperand(2));
27355 SDValue Result0 =
27356 DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, MaskVT, Operation);
27357 SDValue Result1 =
27358 DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, MaskVT, Operation);
27359 return DAG.getMergeValues({Result0, Result1}, DL);
27360 }
27361 case Intrinsic::x86_mmx_pslli_w:
27362 case Intrinsic::x86_mmx_pslli_d:
27363 case Intrinsic::x86_mmx_pslli_q:
27364 case Intrinsic::x86_mmx_psrli_w:
27365 case Intrinsic::x86_mmx_psrli_d:
27366 case Intrinsic::x86_mmx_psrli_q:
27367 case Intrinsic::x86_mmx_psrai_w:
27368 case Intrinsic::x86_mmx_psrai_d: {
27369 SDLoc DL(Op);
27370 SDValue ShAmt = Op.getOperand(2);
27371 // If the argument is a constant, convert it to a target constant.
27372 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27373 // Clamp out-of-bounds shift amounts since they will otherwise be masked
27374 // to 8 bits, which may make them no longer out of bounds.
27375 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27376 if (ShiftAmount == 0)
27377 return Op.getOperand(1);
27378
27379 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27380 Op.getOperand(0), Op.getOperand(1),
27381 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27382 }
27383
27384 unsigned NewIntrinsic;
27385 switch (IntNo) {
27386 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27387 case Intrinsic::x86_mmx_pslli_w:
27388 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27389 break;
27390 case Intrinsic::x86_mmx_pslli_d:
27391 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27392 break;
27393 case Intrinsic::x86_mmx_pslli_q:
27394 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27395 break;
27396 case Intrinsic::x86_mmx_psrli_w:
27397 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27398 break;
27399 case Intrinsic::x86_mmx_psrli_d:
27400 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27401 break;
27402 case Intrinsic::x86_mmx_psrli_q:
27403 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27404 break;
27405 case Intrinsic::x86_mmx_psrai_w:
27406 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27407 break;
27408 case Intrinsic::x86_mmx_psrai_d:
27409 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27410 break;
27411 }
27412
27413 // The vector shift intrinsics with scalar amounts use 32-bit shift amounts,
27414 // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
27415 // MMX register.
27416 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27417 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27418 DAG.getTargetConstant(NewIntrinsic, DL,
27420 Op.getOperand(1), ShAmt);
27421 }
27422 case Intrinsic::thread_pointer: {
27423 if (Subtarget.isTargetELF()) {
27424 SDLoc dl(Op);
27425 EVT PtrVT = Op.getValueType();
27426 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27428 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27429 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27430 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27431 }
27433 "Target OS doesn't support __builtin_thread_pointer() yet.");
27434 }
27435 }
27436}
27437
27439 SDValue Src, SDValue Mask, SDValue Base,
27440 SDValue Index, SDValue ScaleOp, SDValue Chain,
27441 const X86Subtarget &Subtarget) {
27442 SDLoc dl(Op);
27443 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27444 // Scale must be constant.
27445 if (!C)
27446 return SDValue();
27447 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27448 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27449 TLI.getPointerTy(DAG.getDataLayout()));
27450 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27451 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27452 // If source is undef or we know it won't be used, use a zero vector
27453 // to break register dependency.
27454 // TODO: use undef instead and let BreakFalseDeps deal with it?
27455 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27456 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27457
27458 // Cast mask to an integer type.
27459 Mask = DAG.getBitcast(MaskVT, Mask);
27460
27462
27463 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27464 SDValue Res =
27466 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27467 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27468}
27469
27471 SDValue Src, SDValue Mask, SDValue Base,
27472 SDValue Index, SDValue ScaleOp, SDValue Chain,
27473 const X86Subtarget &Subtarget) {
27474 MVT VT = Op.getSimpleValueType();
27475 SDLoc dl(Op);
27476 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27477 // Scale must be constant.
27478 if (!C)
27479 return SDValue();
27480 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27481 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27482 TLI.getPointerTy(DAG.getDataLayout()));
27483 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27485 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27486
27487 // We support two versions of the gather intrinsics: one with a scalar mask and
27488 // one with a vXi1 mask. Convert a scalar mask to vXi1 if necessary.
27489 if (Mask.getValueType() != MaskVT)
27490 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27491
27492 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27493 // If source is undef or we know it won't be used, use a zero vector
27494 // to break register dependency.
27495 // TODO: use undef instead and let BreakFalseDeps deal with it?
27496 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27497 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27498
27500
27501 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27502 SDValue Res =
27504 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27505 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27506}
27507
27509 SDValue Src, SDValue Mask, SDValue Base,
27510 SDValue Index, SDValue ScaleOp, SDValue Chain,
27511 const X86Subtarget &Subtarget) {
27512 SDLoc dl(Op);
27513 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27514 // Scale must be constant.
27515 if (!C)
27516 return SDValue();
27517 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27518 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27519 TLI.getPointerTy(DAG.getDataLayout()));
27520 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27521 Src.getSimpleValueType().getVectorNumElements());
27522 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27523
27524 // We support two versions of the scatter intrinsics: one with a scalar mask and
27525 // one with a vXi1 mask. Convert a scalar mask to vXi1 if necessary.
27526 if (Mask.getValueType() != MaskVT)
27527 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27528
27530
27531 SDVTList VTs = DAG.getVTList(MVT::Other);
27532 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27533 SDValue Res =
27535 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27536 return Res;
27537}
27538
27540 SDValue Mask, SDValue Base, SDValue Index,
27541 SDValue ScaleOp, SDValue Chain,
27542 const X86Subtarget &Subtarget) {
27543 SDLoc dl(Op);
27544 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27545 // Scale must be constant.
27546 if (!C)
27547 return SDValue();
27548 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27549 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27550 TLI.getPointerTy(DAG.getDataLayout()));
27551 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27552 SDValue Segment = DAG.getRegister(0, MVT::i32);
27553 MVT MaskVT =
27554 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27555 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27556 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27557 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27558 return SDValue(Res, 0);
27559}
27560
27561/// Handles the lowering of builtin intrinsics with chain that return their
27562/// value into registers EDX:EAX.
27563 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
27564 /// copied to SrcReg. The assumption is that SrcReg is an implicit input to
27565 /// TargetOpcode.
27566 /// Returns a Glue value which can be used to add an extra copy-from-reg if the
27567 /// expanded intrinsic implicitly defines extra registers (i.e. not just
27568/// EDX:EAX).
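/// For example, for XGETBV (see the XGETBV case below) the XCR index in
/// operand 2 is copied into ECX, the XGETBV machine node is emitted, and the
/// EDX:EAX result is read back and merged into a single i64 value.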
27570 SelectionDAG &DAG,
27571 unsigned TargetOpcode,
27572 unsigned SrcReg,
27573 const X86Subtarget &Subtarget,
27575 SDValue Chain = N->getOperand(0);
27576 SDValue Glue;
27577
27578 if (SrcReg) {
27579 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27580 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27581 Glue = Chain.getValue(1);
27582 }
27583
27584 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27585 SDValue N1Ops[] = {Chain, Glue};
27586 SDNode *N1 = DAG.getMachineNode(
27587 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27588 Chain = SDValue(N1, 0);
27589
27590 // The expanded instruction returns its result in registers EDX:EAX; read it back.
27591 SDValue LO, HI;
27592 if (Subtarget.is64Bit()) {
27593 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27594 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27595 LO.getValue(2));
27596 } else {
27597 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27598 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27599 LO.getValue(2));
27600 }
27601 Chain = HI.getValue(1);
27602 Glue = HI.getValue(2);
27603
27604 if (Subtarget.is64Bit()) {
27605 // Merge the two 32-bit values into a 64-bit one.
27606 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27607 DAG.getConstant(32, DL, MVT::i8));
27608 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27609 Results.push_back(Chain);
27610 return Glue;
27611 }
27612
27613 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27614 SDValue Ops[] = { LO, HI };
27615 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27616 Results.push_back(Pair);
27617 Results.push_back(Chain);
27618 return Glue;
27619}
27620
27621/// Handles the lowering of builtin intrinsics that read the time stamp counter
27622/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
27623/// READCYCLECOUNTER nodes.
27624static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
27625 SelectionDAG &DAG,
27626 const X86Subtarget &Subtarget,
27628 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27629 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27630 // and the EAX register is loaded with the low-order 32 bits.
27631 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
27632 /* NoRegister */0, Subtarget,
27633 Results);
27634 if (Opcode != X86::RDTSCP)
27635 return;
27636
27637 SDValue Chain = Results[1];
27638 // The RDTSCP instruction also loads the IA32_TSC_AUX MSR (address C000_0103H)
27639 // into the ECX register; add the ECX copy explicitly to the chain.
27640 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
27641 Results[1] = ecx;
27642 Results.push_back(ecx.getValue(1));
27643}
27644
27646 SelectionDAG &DAG) {
27648 SDLoc DL(Op);
27649 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27650 Results);
27651 return DAG.getMergeValues(Results, DL);
27652}
27653
27656 SDValue Chain = Op.getOperand(0);
27657 SDValue RegNode = Op.getOperand(2);
27658 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27659 if (!EHInfo)
27660 report_fatal_error("EH registrations only live in functions using WinEH");
27661
27662 // Cast the operand to an alloca, and remember the frame index.
27663 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27664 if (!FINode)
27665 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27666 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27667
27668 // Return the chain operand without making any DAG nodes.
27669 return Chain;
27670}
27671
27674 SDValue Chain = Op.getOperand(0);
27675 SDValue EHGuard = Op.getOperand(2);
27676 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27677 if (!EHInfo)
27678 report_fatal_error("EHGuard only live in functions using WinEH");
27679
27680 // Cast the operand to an alloca, and remember the frame index.
27681 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27682 if (!FINode)
27683 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27684 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27685
27686 // Return the chain operand without making any DAG nodes.
27687 return Chain;
27688}
27689
27690/// Emit Truncating Store with signed or unsigned saturation.
27691static SDValue
27692EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
27693 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27694 SelectionDAG &DAG) {
27695 SDVTList VTs = DAG.getVTList(MVT::Other);
27696 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27697 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27698 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27699 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27700}
27701
27702/// Emit Masked Truncating Store with signed or unsigned saturation.
27703static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
27704 const SDLoc &DL,
27705 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27706 MachineMemOperand *MMO, SelectionDAG &DAG) {
27707 SDVTList VTs = DAG.getVTList(MVT::Other);
27708 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27709 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27710 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27711}
27712
27714 const MachineFunction &MF) {
27715 if (!Subtarget.is64Bit())
27716 return false;
27717 // 64-bit targets support the extended Swift async frame setup,
27718 // except for targets that use the Windows x64 prologue.
27719 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
27720}
27721
27723 SelectionDAG &DAG) {
27724 unsigned IntNo = Op.getConstantOperandVal(1);
27725 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27726 if (!IntrData) {
27727 switch (IntNo) {
27728
27729 case Intrinsic::swift_async_context_addr: {
27730 SDLoc dl(Op);
27731 auto &MF = DAG.getMachineFunction();
27732 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
27733 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
27735 X86FI->setHasSwiftAsyncContext(true);
27736 SDValue Chain = Op->getOperand(0);
27737 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
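// In the extended Swift async frame the context pointer is stored
// immediately below the saved frame pointer, so its address is RBP - 8.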
27738 SDValue Result =
27739 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
27740 DAG.getTargetConstant(8, dl, MVT::i32)),
27741 0);
27742 // Return { result, chain }.
27743 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27744 CopyRBP.getValue(1));
27745 } else {
27746 // No special extended frame, create or reuse an existing stack slot.
27747 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
27748 if (!X86FI->getSwiftAsyncContextFrameIdx())
27749 X86FI->setSwiftAsyncContextFrameIdx(
27750 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
27751 false));
27752 SDValue Result =
27753 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
27754 PtrSize == 8 ? MVT::i64 : MVT::i32);
27755 // Return { result, chain }.
27756 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27757 Op->getOperand(0));
27758 }
27759 }
27760
27761 case llvm::Intrinsic::x86_seh_ehregnode:
27762 return MarkEHRegistrationNode(Op, DAG);
27763 case llvm::Intrinsic::x86_seh_ehguard:
27764 return MarkEHGuard(Op, DAG);
27765 case llvm::Intrinsic::x86_rdpkru: {
27766 SDLoc dl(Op);
27767 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27768 // Create a RDPKRU node and pass 0 to the ECX parameter.
27769 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27770 DAG.getConstant(0, dl, MVT::i32));
27771 }
27772 case llvm::Intrinsic::x86_wrpkru: {
27773 SDLoc dl(Op);
27774 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27775 // to the EDX and ECX parameters.
27776 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27777 Op.getOperand(0), Op.getOperand(2),
27778 DAG.getConstant(0, dl, MVT::i32),
27779 DAG.getConstant(0, dl, MVT::i32));
27780 }
27781 case llvm::Intrinsic::asan_check_memaccess: {
27782 // Mark this as adjustsStack because it will be lowered to a call.
27784 // Don't do anything here, we will expand these intrinsics out later.
27785 return Op;
27786 }
27787 case llvm::Intrinsic::x86_flags_read_u32:
27788 case llvm::Intrinsic::x86_flags_read_u64:
27789 case llvm::Intrinsic::x86_flags_write_u32:
27790 case llvm::Intrinsic::x86_flags_write_u64: {
27791 // We need a frame pointer because this will get lowered to a PUSH/POP
27792 // sequence.
27795 // Don't do anything here, we will expand these intrinsics out later
27796 // during FinalizeISel in EmitInstrWithCustomInserter.
27797 return Op;
27798 }
27799 case Intrinsic::x86_lwpins32:
27800 case Intrinsic::x86_lwpins64:
27801 case Intrinsic::x86_umwait:
27802 case Intrinsic::x86_tpause: {
27803 SDLoc dl(Op);
27804 SDValue Chain = Op->getOperand(0);
27805 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27806 unsigned Opcode;
27807
27808 switch (IntNo) {
27809 default: llvm_unreachable("Impossible intrinsic");
27810 case Intrinsic::x86_umwait:
27811 Opcode = X86ISD::UMWAIT;
27812 break;
27813 case Intrinsic::x86_tpause:
27814 Opcode = X86ISD::TPAUSE;
27815 break;
27816 case Intrinsic::x86_lwpins32:
27817 case Intrinsic::x86_lwpins64:
27818 Opcode = X86ISD::LWPINS;
27819 break;
27820 }
27821
27823 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27824 Op->getOperand(3), Op->getOperand(4));
27825 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27826 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27827 Operation.getValue(1));
27828 }
27829 case Intrinsic::x86_enqcmd:
27830 case Intrinsic::x86_enqcmds: {
27831 SDLoc dl(Op);
27832 SDValue Chain = Op.getOperand(0);
27833 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27834 unsigned Opcode;
27835 switch (IntNo) {
27836 default: llvm_unreachable("Impossible intrinsic!");
27837 case Intrinsic::x86_enqcmd:
27838 Opcode = X86ISD::ENQCMD;
27839 break;
27840 case Intrinsic::x86_enqcmds:
27841 Opcode = X86ISD::ENQCMDS;
27842 break;
27843 }
27844 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27845 Op.getOperand(3));
27846 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27847 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27848 Operation.getValue(1));
27849 }
27850 case Intrinsic::x86_aesenc128kl:
27851 case Intrinsic::x86_aesdec128kl:
27852 case Intrinsic::x86_aesenc256kl:
27853 case Intrinsic::x86_aesdec256kl: {
27854 SDLoc DL(Op);
27855 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27856 SDValue Chain = Op.getOperand(0);
27857 unsigned Opcode;
27858
27859 switch (IntNo) {
27860 default: llvm_unreachable("Impossible intrinsic");
27861 case Intrinsic::x86_aesenc128kl:
27862 Opcode = X86ISD::AESENC128KL;
27863 break;
27864 case Intrinsic::x86_aesdec128kl:
27865 Opcode = X86ISD::AESDEC128KL;
27866 break;
27867 case Intrinsic::x86_aesenc256kl:
27868 Opcode = X86ISD::AESENC256KL;
27869 break;
27870 case Intrinsic::x86_aesdec256kl:
27871 Opcode = X86ISD::AESDEC256KL;
27872 break;
27873 }
27874
27876 MachineMemOperand *MMO = MemIntr->getMemOperand();
27877 EVT MemVT = MemIntr->getMemoryVT();
27879 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27880 MMO);
27881 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27882
27883 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27884 {ZF, Operation.getValue(0), Operation.getValue(2)});
27885 }
27886 case Intrinsic::x86_aesencwide128kl:
27887 case Intrinsic::x86_aesdecwide128kl:
27888 case Intrinsic::x86_aesencwide256kl:
27889 case Intrinsic::x86_aesdecwide256kl: {
27890 SDLoc DL(Op);
27891 SDVTList VTs = DAG.getVTList(
27892 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27893 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27894 SDValue Chain = Op.getOperand(0);
27895 unsigned Opcode;
27896
27897 switch (IntNo) {
27898 default: llvm_unreachable("Impossible intrinsic");
27899 case Intrinsic::x86_aesencwide128kl:
27900 Opcode = X86ISD::AESENCWIDE128KL;
27901 break;
27902 case Intrinsic::x86_aesdecwide128kl:
27903 Opcode = X86ISD::AESDECWIDE128KL;
27904 break;
27905 case Intrinsic::x86_aesencwide256kl:
27906 Opcode = X86ISD::AESENCWIDE256KL;
27907 break;
27908 case Intrinsic::x86_aesdecwide256kl:
27909 Opcode = X86ISD::AESDECWIDE256KL;
27910 break;
27911 }
27912
27914 MachineMemOperand *MMO = MemIntr->getMemOperand();
27915 EVT MemVT = MemIntr->getMemoryVT();
27917 Opcode, DL, VTs,
27918 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27919 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27920 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27921 MemVT, MMO);
27922 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27923
27924 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27925 {ZF, Operation.getValue(1), Operation.getValue(2),
27926 Operation.getValue(3), Operation.getValue(4),
27927 Operation.getValue(5), Operation.getValue(6),
27928 Operation.getValue(7), Operation.getValue(8),
27929 Operation.getValue(9)});
27930 }
27931 case Intrinsic::x86_testui: {
27932 SDLoc dl(Op);
27933 SDValue Chain = Op.getOperand(0);
27934 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27935 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27936 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27937 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27938 Operation.getValue(1));
27939 }
27940 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27941 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27942 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27943 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27944 case Intrinsic::x86_t2rpntlvwz0_internal:
27945 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27946 case Intrinsic::x86_t2rpntlvwz1_internal:
27947 case Intrinsic::x86_t2rpntlvwz1t1_internal: {
27948 auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
27950 unsigned IntNo = Op.getConstantOperandVal(1);
27951 unsigned Opc = 0;
27952 switch (IntNo) {
27953 default:
27954 llvm_unreachable("Unexpected intrinsic!");
27955 case Intrinsic::x86_t2rpntlvwz0_internal:
27956 Opc = X86::PT2RPNTLVWZ0V;
27957 break;
27958 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27959 Opc = X86::PT2RPNTLVWZ0T1V;
27960 break;
27961 case Intrinsic::x86_t2rpntlvwz1_internal:
27962 Opc = X86::PT2RPNTLVWZ1V;
27963 break;
27964 case Intrinsic::x86_t2rpntlvwz1t1_internal:
27965 Opc = X86::PT2RPNTLVWZ1T1V;
27966 break;
27967 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27968 Opc = X86::PT2RPNTLVWZ0RSV;
27969 break;
27970 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27971 Opc = X86::PT2RPNTLVWZ0RST1V;
27972 break;
27973 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27974 Opc = X86::PT2RPNTLVWZ1RSV;
27975 break;
27976 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27977 Opc = X86::PT2RPNTLVWZ1RST1V;
27978 break;
27979 }
27980
27981 SDLoc DL(Op);
27982 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27983
27984 SDValue Ops[] = {Op.getOperand(2), // Row
27985 Op.getOperand(3), // Col0
27986 Op.getOperand(4), // Col1
27987 Op.getOperand(5), // Base
27988 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27989 Op.getOperand(6), // Index
27990 DAG.getTargetConstant(0, DL, MVT::i32), // Disp
27991 DAG.getRegister(0, MVT::i16), // Segment
27992 Op.getOperand(0)}; // Chain
27993
27994 MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
27995 SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
27996 SDValue(Res, 0));
27997 SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
27998 SDValue(Res, 0));
27999 return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
28000 }
28001 case Intrinsic::x86_atomic_bts_rm:
28002 case Intrinsic::x86_atomic_btc_rm:
28003 case Intrinsic::x86_atomic_btr_rm: {
28004 SDLoc DL(Op);
28005 MVT VT = Op.getSimpleValueType();
28006 SDValue Chain = Op.getOperand(0);
28007 SDValue Op1 = Op.getOperand(2);
28008 SDValue Op2 = Op.getOperand(3);
28009 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
28010 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
28012 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28013 SDValue Res =
28014 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28015 {Chain, Op1, Op2}, VT, MMO);
28016 Chain = Res.getValue(1);
28017 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28018 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28019 }
28020 case Intrinsic::x86_atomic_bts:
28021 case Intrinsic::x86_atomic_btc:
28022 case Intrinsic::x86_atomic_btr: {
28023 SDLoc DL(Op);
28024 MVT VT = Op.getSimpleValueType();
28025 SDValue Chain = Op.getOperand(0);
28026 SDValue Op1 = Op.getOperand(2);
28027 SDValue Op2 = Op.getOperand(3);
28028 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
28029 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
28030 : X86ISD::LBTR;
28031 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
28032 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28033 SDValue Res =
28034 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28035 {Chain, Op1, Op2, Size}, VT, MMO);
28036 Chain = Res.getValue(1);
28037 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28038 unsigned Imm = Op2->getAsZExtVal();
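// The setcc result is 0 or 1 in bit 0; shift it back so the returned
// value has the tested bit in its original position Imm.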
28039 if (Imm)
28040 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
28041 DAG.getShiftAmountConstant(Imm, VT, DL));
28042 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28043 }
28044 case Intrinsic::x86_cmpccxadd32:
28045 case Intrinsic::x86_cmpccxadd64: {
28046 SDLoc DL(Op);
28047 SDValue Chain = Op.getOperand(0);
28048 SDValue Addr = Op.getOperand(2);
28049 SDValue Src1 = Op.getOperand(3);
28050 SDValue Src2 = Op.getOperand(4);
28051 SDValue CC = Op.getOperand(5);
28052 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28054 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
28055 MVT::i32, MMO);
28056 return Operation;
28057 }
28058 case Intrinsic::x86_aadd32:
28059 case Intrinsic::x86_aadd64:
28060 case Intrinsic::x86_aand32:
28061 case Intrinsic::x86_aand64:
28062 case Intrinsic::x86_aor32:
28063 case Intrinsic::x86_aor64:
28064 case Intrinsic::x86_axor32:
28065 case Intrinsic::x86_axor64: {
28066 SDLoc DL(Op);
28067 SDValue Chain = Op.getOperand(0);
28068 SDValue Op1 = Op.getOperand(2);
28069 SDValue Op2 = Op.getOperand(3);
28070 MVT VT = Op2.getSimpleValueType();
28071 unsigned Opc = 0;
28072 switch (IntNo) {
28073 default:
28074 llvm_unreachable("Unknown Intrinsic");
28075 case Intrinsic::x86_aadd32:
28076 case Intrinsic::x86_aadd64:
28077 Opc = X86ISD::AADD;
28078 break;
28079 case Intrinsic::x86_aand32:
28080 case Intrinsic::x86_aand64:
28081 Opc = X86ISD::AAND;
28082 break;
28083 case Intrinsic::x86_aor32:
28084 case Intrinsic::x86_aor64:
28085 Opc = X86ISD::AOR;
28086 break;
28087 case Intrinsic::x86_axor32:
28088 case Intrinsic::x86_axor64:
28089 Opc = X86ISD::AXOR;
28090 break;
28091 }
28092 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
28093 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
28094 {Chain, Op1, Op2}, VT, MMO);
28095 }
28096 case Intrinsic::x86_atomic_add_cc:
28097 case Intrinsic::x86_atomic_sub_cc:
28098 case Intrinsic::x86_atomic_or_cc:
28099 case Intrinsic::x86_atomic_and_cc:
28100 case Intrinsic::x86_atomic_xor_cc: {
28101 SDLoc DL(Op);
28102 SDValue Chain = Op.getOperand(0);
28103 SDValue Op1 = Op.getOperand(2);
28104 SDValue Op2 = Op.getOperand(3);
28105 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
28106 MVT VT = Op2.getSimpleValueType();
28107 unsigned Opc = 0;
28108 switch (IntNo) {
28109 default:
28110 llvm_unreachable("Unknown Intrinsic");
28111 case Intrinsic::x86_atomic_add_cc:
28112 Opc = X86ISD::LADD;
28113 break;
28114 case Intrinsic::x86_atomic_sub_cc:
28115 Opc = X86ISD::LSUB;
28116 break;
28117 case Intrinsic::x86_atomic_or_cc:
28118 Opc = X86ISD::LOR;
28119 break;
28120 case Intrinsic::x86_atomic_and_cc:
28121 Opc = X86ISD::LAND;
28122 break;
28123 case Intrinsic::x86_atomic_xor_cc:
28124 Opc = X86ISD::LXOR;
28125 break;
28126 }
28127 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28128 SDValue LockArith =
28129 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28130 {Chain, Op1, Op2}, VT, MMO);
28131 Chain = LockArith.getValue(1);
28132 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
28133 }
28134 }
28135 return SDValue();
28136 }
28137
28138 SDLoc dl(Op);
28139 switch(IntrData->Type) {
28140 default: llvm_unreachable("Unknown Intrinsic Type");
28141 case RDSEED:
28142 case RDRAND: {
28143 // Emit the node with the right value type.
28144 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
28145 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28146
28147 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
28148 // Otherwise return the random value, which is always 0 in that case, cast to i32.
28149 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
28150 DAG.getConstant(1, dl, Op->getValueType(1)),
28151 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
28152 SDValue(Result.getNode(), 1)};
28153 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
28154
28155 // Return { result, isValid, chain }.
28156 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
28157 SDValue(Result.getNode(), 2));
28158 }
28159 case GATHER_AVX2: {
28160 SDValue Chain = Op.getOperand(0);
28161 SDValue Src = Op.getOperand(2);
28162 SDValue Base = Op.getOperand(3);
28163 SDValue Index = Op.getOperand(4);
28164 SDValue Mask = Op.getOperand(5);
28165 SDValue Scale = Op.getOperand(6);
28166 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28167 Scale, Chain, Subtarget);
28168 }
28169 case GATHER: {
28170 //gather(v1, mask, index, base, scale);
28171 SDValue Chain = Op.getOperand(0);
28172 SDValue Src = Op.getOperand(2);
28173 SDValue Base = Op.getOperand(3);
28174 SDValue Index = Op.getOperand(4);
28175 SDValue Mask = Op.getOperand(5);
28176 SDValue Scale = Op.getOperand(6);
28177 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
28178 Chain, Subtarget);
28179 }
28180 case SCATTER: {
28181 //scatter(base, mask, index, v1, scale);
28182 SDValue Chain = Op.getOperand(0);
28183 SDValue Base = Op.getOperand(2);
28184 SDValue Mask = Op.getOperand(3);
28185 SDValue Index = Op.getOperand(4);
28186 SDValue Src = Op.getOperand(5);
28187 SDValue Scale = Op.getOperand(6);
28188 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28189 Scale, Chain, Subtarget);
28190 }
28191 case PREFETCH: {
28192 const APInt &HintVal = Op.getConstantOperandAPInt(6);
28193 assert((HintVal == 2 || HintVal == 3) &&
28194 "Wrong prefetch hint in intrinsic: should be 2 or 3");
28195 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
28196 SDValue Chain = Op.getOperand(0);
28197 SDValue Mask = Op.getOperand(2);
28198 SDValue Index = Op.getOperand(3);
28199 SDValue Base = Op.getOperand(4);
28200 SDValue Scale = Op.getOperand(5);
28201 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
28202 Subtarget);
28203 }
28204 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
28205 case RDTSC: {
28207 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
28208 Results);
28209 return DAG.getMergeValues(Results, dl);
28210 }
28211 // Read Performance Monitoring Counters.
28212 case RDPMC:
28213 // Read Processor Register.
28214 case RDPRU:
28215 // GetExtended Control Register.
28216 case XGETBV: {
28218
28219 // RDPMC uses ECX to select the index of the performance counter to read.
28220 // RDPRU uses ECX to select the processor register to read.
28221 // XGETBV uses ECX to select the index of the XCR register to return.
28222 // The result is stored into registers EDX:EAX.
28223 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
28224 Subtarget, Results);
28225 return DAG.getMergeValues(Results, dl);
28226 }
28227 // XTEST intrinsics.
28228 case XTEST: {
28229 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
28230 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28231
28232 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
28233 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
28234 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
28235 Ret, SDValue(InTrans.getNode(), 1));
28236 }
28239 case TRUNCATE_TO_MEM_VI32: {
28240 SDValue Mask = Op.getOperand(4);
28241 SDValue DataToTruncate = Op.getOperand(3);
28242 SDValue Addr = Op.getOperand(2);
28243 SDValue Chain = Op.getOperand(0);
28244
28246 assert(MemIntr && "Expected MemIntrinsicSDNode!");
28247
28248 EVT MemVT = MemIntr->getMemoryVT();
28249
28250 uint16_t TruncationOp = IntrData->Opc0;
28251 switch (TruncationOp) {
28252 case X86ISD::VTRUNC: {
28253 if (isAllOnesConstant(Mask)) // return just a truncate store
28254 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
28255 MemIntr->getMemOperand());
28256
28257 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28258 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28259 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
28260
28261 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
28262 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
28263 true /* truncating */);
28264 }
28265 case X86ISD::VTRUNCUS:
28266 case X86ISD::VTRUNCS: {
28267 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
28268 if (isAllOnesConstant(Mask))
28269 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
28270 MemIntr->getMemOperand(), DAG);
28271
28272 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28273 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28274
28275 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
28276 VMask, MemVT, MemIntr->getMemOperand(), DAG);
28277 }
28278 default:
28279 llvm_unreachable("Unsupported truncstore intrinsic");
28280 }
28281 }
28282 case INTR_TYPE_CAST_MMX:
28283 return SDValue(); // handled in combineINTRINSIC_*
28284 }
28285}
28286
28287SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
28288 SelectionDAG &DAG) const {
28289 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28290 MFI.setReturnAddressIsTaken(true);
28291
28292 unsigned Depth = Op.getConstantOperandVal(0);
28293 SDLoc dl(Op);
28294 EVT PtrVT = Op.getValueType();
28295
28296 if (Depth > 0) {
28297 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
28298 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28299 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
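// The return address lives one slot above the saved frame pointer that
// FrameAddr points at.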
28300 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28301 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
28302 MachinePointerInfo());
28303 }
28304
28305 // Just load the return address.
28306 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
28307 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
28308 MachinePointerInfo());
28309}
28310
28311SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
28312 SelectionDAG &DAG) const {
28314 return getReturnAddressFrameIndex(DAG);
28315}
28316
28317SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
28318 MachineFunction &MF = DAG.getMachineFunction();
28319 MachineFrameInfo &MFI = MF.getFrameInfo();
28320 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
28321 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28322 EVT VT = Op.getValueType();
28323
28324 MFI.setFrameAddressIsTaken(true);
28325
28326 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
28327 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
28328 // is not possible to crawl up the stack without looking at the unwind codes
28329 // simultaneously.
28330 int FrameAddrIndex = FuncInfo->getFAIndex();
28331 if (!FrameAddrIndex) {
28332 // Set up a frame object for the return address.
28333 unsigned SlotSize = RegInfo->getSlotSize();
28334 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
28335 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
28336 FuncInfo->setFAIndex(FrameAddrIndex);
28337 }
28338 return DAG.getFrameIndex(FrameAddrIndex, VT);
28339 }
28340
28341 Register FrameReg =
28342 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
28343 SDLoc dl(Op); // FIXME probably not meaningful
28344 unsigned Depth = Op.getConstantOperandVal(0);
28345 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
28346 (FrameReg == X86::EBP && VT == MVT::i32)) &&
28347 "Invalid Frame Register!");
28348 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
28349 while (Depth--)
28350 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
28351 MachinePointerInfo());
28352 return FrameAddr;
28353}
28354
28355// FIXME? Maybe this could be a TableGen attribute on some registers and
28356// this table could be generated automatically from RegInfo.
28358 const MachineFunction &MF) const {
28359 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
28360
28362 .Case("esp", X86::ESP)
28363 .Case("rsp", X86::RSP)
28364 .Case("ebp", X86::EBP)
28365 .Case("rbp", X86::RBP)
28366 .Case("r14", X86::R14)
28367 .Case("r15", X86::R15)
28368 .Default(0);
28369
28370 if (Reg == X86::EBP || Reg == X86::RBP) {
28371 if (!TFI.hasFP(MF))
28372 report_fatal_error("register " + StringRef(RegName) +
28373 " is allocatable: function has no frame pointer");
28374#ifndef NDEBUG
28375 else {
28376 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28377 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
28378 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
28379 "Invalid Frame Register!");
28380 }
28381#endif
28382 }
28383
28384 return Reg;
28385}
28386
28387SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
28388 SelectionDAG &DAG) const {
28389 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28390 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28391}
28392
28394 const Constant *PersonalityFn) const {
28395 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
28396 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28397
28398 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
28399}
28400
28402 const Constant *PersonalityFn) const {
28403 // Funclet personalities don't use selectors (the runtime does the selection).
28405 return X86::NoRegister;
28406 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28407}
28408
28410 return Subtarget.isTargetWin64();
28411}
28412
28413SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
28414 SDValue Chain = Op.getOperand(0);
28415 SDValue Offset = Op.getOperand(1);
28416 SDValue Handler = Op.getOperand(2);
28417 SDLoc dl (Op);
28418
28419 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28420 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28421 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28422 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
28423 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
28424 "Invalid Frame Register!");
28425 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
28426 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
28427
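// Compute the address of the return-address slot (frame pointer plus one
// slot, plus the requested Offset), store the handler there, and pass that
// address to EH_RETURN in RCX/ECX.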
28428 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
28429 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28430 dl));
28431 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
28432 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
28433 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
28434
28435 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
28436 DAG.getRegister(StoreAddrReg, PtrVT));
28437}
28438
28439SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28440 SelectionDAG &DAG) const {
28441 SDLoc DL(Op);
28442 // If the subtarget is not 64-bit, we may need the global base register
28443 // after the pseudo is expanded in isel, i.e., after the CGBR pass has run.
28444 // Therefore, ask for the GlobalBaseReg now, so that the pass
28445 // inserts the code for us in case we need it.
28446 // Otherwise, we would end up referencing a virtual register
28447 // that is never defined!
28448 if (!Subtarget.is64Bit()) {
28449 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28450 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28451 }
28452 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28453 DAG.getVTList(MVT::i32, MVT::Other),
28454 Op.getOperand(0), Op.getOperand(1));
28455}
28456
28457SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28458 SelectionDAG &DAG) const {
28459 SDLoc DL(Op);
28460 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28461 Op.getOperand(0), Op.getOperand(1));
28462}
28463
28464SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28465 SelectionDAG &DAG) const {
28466 SDLoc DL(Op);
28467 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28468 Op.getOperand(0));
28469}
28470
28472 return Op.getOperand(0);
28473}
28474
28475SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
28476 SelectionDAG &DAG) const {
28477 SDValue Root = Op.getOperand(0);
28478 SDValue Trmp = Op.getOperand(1); // trampoline
28479 SDValue FPtr = Op.getOperand(2); // nested function
28480 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
28481 SDLoc dl (Op);
28482
28483 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
28484 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
28485
28486 if (Subtarget.is64Bit()) {
28487 SDValue OutChains[6];
28488
28489 // Large code-model.
28490 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
28491 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
28492
28493 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
28494 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
28495
28496 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
28497
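// The trampoline written out below corresponds to the code sequence:
//   movabsq $<FPtr>, %r11
//   movabsq $<Nest>, %r10
//   jmpq    *%r11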
28498 // Load the pointer to the nested function into R11.
28499 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
28500 SDValue Addr = Trmp;
28501 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28502 Addr, MachinePointerInfo(TrmpAddr));
28503
28504 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28505 DAG.getConstant(2, dl, MVT::i64));
28506 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
28507 MachinePointerInfo(TrmpAddr, 2), Align(2));
28508
28509 // Load the 'nest' parameter value into R10.
28510 // R10 is specified in X86CallingConv.td
28511 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
28512 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28513 DAG.getConstant(10, dl, MVT::i64));
28514 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28515 Addr, MachinePointerInfo(TrmpAddr, 10));
28516
28517 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28518 DAG.getConstant(12, dl, MVT::i64));
28519 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
28520 MachinePointerInfo(TrmpAddr, 12), Align(2));
28521
28522 // Jump to the nested function.
28523 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
28524 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28525 DAG.getConstant(20, dl, MVT::i64));
28526 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28527 Addr, MachinePointerInfo(TrmpAddr, 20));
28528
28529 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
28530 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28531 DAG.getConstant(22, dl, MVT::i64));
28532 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
28533 Addr, MachinePointerInfo(TrmpAddr, 22));
28534
28535 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28536 } else {
28537 const Function *Func =
28538 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28539 CallingConv::ID CC = Func->getCallingConv();
28540 unsigned NestReg;
28541
28542 switch (CC) {
28543 default:
28544 llvm_unreachable("Unsupported calling convention");
28545 case CallingConv::C:
28547 // Pass 'nest' parameter in ECX.
28548 // Must be kept in sync with X86CallingConv.td
28549 NestReg = X86::ECX;
28550
28551 // Check that ECX wasn't needed by an 'inreg' parameter.
28552 FunctionType *FTy = Func->getFunctionType();
28553 const AttributeList &Attrs = Func->getAttributes();
28554
28555 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28556 unsigned InRegCount = 0;
28557 unsigned Idx = 0;
28558
28559 for (FunctionType::param_iterator I = FTy->param_begin(),
28560 E = FTy->param_end(); I != E; ++I, ++Idx)
28561 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28562 const DataLayout &DL = DAG.getDataLayout();
28563 // FIXME: should only count parameters that are lowered to integers.
28564 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
28565 }
28566
28567 if (InRegCount > 2) {
28568 report_fatal_error("Nest register in use - reduce number of inreg"
28569 " parameters!");
28570 }
28571 }
28572 break;
28573 }
28576 case CallingConv::Fast:
28577 case CallingConv::Tail:
28579 // Pass 'nest' parameter in EAX.
28580 // Must be kept in sync with X86CallingConv.td
28581 NestReg = X86::EAX;
28582 break;
28583 }
28584
28585 SDValue OutChains[4];
28586 SDValue Addr, Disp;
28587
28588 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28589 DAG.getConstant(10, dl, MVT::i32));
28590 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
28591
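// The 10-byte trampoline written out below corresponds to:
//   movl $<Nest>, %<NestReg>   (opcode 0xB8+reg, imm32)
//   jmp  <FPtr>                (opcode 0xE9, rel32 == Disp)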
28592 // This is storing the opcode for MOV32ri.
28593 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
28594 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28595 OutChains[0] =
28596 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
28597 Trmp, MachinePointerInfo(TrmpAddr));
28598
28599 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28600 DAG.getConstant(1, dl, MVT::i32));
28601 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
28602 MachinePointerInfo(TrmpAddr, 1), Align(1));
28603
28604 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
28605 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28606 DAG.getConstant(5, dl, MVT::i32));
28607 OutChains[2] =
28608 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
28609 MachinePointerInfo(TrmpAddr, 5), Align(1));
28610
28611 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28612 DAG.getConstant(6, dl, MVT::i32));
28613 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
28614 MachinePointerInfo(TrmpAddr, 6), Align(1));
28615
28616 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28617 }
28618}
28619
28620SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
28621 SelectionDAG &DAG) const {
28622 /*
28623 The rounding mode is in bits 11:10 of the FP control word (FPCW), and has the following
28624 settings:
28625 00 Round to nearest
28626 01 Round to -inf
28627 10 Round to +inf
28628 11 Round to 0
28629
28630 GET_ROUNDING, on the other hand, expects the following:
28631 -1 Undefined
28632 0 Round to 0
28633 1 Round to nearest
28634 2 Round to +inf
28635 3 Round to -inf
28636
28637 To perform the conversion, we use a packed lookup table of the four 2-bit
28638 values that we can index by FPCW[11:10]
28639 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPCW[11:10]
28640
28641 (0x2d >> ((FPCW & 0xc00) >> 9)) & 3
28642 */
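// Worked example: if FPCW[11:10] == 10b (round to +inf), then
// (FPCW & 0xc00) >> 9 == 4 and (0x2d >> 4) & 3 == 2, which is the
// GET_ROUNDING encoding for "round to +inf".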
28643
28644 MachineFunction &MF = DAG.getMachineFunction();
28645 MVT VT = Op.getSimpleValueType();
28646 SDLoc DL(Op);
28647
28648 // Save FP Control Word to stack slot
28649 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
28650 SDValue StackSlot =
28651 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
28652
28653 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
28654
28655 SDValue Chain = Op.getOperand(0);
28656 SDValue Ops[] = {Chain, StackSlot};
28658 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
28660
28661 // Load FP Control Word from stack slot
28662 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
28663 Chain = CWD.getValue(1);
28664
28665 // Mask and turn the control bits into a shift for the lookup table.
28666 SDValue Shift =
28667 DAG.getNode(ISD::SRL, DL, MVT::i16,
28668 DAG.getNode(ISD::AND, DL, MVT::i16,
28669 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
28670 DAG.getConstant(9, DL, MVT::i8));
28671 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
28672
28673 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
28674 SDValue RetVal =
28675 DAG.getNode(ISD::AND, DL, MVT::i32,
28676 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
28677 DAG.getConstant(3, DL, MVT::i32));
28678
28679 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
28680
28681 return DAG.getMergeValues({RetVal, Chain}, DL);
28682}
28683
28684SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
28685 SelectionDAG &DAG) const {
28686 MachineFunction &MF = DAG.getMachineFunction();
28687 SDLoc DL(Op);
28688 SDValue Chain = Op.getNode()->getOperand(0);
28689
28690 // FP control word may be set only from data in memory. So we need to allocate
28691 // stack space to save/load FP control word.
28692 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
28693 SDValue StackSlot =
28694 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
28695 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
28696 MachineMemOperand *MMO =
28698
28699 // Store FP control word into memory.
28700 SDValue Ops[] = {Chain, StackSlot};
28701 Chain = DAG.getMemIntrinsicNode(
28702 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
28703
28704 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
28705 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
28706 Chain = CWD.getValue(1);
28707 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
28708 DAG.getConstant(0xf3ff, DL, MVT::i16));
28709
28710 // Calculate new rounding mode.
28711 SDValue NewRM = Op.getNode()->getOperand(1);
28712 SDValue RMBits;
28713 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
28714 uint64_t RM = CVal->getZExtValue();
28715 int FieldVal = X86::getRoundingModeX86(RM);
28716
28717 if (FieldVal == X86::rmInvalid) {
28718 FieldVal = X86::rmToNearest;
28719 LLVMContext &C = MF.getFunction().getContext();
28720 C.diagnose(DiagnosticInfoUnsupported(
28721 MF.getFunction(), "rounding mode is not supported by X86 hardware",
28722 DiagnosticLocation(DL.getDebugLoc()), DS_Error));
28723 }
28724 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28725 } else {
28726 // Need to convert argument into bits of control word:
28727 // 0 Round to 0 -> 11
28728 // 1 Round to nearest -> 00
28729 // 2 Round to +inf -> 10
28730 // 3 Round to -inf -> 01
28731 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
28732 // To perform the conversion, put all these values into a single value 0xc9 and
28733 // it left depending on the rounding mode:
28734 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28735 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
28736 // ...
28737 // (0xc9 << (2 * NewRM + 4)) & 0xc00
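    // Spelled out, with 0xc9 = 0b11'00'10'01:
    //   NewRM = 0 (to zero):  (0xc9 << 4)  & 0xc00 = 0xc00 -> RC = 11
    //   NewRM = 1 (nearest):  (0xc9 << 6)  & 0xc00 = 0x000 -> RC = 00
    //   NewRM = 2 (+inf):     (0xc9 << 8)  & 0xc00 = 0x800 -> RC = 10
    //   NewRM = 3 (-inf):     (0xc9 << 10) & 0xc00 = 0x400 -> RC = 01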
28738 SDValue ShiftValue =
28739 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28740 DAG.getNode(ISD::ADD, DL, MVT::i32,
28741 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28742 DAG.getConstant(1, DL, MVT::i8)),
28743 DAG.getConstant(4, DL, MVT::i32)));
28744 SDValue Shifted =
28745 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28746 ShiftValue);
28747 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28748 DAG.getConstant(0xc00, DL, MVT::i16));
28749 }
28750
28751 // Update rounding mode bits and store the new FP Control Word into stack.
28752 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28753 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
28754
28755 // Load FP control word from the slot.
28756 SDValue OpsLD[] = {Chain, StackSlot};
28757 MachineMemOperand *MMOL =
28759 Chain = DAG.getMemIntrinsicNode(
28760 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28761
28762 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28763 // same way but in bits 14:13.
28764 if (Subtarget.hasSSE1()) {
28765 // Store MXCSR into memory.
28766 Chain = DAG.getNode(
28767 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28768 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28769 StackSlot);
28770
28771 // Load MXCSR from stack slot and clear RM field (bits 14:13).
28772 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28773 Chain = CWD.getValue(1);
28774 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28775 DAG.getConstant(0xffff9fff, DL, MVT::i32));
28776
28777 // Shift X87 RM bits from 11:10 to 14:13.
28778 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28779 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28780 DAG.getConstant(3, DL, MVT::i8));
28781
28782 // Update rounding mode bits and store the new FP Control Word into stack.
28783 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28784 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
28785
28786 // Load MXCSR from the slot.
28787 Chain = DAG.getNode(
28788 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28789 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28790 StackSlot);
28791 }
28792
28793 return Chain;
28794}
28795
28796const unsigned X87StateSize = 28;
28797const unsigned FPStateSize = 32;
28798[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
28799
28800SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
28801 SelectionDAG &DAG) const {
28803 SDLoc DL(Op);
28804 SDValue Chain = Op->getOperand(0);
28805 SDValue Ptr = Op->getOperand(1);
28807 EVT MemVT = Node->getMemoryVT();
28809 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28810
28811 // Get the x87 state, if it is present.
28812 if (Subtarget.hasX87()) {
28813 Chain =
28814 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
28815 {Chain, Ptr}, MemVT, MMO);
28816
28817 // FNSTENV changes the exception mask, so load back the stored environment.
28818 MachineMemOperand::Flags NewFlags =
28821 MMO = MF.getMachineMemOperand(MMO, NewFlags);
28822 Chain =
28823 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28824 {Chain, Ptr}, MemVT, MMO);
28825 }
28826
28827 // If target supports SSE, get MXCSR as well.
28828 if (Subtarget.hasSSE1()) {
28829 // Get pointer to the MXCSR location in memory.
28831 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28832 DAG.getConstant(X87StateSize, DL, PtrVT));
28833 // Store MXCSR into memory.
28834 Chain = DAG.getNode(
28835 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28836 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28837 MXCSRAddr);
28838 }
28839
28840 return Chain;
28841}
28842
28844 EVT MemVT, MachineMemOperand *MMO,
28845 SelectionDAG &DAG,
28846 const X86Subtarget &Subtarget) {
28847 // Set the x87 state, if it is present.
28848 if (Subtarget.hasX87())
28849 Chain =
28850 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28851 {Chain, Ptr}, MemVT, MMO);
28852 // If target supports SSE, set MXCSR as well.
28853 if (Subtarget.hasSSE1()) {
28854 // Get pointer to the MXCSR location in memory.
28856 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28857 DAG.getConstant(X87StateSize, DL, PtrVT));
28858 // Load MXCSR from memory.
28859 Chain = DAG.getNode(
28860 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28861 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28862 MXCSRAddr);
28863 }
28864 return Chain;
28865}
28866
28867SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
28868 SelectionDAG &DAG) const {
28869 SDLoc DL(Op);
28870 SDValue Chain = Op->getOperand(0);
28871 SDValue Ptr = Op->getOperand(1);
28873 EVT MemVT = Node->getMemoryVT();
28875 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28876 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
28877}
28878
28879SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
28880 SelectionDAG &DAG) const {
28881 MachineFunction &MF = DAG.getMachineFunction();
28882 SDLoc DL(Op);
28883 SDValue Chain = Op.getNode()->getOperand(0);
28884
28885 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
28886 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
28888
28889 // x87 FPU Control Word: masks all floating-point exceptions, sets rounding to
28890 // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
28891 // for compatibility with glibc.
28892 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
28893 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
28894 Constant *Zero = ConstantInt::get(ItemTy, 0);
28895 for (unsigned I = 0; I < 6; ++I)
28896 FPEnvVals.push_back(Zero);
28897
28898 // MXCSR: masks all floating-point exceptions, sets rounding to nearest, clears
28899 // all exception flags, and sets DAZ and FTZ to 0.
28900 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
28901 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
28902 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28903 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
28904 MachinePointerInfo MPI =
28906 MachineMemOperand *MMO = MF.getMachineMemOperand(
28908
28909 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
28910}
28911
28912// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28913uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
28914 assert((Amt < 8) && "Shift/Rotation amount out of range");
28915 switch (Opcode) {
28916 case ISD::BITREVERSE:
28917 return 0x8040201008040201ULL;
28918 case ISD::SHL:
28919 return ((0x0102040810204080ULL >> (Amt)) &
28920 (0x0101010101010101ULL * (0xFF >> (Amt))));
28921 case ISD::SRL:
28922 return ((0x0102040810204080ULL << (Amt)) &
28923 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
28924 case ISD::SRA:
28925 return (getGFNICtrlImm(ISD::SRL, Amt) |
28926 (0x8080808080808080ULL >> (64 - (8 * Amt))));
28927 case ISD::ROTL:
28928 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
28929 case ISD::ROTR:
28930 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
28931 }
28932 llvm_unreachable("Unsupported GFNI opcode");
28933}
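// For example, getGFNICtrlImm(ISD::SHL, 1) evaluates to
//   (0x0102040810204080ULL >> 1) & (0x0101010101010101ULL * 0x7F)
//   = 0x0081020408102040 & 0x7F7F7F7F7F7F7F7F = 0x0001020408102040,
// i.e. the identity matrix 0x0102040810204080 with each byte shifted down one
// bit position, which gf2p8affine then applies as a per-byte shift left by 1.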
28934
28935// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28936SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL,
28937 MVT VT, unsigned Amt = 0) {
28938 assert(VT.getVectorElementType() == MVT::i8 &&
28939 (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
28940 uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
28941 SmallVector<SDValue> MaskBits;
28942 for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
28943 uint64_t Bits = (Imm >> (I % 64)) & 255;
28944 MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
28945 }
28946 return DAG.getBuildVector(VT, DL, MaskBits);
28947}
28948
28949/// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
28950//
28951// i8/i16 vectors are implemented using the dword LZCNT vector instruction
28952// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
28953// split the vector, perform the operation on its Lo and Hi parts and
28954// concatenate the results.
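// For example, for an i8 element x = 0x10: lzcnt(zext32(x)) = 27, and
// subtracting the delta 32 - 8 = 24 gives ctlz(x) = 3.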
28956 const X86Subtarget &Subtarget) {
28957 assert(Op.getOpcode() == ISD::CTLZ);
28958 SDLoc dl(Op);
28959 MVT VT = Op.getSimpleValueType();
28960 MVT EltVT = VT.getVectorElementType();
28961 unsigned NumElems = VT.getVectorNumElements();
28962
28963 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28964 "Unsupported element type");
28965
28966 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
28967 if (NumElems > 16 ||
28968 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28969 return splitVectorIntUnary(Op, DAG, dl);
28970
28971 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28972 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28973 "Unsupported value type for operation");
28974
28975 // Use the natively supported vector instruction vplzcntd.
28976 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28977 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28978 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
28979 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28980
28981 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28982}
28983
28984// Lower CTLZ using a PSHUFB lookup table implementation.
28986 const X86Subtarget &Subtarget,
28987 SelectionDAG &DAG) {
28988 MVT VT = Op.getSimpleValueType();
28989 int NumElts = VT.getVectorNumElements();
28990 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28991 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
28992
28993 // Per-nibble leading zero PSHUFB lookup table.
28994 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
28995 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
28996 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
28997 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
28998
29000 for (int i = 0; i < NumBytes; ++i)
29001 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29002 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
29003
29004 // Begin by bitcasting the input to byte vector, then split those bytes
29005 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
29006 // If the hi input nibble is zero then we add both results together, otherwise
29007 // we just take the hi result (by masking the lo result to zero before the
29008 // add).
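  // For example, for the byte 0x1C the hi nibble is 0x1 (LUT -> 3) and is
  // nonzero, so the result is just 3 = ctlz(0x1C). For 0x03 the hi nibble is
  // zero (LUT -> 4), so the lo nibble's LUT value 2 is added, giving 6.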
29009 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
29010 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
29011
29012 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29013 SDValue Lo = Op0;
29014 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29015 SDValue HiZ;
29016 if (CurrVT.is512BitVector()) {
29017 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29018 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29019 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29020 } else {
29021 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29022 }
29023
29024 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29025 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29026 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29027 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29028
29029 // Merge the result from vXi8 back to VT, working on the lo/hi halves
29030 // of the current vector width in the same way we did for the nibbles.
29031 // If the upper half of the input element is zero then add the halves'
29032 // leading zero counts together, otherwise just use the upper half's.
29033 // Double the width of the result until we are at target width.
29034 while (CurrVT != VT) {
29035 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29036 int CurrNumElts = CurrVT.getVectorNumElements();
29037 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29038 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29039 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29040
29041 // Check if the upper half of the input element is zero.
29042 if (CurrVT.is512BitVector()) {
29043 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29044 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29045 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29046 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29047 } else {
29048 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29049 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29050 }
29051 HiZ = DAG.getBitcast(NextVT, HiZ);
29052
29053 // Move the upper/lower halves to the lower bits as we'll be extending to
29054 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29055 // together.
29056 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29057 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29058 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29059 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29060 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29061 CurrVT = NextVT;
29062 }
29063
29064 return Res;
29065}
29066
29068 const X86Subtarget &Subtarget,
29069 SelectionDAG &DAG) {
29070 MVT VT = Op.getSimpleValueType();
29071
29072 if (Subtarget.hasCDI() &&
29073 // vXi8 vectors need to be promoted to 512-bits for vXi32.
29074 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29075 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29076
29077 // Decompose 256-bit ops into smaller 128-bit ops.
29078 if (VT.is256BitVector() && !Subtarget.hasInt256())
29079 return splitVectorIntUnary(Op, DAG, DL);
29080
29081 // Decompose 512-bit ops into smaller 256-bit ops.
29082 if (VT.is512BitVector() && !Subtarget.hasBWI())
29083 return splitVectorIntUnary(Op, DAG, DL);
29084
29085 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29086 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29087}
29088
29090 SelectionDAG &DAG,
29091 const X86Subtarget &Subtarget) {
29092 MVT VT = Op.getSimpleValueType();
29093 SDValue Input = Op.getOperand(0);
29094
29095 assert(VT.isVector() && VT.getVectorElementType() == MVT::i8 &&
29096 "Expected vXi8 input for GFNI-based CTLZ lowering");
29097
29098 SDValue Reversed = DAG.getNode(ISD::BITREVERSE, DL, VT, Input);
29099
29100 SDValue Neg = DAG.getNegative(Reversed, DL, VT);
29101 SDValue Filtered = DAG.getNode(ISD::AND, DL, VT, Reversed, Neg);
29102
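  // After the bit reverse, CTLZ becomes CTTZ, and 'Reversed & -Reversed'
  // isolates the lowest set bit. The affine matrix below maps an isolated bit
  // 1 << k to k: the 0xAA, 0xCC and 0xF0 rows read out bits 0..2 of k, while
  // the 0xFF row together with the immediate 8 produces 8 for a zero byte,
  // the expected CTLZ result for a zero i8 element.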
29103 MVT VT64 = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
29104 SDValue CTTZConst = DAG.getConstant(0xAACCF0FF00000000ULL, DL, VT64);
29105 SDValue CTTZMatrix = DAG.getBitcast(VT, CTTZConst);
29106
29107 SDValue LZCNT =
29108 DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, Filtered, CTTZMatrix,
29109 DAG.getTargetConstant(8, DL, MVT::i8));
29110 return LZCNT;
29111}
29112
29113static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29114 SelectionDAG &DAG) {
29115 MVT VT = Op.getSimpleValueType();
29116 MVT OpVT = VT;
29117 unsigned NumBits = VT.getSizeInBits();
29118 SDLoc dl(Op);
29119 unsigned Opc = Op.getOpcode();
29120
29121 if (VT.isVector() && VT.getScalarType() == MVT::i8 && Subtarget.hasGFNI())
29122 return LowerVectorCTLZ_GFNI(Op, dl, DAG, Subtarget);
29123
29124 if (VT.isVector())
29125 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29126
29127 Op = Op.getOperand(0);
29128 if (VT == MVT::i8) {
29129 // Zero extend to i32 since there is no i8 bsr.
29130 OpVT = MVT::i32;
29131 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29132 }
29133
29134 // Check if we can safely pass a result through BSR for zero sources.
29135 SDValue PassThru = DAG.getUNDEF(OpVT);
29136 if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
29137 !DAG.isKnownNeverZero(Op))
29138 PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
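  // 2 * NumBits - 1 is chosen so that the final 'xor NumBits - 1' below maps
  // the zero-source result to NumBits (e.g. for i32: 63 ^ 31 = 32).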
29139
29140 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29141 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29142 Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);
29143
29144 // Skip CMOV if we're using a pass through value.
29145 if (Opc == ISD::CTLZ && PassThru.isUndef()) {
29146 // If src is zero (i.e. bsr sets ZF), returns NumBits.
29147 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29148 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29149 Op.getValue(1)};
29150 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29151 }
29152
29153 // Finally xor with NumBits-1.
29154 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
29155 DAG.getConstant(NumBits - 1, dl, OpVT));
29156
29157 if (VT == MVT::i8)
29158 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
29159 return Op;
29160}
29161
29162static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
29163 SelectionDAG &DAG) {
29164 MVT VT = Op.getSimpleValueType();
29165 unsigned NumBits = VT.getScalarSizeInBits();
29166 SDValue N0 = Op.getOperand(0);
29167 SDLoc dl(Op);
29168 bool NonZeroSrc = DAG.isKnownNeverZero(N0);
29169
29170 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
29171 "Only scalar CTTZ requires custom lowering");
29172
29173 // Check if we can safely pass a result through BSF for zero sources.
29174 SDValue PassThru = DAG.getUNDEF(VT);
29175 if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
29176 PassThru = DAG.getConstant(NumBits, dl, VT);
29177
29178 // Issue a bsf (scan bits forward) which also sets EFLAGS.
29179 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29180 Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);
29181
29182 // Skip CMOV if src is never zero or we're using a pass through value.
29183 if (NonZeroSrc || !PassThru.isUndef())
29184 return Op;
29185
29186 // If src is zero (i.e. bsf sets ZF), returns NumBits.
29187 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
29188 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29189 Op.getValue(1)};
29190 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
29191}
29192
29194 const X86Subtarget &Subtarget) {
29195 MVT VT = Op.getSimpleValueType();
29196 SDLoc DL(Op);
29197
29198 if (VT == MVT::i16 || VT == MVT::i32)
29199 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
29200
29201 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29202 return splitVectorIntBinary(Op, DAG, DL);
29203
29204 assert(Op.getSimpleValueType().is256BitVector() &&
29205 Op.getSimpleValueType().isInteger() &&
29206 "Only handle AVX 256-bit vector integer operation");
29207 return splitVectorIntBinary(Op, DAG, DL);
29208}
29209
29211 const X86Subtarget &Subtarget) {
29212 MVT VT = Op.getSimpleValueType();
29213 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
29214 unsigned Opcode = Op.getOpcode();
29215 SDLoc DL(Op);
29216
29217 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
29218 (VT.is256BitVector() && !Subtarget.hasInt256())) {
29219 assert(Op.getSimpleValueType().isInteger() &&
29220 "Only handle AVX vector integer operation");
29221 return splitVectorIntBinary(Op, DAG, DL);
29222 }
29223
29224 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
29225 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29226 EVT SetCCResultType =
29227 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29228
29229 unsigned BitWidth = VT.getScalarSizeInBits();
29230 if (Opcode == ISD::USUBSAT) {
29231 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
29232 // Handle a special-case with a bit-hack instead of cmp+select:
29233 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
29234 // If the target can use VPTERNLOG, DAGToDAG will match this as
29235 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
29236 // "broadcast" constant load.
29238 if (C && C->getAPIntValue().isSignMask()) {
29239 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
29240 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
29241 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
29242 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
29243 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
29244 }
29245 }
29246 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
29247 // usubsat X, Y --> (X >u Y) ? X - Y : 0
29248 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
29249 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
29250 // TODO: Move this to DAGCombiner?
29251 if (SetCCResultType == VT &&
29252 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
29253 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
29254 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
29255 }
29256 }
29257
29258 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
29259 (!VT.isVector() || VT == MVT::v2i64)) {
29262 SDValue Zero = DAG.getConstant(0, DL, VT);
29263 SDValue Result =
29264 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
29265 DAG.getVTList(VT, SetCCResultType), X, Y);
29266 SDValue SumDiff = Result.getValue(0);
29267 SDValue Overflow = Result.getValue(1);
29268 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
29269 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
29270 SDValue SumNeg =
29271 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
29272 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
29273 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
29274 }
29275
29276 // Use default expansion.
29277 return SDValue();
29278}
29279
29280static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
29281 SelectionDAG &DAG) {
29282 MVT VT = Op.getSimpleValueType();
29283 SDLoc DL(Op);
29284
29285 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
29286 // Since X86 does not have CMOV for 8-bit integer, we don't convert
29287 // 8-bit integer abs to NEG and CMOV.
29288 SDValue N0 = Op.getOperand(0);
29289 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
29290 DAG.getConstant(0, DL, VT), N0);
29291 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
29292 SDValue(Neg.getNode(), 1)};
29293 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
29294 }
29295
29296 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
29297 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
29298 SDValue Src = Op.getOperand(0);
29299 SDValue Neg = DAG.getNegative(Src, DL, VT);
29300 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
29301 }
29302
29303 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
29304 assert(VT.isInteger() &&
29305 "Only handle AVX 256-bit vector integer operation");
29306 return splitVectorIntUnary(Op, DAG, DL);
29307 }
29308
29309 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29310 return splitVectorIntUnary(Op, DAG, DL);
29311
29312 // Default to expand.
29313 return SDValue();
29314}
29315
29316static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
29317 SelectionDAG &DAG) {
29318 MVT VT = Op.getSimpleValueType();
29319 SDLoc DL(Op);
29320
29321 // For AVX1 cases, split to use legal ops.
29322 if (VT.is256BitVector() && !Subtarget.hasInt256())
29323 return splitVectorIntBinary(Op, DAG, DL);
29324
29325 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29326 return splitVectorIntBinary(Op, DAG, DL);
29327
29328 // Default to expand.
29329 return SDValue();
29330}
29331
29332static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
29333 SelectionDAG &DAG) {
29334 MVT VT = Op.getSimpleValueType();
29335 SDLoc DL(Op);
29336
29337 // For AVX1 cases, split to use legal ops.
29338 if (VT.is256BitVector() && !Subtarget.hasInt256())
29339 return splitVectorIntBinary(Op, DAG, DL);
29340
29341 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29342 return splitVectorIntBinary(Op, DAG, DL);
29343
29344 // Default to expand.
29345 return SDValue();
29346}
29347
29349 SelectionDAG &DAG) {
29350 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29351 EVT VT = Op.getValueType();
29352 SDValue X = Op.getOperand(0);
29353 SDValue Y = Op.getOperand(1);
29354 SDLoc DL(Op);
29355 bool IsMaxOp =
29356 Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29357 bool IsNum =
29358 Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29359 if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
29360 unsigned Opc = 0;
29361 if (VT.isVector())
29363 else if (VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64)
29365
29366 if (Opc) {
29367 SDValue Imm =
29368 DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
29369 return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
29370 }
29371 }
29372
29373 uint64_t SizeInBits = VT.getScalarSizeInBits();
29374 APInt PreferredZero = APInt::getZero(SizeInBits);
29375 APInt OppositeZero = PreferredZero;
29376 EVT IVT = VT.changeTypeToInteger();
29377 X86ISD::NodeType MinMaxOp;
29378 if (IsMaxOp) {
29379 MinMaxOp = X86ISD::FMAX;
29380 OppositeZero.setSignBit();
29381 } else {
29382 PreferredZero.setSignBit();
29383 MinMaxOp = X86ISD::FMIN;
29384 }
29385 EVT SetCCType =
29386 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29387
29388 // The tables below show the expected result of Max in cases of NaN and
29389 // signed zeros.
29390 //
29391 // Y Y
29392 // Num xNaN +0 -0
29393 // --------------- ---------------
29394 // Num | Max | Y | +0 | +0 | +0 |
29395 // X --------------- X ---------------
29396 // xNaN | X | X/Y | -0 | +0 | -0 |
29397 // --------------- ---------------
29398 //
29399 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
29400 // reordering.
29401 //
29402 // We check if any of operands is NaN and return NaN. Then we check if any of
29403 // operands is zero or negative zero (for fmaximum and fminimum respectively)
29404 // to ensure the correct zero is returned.
29405 auto MatchesZero = [](SDValue Op, APInt Zero) {
29407 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
29408 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
29409 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
29410 return CstOp->getAPIntValue() == Zero;
29411 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
29412 Op->getOpcode() == ISD::SPLAT_VECTOR) {
29413 for (const SDValue &OpVal : Op->op_values()) {
29414 if (OpVal.isUndef())
29415 continue;
29416 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
29417 if (!CstOp)
29418 return false;
29419 if (!CstOp->getValueAPF().isZero())
29420 continue;
29421 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
29422 return false;
29423 }
29424 return true;
29425 }
29426 return false;
29427 };
29428
29429 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
29430 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
29431 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
29432 Op->getFlags().hasNoSignedZeros() ||
29433 DAG.isKnownNeverZeroFloat(X) ||
29435 SDValue NewX, NewY;
29436 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
29437 MatchesZero(X, OppositeZero)) {
29438 // Operands are already in right order or order does not matter.
29439 NewX = X;
29440 NewY = Y;
29441 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
29442 NewX = Y;
29443 NewY = X;
29444 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
29445 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
29446 if (IsXNeverNaN)
29447 std::swap(X, Y);
29448 // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
29449 // to an xmm register.
29450 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
29452 // Bits of classes:
29453 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
29454 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
29455 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
29456 DL, MVT::i32);
29457 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
29458 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
29459 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
29460 DAG.getVectorIdxConstant(0, DL));
29461 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
29462 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
29463 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
29464 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29465 } else {
29466 SDValue IsXSigned;
29467 if (Subtarget.is64Bit() || VT != MVT::f64) {
29468 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
29469 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
29470 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
29471 } else {
29472 assert(VT == MVT::f64);
29473 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
29474 DAG.getConstantFP(0, DL, MVT::v2f64), X,
29475 DAG.getVectorIdxConstant(0, DL));
29476 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
29477 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
29478 DAG.getVectorIdxConstant(1, DL));
29479 Hi = DAG.getBitcast(MVT::i32, Hi);
29480 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
29481 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
29482 *DAG.getContext(), MVT::i32);
29483 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
29484 }
29485 if (MinMaxOp == X86ISD::FMAX) {
29486 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29487 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29488 } else {
29489 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29490 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29491 }
29492 }
29493
29494 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
29495 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
29496
29497 // If we did not reorder the operands for signed-zero handling, but we do
29498 // need to handle NaN and we know that one of the operands is not NaN, then:
29499 // - For minimum/maximum, put it in the first operand,
29500 // - For minimumnum/maximumnum, put it in the second operand,
29501 // and we will not need to post-process NaN after the max/min.
29502 if (IgnoreSignedZero && !IgnoreNaN &&
29503 DAG.isKnownNeverNaN(IsNum ? NewX : NewY))
29504 std::swap(NewX, NewY);
29505
29506 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29507
29508 if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
29509 return MinMax;
29510
29511 if (DAG.isKnownNeverNaN(NewX))
29512 NewX = NewY;
29513
29514 SDValue IsNaN =
29515 DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
29516
29517 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
29518}
29519
29520static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
29521 SelectionDAG &DAG) {
29522 MVT VT = Op.getSimpleValueType();
29523 SDLoc dl(Op);
29524
29525 // For AVX1 cases, split to use legal ops.
29526 if (VT.is256BitVector() && !Subtarget.hasInt256())
29527 return splitVectorIntBinary(Op, DAG, dl);
29528
29529 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
29530 return splitVectorIntBinary(Op, DAG, dl);
29531
29532 bool IsSigned = Op.getOpcode() == ISD::ABDS;
29533 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29534
29535 if (Subtarget.canUseCMOV() && VT.isScalarInteger()) {
29536 X86::CondCode CC = IsSigned ? X86::COND_L : X86::COND_B;
29537 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29538
29539 // abds(lhs, rhs) -> select(slt(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29540 // abdu(lhs, rhs) -> select(ult(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29541 if (VT.bitsGE(MVT::i32)) {
29542 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29543 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
29544 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
29545 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, VTs, LHS, RHS);
29546 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, VTs, RHS, LHS);
29547 return DAG.getNode(X86ISD::CMOV, dl, VT, Diff1, Diff0,
29548 DAG.getTargetConstant(CC, dl, MVT::i8),
29549 Diff1.getValue(1));
29550 }
29551
29552 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
29553 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
29554 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
29555 MVT WideVT = MVT::getIntegerVT(WideBits);
29556 if (TLI.isTypeLegal(WideVT)) {
29557 SDVTList WideVTs = DAG.getVTList(WideVT, MVT::i32);
29558 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
29559 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
29560 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, WideVTs, LHS, RHS);
29561 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, WideVTs, RHS, LHS);
29562 SDValue AbsDiff = DAG.getNode(X86ISD::CMOV, dl, WideVT, Diff1, Diff0,
29563 DAG.getTargetConstant(CC, dl, MVT::i8),
29564 Diff1.getValue(1));
29565 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
29566 }
29567 }
29568
29569 // Default to expand.
29570 return SDValue();
29571}
29572
29573static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
29574 SelectionDAG &DAG) {
29575 SDLoc dl(Op);
29576 MVT VT = Op.getSimpleValueType();
29577
29578 // Decompose 256-bit ops into 128-bit ops.
29579 if (VT.is256BitVector() && !Subtarget.hasInt256())
29580 return splitVectorIntBinary(Op, DAG, dl);
29581
29582 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29583 return splitVectorIntBinary(Op, DAG, dl);
29584
29585 SDValue A = Op.getOperand(0);
29586 SDValue B = Op.getOperand(1);
29587
29588 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29589 // vector pairs, multiply and truncate.
29590 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
29591 unsigned NumElts = VT.getVectorNumElements();
29592 unsigned NumLanes = VT.getSizeInBits() / 128;
29593 unsigned NumEltsPerLane = NumElts / NumLanes;
29594
29595 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29596 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29597 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29598 return DAG.getNode(
29599 ISD::TRUNCATE, dl, VT,
29600 DAG.getNode(ISD::MUL, dl, ExVT,
29601 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
29602 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
29603 }
29604
29605 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29606
29607 // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
29608 // Don't do this if we only need to unpack one half.
29609 if (Subtarget.hasSSSE3()) {
29610 bool BIsBuildVector = isa<BuildVectorSDNode>(B);
29611 bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
29612 bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
29613 if (BIsBuildVector) {
29614 for (auto [Idx, Val] : enumerate(B->ops())) {
29615 if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
29616 IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29617 else
29618 IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29619 }
29620 }
29621 if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
29622 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
29623 SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
29624 SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
29625 SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
29626 SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
29627 RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
29628 RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
29629 DAG.getTargetConstant(8, dl, MVT::i8));
29630 return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
29631 }
29632 }
29633
29634 // Extract the lo/hi parts to any extend to i16.
29635 // We're going to mask off the low byte of each result element of the
29636 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29637 // element.
29638 SDValue Undef = DAG.getUNDEF(VT);
29639 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
29640 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
29641
29642 SDValue BLo, BHi;
29643 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29644 // If the RHS is a constant, manually unpackl/unpackh.
29645 SmallVector<SDValue, 16> LoOps, HiOps;
29646 for (unsigned i = 0; i != NumElts; i += 16) {
29647 for (unsigned j = 0; j != 8; ++j) {
29648 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
29649 MVT::i16));
29650 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
29651 MVT::i16));
29652 }
29653 }
29654
29655 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29656 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29657 } else {
29658 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
29659 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
29660 }
29661
29662 // Multiply, mask the lower 8bits of the lo/hi results and pack.
29663 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
29664 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
29665 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29666 }
29667
29668 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
29669 if (VT == MVT::v4i32) {
29670 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
29671 "Should not custom lower when pmulld is available!");
29672
29673 // Extract the odd parts.
29674 static const int UnpackMask[] = {1, 1, 3, 3};
29675 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
29676 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
29677
29678 // Multiply the even parts.
29679 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29680 DAG.getBitcast(MVT::v2i64, A),
29681 DAG.getBitcast(MVT::v2i64, B));
29682 // Now multiply odd parts.
29683 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29684 DAG.getBitcast(MVT::v2i64, Aodds),
29685 DAG.getBitcast(MVT::v2i64, Bodds));
29686
29687 Evens = DAG.getBitcast(VT, Evens);
29688 Odds = DAG.getBitcast(VT, Odds);
29689
29690 // Merge the two vectors back together with a shuffle. This expands into 2
29691 // shuffles.
29692 static const int ShufMask[] = { 0, 4, 2, 6 };
29693 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
29694 }
29695
29696 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
29697 "Only know how to lower V2I64/V4I64/V8I64 multiply");
29698 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
29699
29700 // Ahi = psrlqi(a, 32);
29701 // Bhi = psrlqi(b, 32);
29702 //
29703 // AloBlo = pmuludq(a, b);
29704 // AloBhi = pmuludq(a, Bhi);
29705 // AhiBlo = pmuludq(Ahi, b);
29706 //
29707 // Hi = psllqi(AloBhi + AhiBlo, 32);
29708 // return AloBlo + Hi;
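  // This follows from writing a = Alo + (Ahi << 32) and b = Blo + (Bhi << 32):
  //   a * b (mod 2^64) = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32),
  // since the Ahi*Bhi term is shifted entirely out of the low 64 bits.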
29709 KnownBits AKnown = DAG.computeKnownBits(A);
29710 KnownBits BKnown = DAG.computeKnownBits(B);
29711
29712 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
29713 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
29714 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
29715
29716 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
29717 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
29718 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
29719
29720 SDValue Zero = DAG.getConstant(0, dl, VT);
29721
29722 // Only multiply lo/hi halves that aren't known to be zero.
29723 SDValue AloBlo = Zero;
29724 if (!ALoIsZero && !BLoIsZero)
29725 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
29726
29727 SDValue AloBhi = Zero;
29728 if (!ALoIsZero && !BHiIsZero) {
29729 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
29730 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
29731 }
29732
29733 SDValue AhiBlo = Zero;
29734 if (!AHiIsZero && !BLoIsZero) {
29735 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
29736 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
29737 }
29738
29739 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
29740 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
29741
29742 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
29743}
29744
29746 MVT VT, bool IsSigned,
29747 const X86Subtarget &Subtarget,
29748 SelectionDAG &DAG,
29749 SDValue *Low = nullptr) {
29750 unsigned NumElts = VT.getVectorNumElements();
29751
29752 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
29753 // to a vXi16 type. Do the multiplies, shift the results and pack the half
29754 // lane results back together.
29755
29756 // We'll take different approaches for signed and unsigned.
29757 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
29758 // and use pmullw to calculate the full 16-bit product.
29759 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
29760 // shift them left into the upper byte of each word. This allows us to use
29761 // pmulhw to calculate the full 16-bit product. This trick means we don't
29762 // need to sign extend the bytes to use pmullw.
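  // The signed trick works because unpacking byte a into the high byte of a
  // word yields the 16-bit value a << 8, so
  //   pmulhw(a << 8, b << 8) = (a * b * 65536) >> 16 = a * b,
  // the full 16-bit signed product, without a separate sign extension step.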
29763
29764 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29765 SDValue Zero = DAG.getConstant(0, dl, VT);
29766
29767 SDValue ALo, AHi;
29768 if (IsSigned) {
29769 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29770 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29771 } else {
29772 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29773 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29774 }
29775
29776 SDValue BLo, BHi;
29777 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29778 // If the RHS is a constant, manually unpackl/unpackh and extend.
29779 SmallVector<SDValue, 16> LoOps, HiOps;
29780 for (unsigned i = 0; i != NumElts; i += 16) {
29781 for (unsigned j = 0; j != 8; ++j) {
29782 SDValue LoOp = B.getOperand(i + j);
29783 SDValue HiOp = B.getOperand(i + j + 8);
29784
29785 if (IsSigned) {
29786 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
29787 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
29788 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
29789 DAG.getConstant(8, dl, MVT::i16));
29790 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
29791 DAG.getConstant(8, dl, MVT::i16));
29792 } else {
29793 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
29794 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
29795 }
29796
29797 LoOps.push_back(LoOp);
29798 HiOps.push_back(HiOp);
29799 }
29800 }
29801
29802 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29803 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29804 } else if (IsSigned) {
29805 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29806 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
29807 } else {
29808 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29809 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
29810 }
29811
29812 // Multiply, then take the upper 8 bits of each 16-bit lo/hi result and
29813 // pack back to vXi8.
29814 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
29815 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
29816 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
29817
29818 if (Low)
29819 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29820
29821 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
29822}
29823
29824static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
29825 SelectionDAG &DAG) {
29826 SDLoc dl(Op);
29827 MVT VT = Op.getSimpleValueType();
29828 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29829 unsigned NumElts = VT.getVectorNumElements();
29830 SDValue A = Op.getOperand(0);
29831 SDValue B = Op.getOperand(1);
29832
29833 // Decompose 256-bit ops into 128-bit ops.
29834 if (VT.is256BitVector() && !Subtarget.hasInt256())
29835 return splitVectorIntBinary(Op, DAG, dl);
29836
29837 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29838 return splitVectorIntBinary(Op, DAG, dl);
29839
29840 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
29841 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
29842 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
29843 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
29844
29845 // PMULxD operations multiply each even value (starting at 0) of LHS with
29846 // the related value of RHS and produce a widened result.
29847 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29848 // => <2 x i64> <ae|cg>
29849 //
29850 // In other words, to have all the results, we need to perform two PMULxD:
29851 // 1. one with the even values.
29852 // 2. one with the odd values.
29853 // To achieve #2, we need to place the odd values at an even position.
29854 //
29855 // Place the odd value at an even position (basically, shift all values 1
29856 // step to the left):
29857 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29858 9, -1, 11, -1, 13, -1, 15, -1};
29859 // <a|b|c|d> => <b|undef|d|undef>
29860 SDValue Odd0 =
29861 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
29862 // <e|f|g|h> => <f|undef|h|undef>
29863 SDValue Odd1 =
29864 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
29865
29866 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
29867 // ints.
29868 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
29869 unsigned Opcode =
29870 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
29871 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29872 // => <2 x i64> <ae|cg>
29873 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29874 DAG.getBitcast(MulVT, A),
29875 DAG.getBitcast(MulVT, B)));
29876 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
29877 // => <2 x i64> <bf|dh>
29878 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29879 DAG.getBitcast(MulVT, Odd0),
29880 DAG.getBitcast(MulVT, Odd1)));
29881
29882 // Shuffle it back into the right order.
29883 SmallVector<int, 16> ShufMask(NumElts);
29884 for (int i = 0; i != (int)NumElts; ++i)
29885 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
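    // For v4i32 this produces {1, 5, 3, 7}, i.e. the high i32 halves of Mul1
    // and Mul2 interleaved back into element order:
    //   <ae_hi, bf_hi, cg_hi, dh_hi>.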
29886
29887 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
29888
29889 // If we have a signed multiply but no PMULDQ fix up the result of an
29890 // unsigned multiply.
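    // The correction applied below is
    //   mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0),
    // which follows from a_signed = a_unsigned - 2^32 * (a < 0) (and likewise
    // for b), after dropping the term that is a multiple of 2^64.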
29891 if (IsSigned && !Subtarget.hasSSE41()) {
29892 SDValue Zero = DAG.getConstant(0, dl, VT);
29893 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
29894 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
29895 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
29896 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
29897
29898 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
29899 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
29900 }
29901
29902 return Res;
29903 }
29904
29905 // Only i8 vectors should need custom lowering after this.
29906 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29907 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29908 "Unsupported vector type");
29909
29910 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
29911 // logical shift down the upper half and pack back to i8.
29912
29913 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29914 // and then ashr/lshr the upper bits down to the lower bits before multiply.
29915
29916 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29917 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29918 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29919 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29920 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29921 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29922 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29923 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29924 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29925 }
29926
29927 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
29928}
29929
29930// Custom lowering for SMULO/UMULO.
29931static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
29932 SelectionDAG &DAG) {
29933 MVT VT = Op.getSimpleValueType();
29934
29935 // Scalars defer to LowerXALUO.
29936 if (!VT.isVector())
29937 return LowerXALUO(Op, DAG);
29938
29939 SDLoc dl(Op);
29940 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29941 SDValue A = Op.getOperand(0);
29942 SDValue B = Op.getOperand(1);
29943 EVT OvfVT = Op->getValueType(1);
29944
29945 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
29946 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
29947 // Extract the LHS Lo/Hi vectors
29948 SDValue LHSLo, LHSHi;
29949 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
29950
29951 // Extract the RHS Lo/Hi vectors
29952 SDValue RHSLo, RHSHi;
29953 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
29954
29955 EVT LoOvfVT, HiOvfVT;
29956 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
29957 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
29958 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
29959
29960 // Issue the split operations.
29961 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
29962 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
29963
29964 // Join the separate data results and the overflow results.
29965 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29966 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
29967 Hi.getValue(1));
29968
29969 return DAG.getMergeValues({Res, Ovf}, dl);
29970 }
29971
29972 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29973 EVT SetccVT =
29974 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29975
29976 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29977 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29978 unsigned NumElts = VT.getVectorNumElements();
29979 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29980 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29981 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29982 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29983 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29984
29985 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29986
29987 SDValue Ovf;
29988 if (IsSigned) {
29989 SDValue High, LowSign;
29990 if (OvfVT.getVectorElementType() == MVT::i1 &&
29991 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29992 // Rather than truncating, try to do the compare on vXi16 or vXi32.
29993 // Shift the high down filling with sign bits.
29994 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29995 // Fill all 16 bits with the sign bit from the low.
29996 LowSign =
29997 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29998 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29999 15, DAG);
30000 SetccVT = OvfVT;
30001 if (!Subtarget.hasBWI()) {
30002 // We can't do a vXi16 compare so sign extend to v16i32.
30003 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
30004 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
30005 }
30006 } else {
30007 // Otherwise do the compare at vXi8.
30008 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30009 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30010 LowSign =
30011 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30012 }
30013
30014 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30015 } else {
30016 SDValue High =
30017 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30018 if (OvfVT.getVectorElementType() == MVT::i1 &&
30019 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30020 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30021 SetccVT = OvfVT;
30022 if (!Subtarget.hasBWI()) {
30023 // We can't do a vXi16 compare so sign extend to v16i32.
30024 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30025 }
30026 } else {
30027 // Otherwise do the compare at vXi8.
30028 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30029 }
30030
30031 Ovf =
30032 DAG.getSetCC(dl, SetccVT, High,
30033 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30034 }
30035
30036 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30037
30038 return DAG.getMergeValues({Low, Ovf}, dl);
30039 }
30040
30041 SDValue Low;
30042 SDValue High =
30043 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30044
30045 SDValue Ovf;
30046 if (IsSigned) {
30047 // SMULO overflows if the high bits don't match the sign of the low.
30048 SDValue LowSign =
30049 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30050 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30051 } else {
30052 // UMULO overflows if the high bits are non-zero.
30053 Ovf =
30054 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30055 }
30056
30057 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30058
30059 return DAG.getMergeValues({Low, Ovf}, dl);
30060}
30061
30062SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30063 assert(Subtarget.isTargetWin64() && "Unexpected target");
30064 EVT VT = Op.getValueType();
30065 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30066 "Unexpected return type for lowering");
30067
30068 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30069    SmallVector<SDValue> Result;
30070    if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30071 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30072 }
30073
30074 RTLIB::Libcall LC;
30075 bool isSigned;
30076 switch (Op->getOpcode()) {
30077 // clang-format off
30078 default: llvm_unreachable("Unexpected request for libcall!");
30079 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30080 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30081 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30082 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30083 // clang-format on
30084 }
30085
30086 SDLoc dl(Op);
30087 SDValue InChain = DAG.getEntryNode();
30088
30089  TargetLowering::ArgListTy Args;
30090  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30091 EVT ArgVT = Op->getOperand(i).getValueType();
30092 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30093 "Unexpected argument type for lowering");
30094 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30095 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30096 MachinePointerInfo MPI =
30097        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30098    InChain =
30099 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30100 Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0));
30101 }
30102
30103  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30104                                         getPointerTy(DAG.getDataLayout()));
30105
30106 TargetLowering::CallLoweringInfo CLI(DAG);
30107 CLI.setDebugLoc(dl)
30108 .setChain(InChain)
30109 .setLibCallee(
30110          getLibcallCallingConv(LC),
30111          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30112 std::move(Args))
30113 .setInRegister()
30114 .setSExtResult(isSigned)
30115 .setZExtResult(!isSigned);
30116
30117 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30118 return DAG.getBitcast(VT, CallInfo.first);
30119}
30120
30121SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30122 SelectionDAG &DAG,
30123 SDValue &Chain) const {
30124 assert(Subtarget.isTargetWin64() && "Unexpected target");
30125 EVT VT = Op.getValueType();
30126 bool IsStrict = Op->isStrictFPOpcode();
30127
30128 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30129 EVT ArgVT = Arg.getValueType();
30130
30131 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30132 "Unexpected return type for lowering");
30133
30134 RTLIB::Libcall LC;
30135 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30136 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30137 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30138 else
30139 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30140 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30141
30142 SDLoc dl(Op);
30143 MakeLibCallOptions CallOptions;
30144 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30145
30146  SDValue Result;
30147  // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
30148 // expected VT (i128).
30149 std::tie(Result, Chain) =
30150 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30151 Result = DAG.getBitcast(VT, Result);
30152 return Result;
30153}
30154
30155SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30156 SelectionDAG &DAG) const {
30157 assert(Subtarget.isTargetWin64() && "Unexpected target");
30158 EVT VT = Op.getValueType();
30159 bool IsStrict = Op->isStrictFPOpcode();
30160
30161 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30162 EVT ArgVT = Arg.getValueType();
30163
30164 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30165 "Unexpected argument type for lowering");
30166
30167 RTLIB::Libcall LC;
30168 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30169 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30170 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30171 else
30172 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30173 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30174
30175 SDLoc dl(Op);
30176 MakeLibCallOptions CallOptions;
30177 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30178
30179 // Pass the i128 argument as an indirect argument on the stack.
30180 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30181 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30182 MachinePointerInfo MPI =
30183      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30184  Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30185
30186  SDValue Result;
30187  std::tie(Result, Chain) =
30188 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30189 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30190}
30191
30192// Return true if the required (according to Opcode) shift-imm form is natively
30193// supported by the Subtarget
30194static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
30195 unsigned Opcode) {
30196 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30197 "Unexpected shift opcode");
30198
30199 if (!VT.isSimple())
30200 return false;
30201
30202 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30203 return false;
30204
30205 if (VT.getScalarSizeInBits() < 16)
30206 return false;
30207
30208 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30209 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30210 return true;
30211
30212 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30213 (VT.is256BitVector() && Subtarget.hasInt256());
30214
30215 bool AShift = LShift && (Subtarget.hasAVX512() ||
30216 (VT != MVT::v2i64 && VT != MVT::v4i64));
30217 return (Opcode == ISD::SRA) ? AShift : LShift;
30218}
30219
30220// The shift amount is a variable, but it is the same for all vector lanes.
30221// These instructions are defined together with shift-immediate.
30222static
30223 bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
30224                                       unsigned Opcode) {
30225 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30226}
30227
30228// Return true if the required (according to Opcode) variable-shift form is
30229// natively supported by the Subtarget
30230static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
30231 unsigned Opcode) {
30232 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30233 "Unexpected shift opcode");
30234
30235 if (!VT.isSimple())
30236 return false;
30237
30238 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30239 return false;
30240
30241 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30242 return false;
30243
30244 // vXi16 supported only on AVX-512, BWI
30245 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30246 return false;
30247
30248 if (Subtarget.hasAVX512() &&
30249 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30250 return true;
30251
30252 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30253 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30254 return (Opcode == ISD::SRA) ? AShift : LShift;
30255}
30256
30257 static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30258                                            const X86Subtarget &Subtarget) {
30259 MVT VT = Op.getSimpleValueType();
30260 SDLoc dl(Op);
30261 SDValue R = Op.getOperand(0);
30262 SDValue Amt = Op.getOperand(1);
30263 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30264 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30265
30266 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30267 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30268 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30269 SDValue Ex = DAG.getBitcast(ExVT, R);
30270
30271 // ashr(R, 63) === cmp_slt(R, 0)
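    // (Shifting the sign bit across all 64 bits gives all-ones exactly when R
    // is negative, which per lane is what PCMPGT(0, R) computes.)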
30272 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30273 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30274 "Unsupported PCMPGT op");
30275 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30276 }
30277
30278 if (ShiftAmt >= 32) {
30279 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
30280 SDValue Upper =
30281 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30282      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30283                                                 ShiftAmt - 32, DAG);
30284 if (VT == MVT::v2i64)
30285 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30286 if (VT == MVT::v4i64)
30287 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30288 {9, 1, 11, 3, 13, 5, 15, 7});
30289 } else {
30290 // SRA upper i32, SRL whole i64 and select lower i32.
30291      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30292                                                 ShiftAmt, DAG);
30293 SDValue Lower =
30294 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30295 Lower = DAG.getBitcast(ExVT, Lower);
30296 if (VT == MVT::v2i64)
30297 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30298 if (VT == MVT::v4i64)
30299 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30300 {8, 1, 10, 3, 12, 5, 14, 7});
30301 }
30302 return DAG.getBitcast(VT, Ex);
30303 };
30304
30305 // Optimize shl/srl/sra with constant shift amount.
30306 APInt APIntShiftAmt;
30307 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30308 return SDValue();
30309
30310 // If the shift amount is out of range, return undef.
30311 if (APIntShiftAmt.uge(EltSizeInBits))
30312 return DAG.getUNDEF(VT);
30313
30314 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30315
30316 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
30317 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30318
30319 // i64 SRA needs to be performed as partial shifts.
30320 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30321 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30322 Op.getOpcode() == ISD::SRA)
30323 return ArithmeticShiftRight64(ShiftAmt);
30324
30325  // If we're logical shifting an all-signbits value then we can just perform it
30326  // as a mask.
30327 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
30328 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
30329 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
30330 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
30331 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
30332 }
30333
30334 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30335 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30336 unsigned NumElts = VT.getVectorNumElements();
30337 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30338
30339 // Simple i8 add case
30340 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30341 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30342 // must be 0). (add undef, undef) however can be any value. To make this
30343 // safe, we must freeze R to ensure that register allocation uses the same
30344 // register for an undefined value. This ensures that the result will
30345 // still be even and preserves the original semantics.
30346 R = DAG.getFreeze(R);
30347 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30348 }
30349
30350 // ashr(R, 7) === cmp_slt(R, 0)
30351 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30352 SDValue Zeros = DAG.getConstant(0, dl, VT);
30353 if (VT.is512BitVector()) {
30354 assert(VT == MVT::v64i8 && "Unexpected element type!");
30355 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30356 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30357 }
30358 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30359 }
30360
30361 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30362 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30363 return SDValue();
30364
30365 if (Subtarget.hasGFNI()) {
30366 SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt);
30367 return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
30368 DAG.getTargetConstant(0, dl, MVT::i8));
30369 }
30370
30371 if (Op.getOpcode() == ISD::SHL) {
30372 // Make a large shift.
30373 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30374 ShiftAmt, DAG);
30375 SHL = DAG.getBitcast(VT, SHL);
30376 // Zero out the rightmost bits.
30377 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30378 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30379 }
30380 if (Op.getOpcode() == ISD::SRL) {
30381 // Make a large shift.
30382 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30383 ShiftAmt, DAG);
30384 SRL = DAG.getBitcast(VT, SRL);
30385 // Zero out the leftmost bits.
30386 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30387 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30388 }
30389 if (Op.getOpcode() == ISD::SRA) {
30390 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
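      // e.g. for i8 with ShiftAmt = 2 and R = 0x80: lshr gives 0x20, Mask =
      // 128 >> 2 = 0x20, xor gives 0x00 and sub gives 0xE0 == ashr(0x80, 2);
      // for non-negative inputs the xor/sub pair cancels out.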
30391 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30392
30393 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30394 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30395 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30396 return Res;
30397 }
30398 llvm_unreachable("Unknown shift opcode.");
30399 }
30400
30401 return SDValue();
30402}
30403
30404 static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
30405                                           const X86Subtarget &Subtarget) {
30406 MVT VT = Op.getSimpleValueType();
30407 SDLoc dl(Op);
30408 SDValue R = Op.getOperand(0);
30409 SDValue Amt = Op.getOperand(1);
30410 unsigned Opcode = Op.getOpcode();
30411 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
30412
30413 int BaseShAmtIdx = -1;
30414 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
30415 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
30416 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
30417 Subtarget, DAG);
30418
30419 // vXi8 shifts - shift as v8i16 + mask result.
30420 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30421 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30422 VT == MVT::v64i8) &&
30423 !Subtarget.hasXOP()) {
30424 unsigned NumElts = VT.getVectorNumElements();
30425 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30426 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
30427 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30428 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30429
30430 // Create the mask using vXi16 shifts. For shift-rights we need to move
30431 // the upper byte down before splatting the vXi8 mask.
30432 SDValue BitMask = DAG.getAllOnesConstant(dl, ExtVT);
30433 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30434 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30435 if (Opcode != ISD::SHL)
30436 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30437 8, DAG);
30438 BitMask = DAG.getBitcast(VT, BitMask);
30439 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30440 SmallVector<int, 64>(NumElts, 0));
30441
30442 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30443 DAG.getBitcast(ExtVT, R), BaseShAmt,
30444 BaseShAmtIdx, Subtarget, DAG);
30445 Res = DAG.getBitcast(VT, Res);
30446 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30447
30448 if (Opcode == ISD::SRA) {
30449 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30450 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30451 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30452 SignMask =
30453 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30454 BaseShAmtIdx, Subtarget, DAG);
30455 SignMask = DAG.getBitcast(VT, SignMask);
30456 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30457 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30458 }
30459 return Res;
30460 }
30461 }
30462 }
30463
30464 return SDValue();
30465}
30466
30467// Convert a shift/rotate left amount to a multiplication scale factor.
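// e.g. for v8i16 an amount vector of <0, 1, 2, 3, ...> becomes the scale
// vector <1, 2, 4, 8, ...>, letting callers lower shl(x, amt) as mul(x, scale).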
30468 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
30469                                        const X86Subtarget &Subtarget,
30470 SelectionDAG &DAG) {
30471 MVT VT = Amt.getSimpleValueType();
30472 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30473 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30474 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30475 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30476 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30477 (Subtarget.hasBWI() && VT == MVT::v64i8)))
30478 return SDValue();
30479
30480 MVT SVT = VT.getVectorElementType();
30481 unsigned SVTBits = SVT.getSizeInBits();
30482 unsigned NumElems = VT.getVectorNumElements();
30483
30484 APInt UndefElts;
30485 SmallVector<APInt> EltBits;
30486 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30487 APInt One(SVTBits, 1);
30488 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30489 for (unsigned I = 0; I != NumElems; ++I) {
30490 if (UndefElts[I] || EltBits[I].uge(SVTBits))
30491 continue;
30492 uint64_t ShAmt = EltBits[I].getZExtValue();
30493 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30494 }
30495 return DAG.getBuildVector(VT, dl, Elts);
30496 }
30497
30498 // If the target doesn't support variable shifts, use either FP conversion
30499 // or integer multiplication to avoid shifting each element individually.
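  // For v4i32 the per-element scale (1 << Amt) is built through the FP
  // exponent field: (Amt << 23) + 0x3f800000 is the IEEE-754 encoding of
  // 2.0^Amt, so converting back to integer recovers the scale without a
  // variable vector shift.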
30500 if (VT == MVT::v4i32) {
30501 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30502 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30503 DAG.getConstant(0x3f800000U, dl, VT));
30504 Amt = DAG.getBitcast(MVT::v4f32, Amt);
30505 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30506 }
30507
30508 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
30509 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30510 SDValue Z = DAG.getConstant(0, dl, VT);
30511 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30512 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30513 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30514 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30515 if (Subtarget.hasSSE41())
30516 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30517 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30518 }
30519
30520 return SDValue();
30521}
30522
30523static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30524 SelectionDAG &DAG) {
30525 MVT VT = Op.getSimpleValueType();
30526 SDLoc dl(Op);
30527 SDValue R = Op.getOperand(0);
30528 SDValue Amt = Op.getOperand(1);
30529 unsigned NumElts = VT.getVectorNumElements();
30530 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30531 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30532
30533 unsigned Opc = Op.getOpcode();
30534 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30535 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30536
30537 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30538 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30539
30540 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30541 return V;
30542
30543 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30544 return V;
30545
30546 if (supportedVectorVarShift(VT, Subtarget, Opc))
30547 return Op;
30548
30549 // i64 vector arithmetic shift can be emulated with the transform:
30550 // M = lshr(SIGN_MASK, Amt)
30551 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
30552 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30553 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30554 Opc == ISD::SRA) {
30555 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30556 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30557 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30558 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30559 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30560 return R;
30561 }
30562
30563 // XOP has 128-bit variable logical/arithmetic shifts.
30564 // +ve/-ve Amt = shift left/right.
30565 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30566 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30567 if (Opc == ISD::SRL || Opc == ISD::SRA)
30568 Amt = DAG.getNegative(Amt, dl, VT);
30569 if (Opc == ISD::SHL || Opc == ISD::SRL)
30570 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30571 if (Opc == ISD::SRA)
30572 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30573 }
30574
30575 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
30576 // shifts per-lane and then shuffle the partial results back together.
30577 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30578 // Splat the shift amounts so the scalar shifts above will catch it.
30579 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30580 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30581 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30582 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30583 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30584 }
30585
30586  // Build a map of in-range constant amounts with an element mask of where they occur.
30587  SmallMapVector<unsigned, APInt, 8> UniqueCstAmt;
30588  if (ConstantAmt) {
30589 for (unsigned I = 0; I != NumElts; ++I) {
30590 SDValue A = Amt.getOperand(I);
30591 if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
30592 continue;
30593 unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
30594 auto [It, Inserted] = UniqueCstAmt.try_emplace(CstAmt);
30595 if (!Inserted) {
30596 It->second.setBit(I);
30597 continue;
30598 }
30599 It->second = APInt::getOneBitSet(NumElts, I);
30600 }
30601 assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
30602 }
30603
30604 // If possible, lower this shift as a sequence of two shifts by
30605 // constant plus a BLENDing shuffle instead of scalarizing it.
30606 // Example:
30607 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30608 //
30609 // Could be rewritten as:
30610 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30611 //
30612 // The advantage is that the two shifts from the example would be
30613 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30614 if (UniqueCstAmt.size() == 2 &&
30615 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30616 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30617 unsigned AmtA = UniqueCstAmt.begin()->first;
30618 unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
30619 const APInt &MaskA = UniqueCstAmt.begin()->second;
30620 const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
30621 SmallVector<int, 8> ShuffleMask(NumElts, SM_SentinelUndef);
30622 for (unsigned I = 0; I != NumElts; ++I) {
30623 if (MaskA[I])
30624 ShuffleMask[I] = I;
30625 if (MaskB[I])
30626 ShuffleMask[I] = I + NumElts;
30627 }
30628
30629 // Only perform this blend if we can perform it without loading a mask.
30630 if ((VT != MVT::v16i16 ||
30631 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30632 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30633 canWidenShuffleElements(ShuffleMask))) {
30634 SDValue Shift1 =
30635 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtA, dl, VT));
30636 SDValue Shift2 =
30637 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtB, dl, VT));
30638 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30639 }
30640 }
30641
30642 // Constant ISD::SRA/SRL/SHL can be performed efficiently on vXiN vectors by
30643 // using vYiM vector operations where X*N == Y*M and M > N.
30644 if (ConstantAmt &&
30645 (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30646 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16) &&
30647 !Subtarget.hasXOP()) {
30648 MVT NarrowScalarVT = VT.getScalarType();
30649    // We can do this extra fast, SWAR style, if each pair of narrow elements
30650    // is shifted by the same amount: use a single wider shift to move the
30651    // valid bits into position, then mask out any bits which crossed from one
30652    // element to the other.
30653    // This optimized lowering is only valid if the elements in a pair can
30654    // be treated identically.
30655 SmallVector<SDValue, 32> AmtWideElts(Amt->ops());
30656 SmallVector<SDValue, 32> TmpAmtWideElts;
30657 int WideEltSizeInBits = EltSizeInBits;
30658 while (WideEltSizeInBits < 32) {
30659 // AVX1 does not have psrlvd, etc. which makes interesting 32-bit shifts
30660 // unprofitable.
30661 if (WideEltSizeInBits >= 16 && !Subtarget.hasAVX2()) {
30662 break;
30663 }
30664 TmpAmtWideElts.resize(AmtWideElts.size() / 2);
30665 bool SameShifts = true;
30666 for (unsigned SrcI = 0, E = AmtWideElts.size(); SrcI != E; SrcI += 2) {
30667 unsigned DstI = SrcI / 2;
30668 // Both elements are undef? Make a note and keep going.
30669 if (AmtWideElts[SrcI].isUndef() && AmtWideElts[SrcI + 1].isUndef()) {
30670 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30671 continue;
30672 }
30673 // Even element is undef? We will shift it by the same shift amount as
30674 // the odd element.
30675 if (AmtWideElts[SrcI].isUndef()) {
30676 TmpAmtWideElts[DstI] = AmtWideElts[SrcI + 1];
30677 continue;
30678 }
30679 // Odd element is undef? We will shift it by the same shift amount as
30680 // the even element.
30681 if (AmtWideElts[SrcI + 1].isUndef()) {
30682 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30683 continue;
30684 }
30685 // Both elements are equal.
30686 if (AmtWideElts[SrcI].getNode()->getAsAPIntVal() ==
30687 AmtWideElts[SrcI + 1].getNode()->getAsAPIntVal()) {
30688 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30689 continue;
30690 }
30691 // One of the provisional wide elements will not have the same shift
30692 // amount. Let's bail.
30693 SameShifts = false;
30694 break;
30695 }
30696 if (!SameShifts) {
30697 break;
30698 }
30699 WideEltSizeInBits *= 2;
30700 std::swap(TmpAmtWideElts, AmtWideElts);
30701 }
30702 APInt APIntShiftAmt;
30703 bool IsConstantSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30704 bool Profitable = WidenShift;
30705 // AVX512BW brings support for vpsllvw.
30706 if (WideEltSizeInBits * AmtWideElts.size() >= 512 &&
30707 WideEltSizeInBits < 32 && !Subtarget.hasBWI()) {
30708 Profitable = false;
30709 }
30710 // Leave AVX512 uniform arithmetic shifts alone, they can be implemented
30711 // fairly cheaply in other ways.
30712 if (WideEltSizeInBits * AmtWideElts.size() >= 512 && IsConstantSplat) {
30713 Profitable = false;
30714 }
30715 // Leave it up to GFNI if we have it around.
30716 // TODO: gf2p8affine is usually higher latency and more port restricted. It
30717 // is probably a win to use other strategies in some cases.
30718 if (EltSizeInBits == 8 && Subtarget.hasGFNI()) {
30719 Profitable = false;
30720 }
30721
30722 // AVX1 does not have vpand which makes our masking impractical. It does
30723 // have vandps but that is an FP instruction and crossing FP<->int typically
30724 // has some cost.
30725 if (WideEltSizeInBits * AmtWideElts.size() >= 256 &&
30726 (WideEltSizeInBits < 32 || IsConstantSplat) && !Subtarget.hasAVX2()) {
30727 Profitable = false;
30728 }
30729 unsigned WideNumElts = AmtWideElts.size();
30730 // We are only dealing with identical pairs.
30731 if (Profitable && WideNumElts != NumElts) {
30732 MVT WideScalarVT = MVT::getIntegerVT(WideEltSizeInBits);
30733 MVT WideVT = MVT::getVectorVT(WideScalarVT, WideNumElts);
30734 // Cast the operand to vXiM.
30735 SDValue RWide = DAG.getBitcast(WideVT, R);
30736 // Create our new vector of shift amounts.
30737 SDValue AmtWide = DAG.getBuildVector(
30738 MVT::getVectorVT(NarrowScalarVT, WideNumElts), dl, AmtWideElts);
30739 AmtWide = DAG.getZExtOrTrunc(AmtWide, dl, WideVT);
30740 // Perform the actual shift.
30741 unsigned LogicalOpc = Opc == ISD::SRA ? (unsigned)ISD::SRL : Opc;
30742 SDValue ShiftedR = DAG.getNode(LogicalOpc, dl, WideVT, RWide, AmtWide);
30743 // Now we need to construct a mask which will "drop" bits that get
30744 // shifted past the LSB/MSB. For a logical shift left, it will look
30745 // like:
30746 // FullMask = (1 << EltSizeInBits) - 1
30747 // Mask = FullMask << Amt
30748 //
30749 // This masking ensures that bits cannot migrate from one narrow lane to
30750 // another. The construction of this mask will be constant folded.
30751 // The mask for a logical right shift is nearly identical, the only
30752 // difference is that the all ones mask is shifted right instead of left.
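      // e.g. for i8 lanes paired into i16 and a left shift by 3 in both
      // halves: FullMask = 0xFF per lane, so Mask = 0xF8F8, which clears the
      // three low bits of the upper byte that received bits from the lower
      // byte.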
30753 SDValue SplatFullMask = DAG.getAllOnesConstant(dl, VT);
30754 SDValue Mask = DAG.getNode(LogicalOpc, dl, VT, SplatFullMask, Amt);
30755 Mask = DAG.getBitcast(WideVT, Mask);
30756 // Finally, we mask the shifted vector with the SWAR mask.
30757 SDValue Masked = DAG.getNode(ISD::AND, dl, WideVT, ShiftedR, Mask);
30758 Masked = DAG.getBitcast(VT, Masked);
30759 if (Opc != ISD::SRA) {
30760 // Logical shifts are complete at this point.
30761 return Masked;
30762 }
30763 // At this point, we have done a *logical* shift right. We now need to
30764 // sign extend the result so that we get behavior equivalent to an
30765 // arithmetic shift right. Post-shifting by AmtWide, our narrow elements
30766 // are `EltSizeInBits-AmtWide` bits wide.
30767 //
30768 // To convert our `EltSizeInBits-AmtWide` bit unsigned numbers to signed
30769 // numbers as wide as `EltSizeInBits`, we need to replicate the bit at
30770 // position `EltSizeInBits-AmtWide` into the MSBs of each narrow lane. We
30771 // can use the following trick to accomplish this:
30772 // SignBitMask = 1 << (EltSizeInBits-AmtWide-1)
30773 // (Masked ^ SignBitMask) - SignBitMask
30774 //
30775 // When the sign bit is already clear, this will compute:
30776 // Masked + SignBitMask - SignBitMask
30777 //
30778 // This is equal to Masked which is what we want: the sign bit was clear
30779 // so sign extending should be a no-op.
30780 //
30781 // When the sign bit is set, this will compute:
30782 // Masked - SignBitmask - SignBitMask
30783 //
30784 // This is equal to Masked - 2*SignBitMask which will correctly sign
30785 // extend our result.
30786 SDValue SplatHighBit =
30787 DAG.getConstant(APInt::getSignMask(EltSizeInBits), dl, VT);
30788 // This does not induce recursion, all operands are constants.
30789 SDValue SignBitMask = DAG.getNode(LogicalOpc, dl, VT, SplatHighBit, Amt);
30790 SDValue FlippedSignBit =
30791 DAG.getNode(ISD::XOR, dl, VT, Masked, SignBitMask);
30792 SDValue Subtraction =
30793 DAG.getNode(ISD::SUB, dl, VT, FlippedSignBit, SignBitMask);
30794 return Subtraction;
30795 }
30796 }
30797
30798 // If possible, lower this packed shift into a vector multiply instead of
30799 // expanding it into a sequence of scalar shifts.
30800 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30801 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30802 Subtarget.canExtendTo512BW())))
30803 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30804 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30805
30806 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30807 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
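  // e.g. for a v8i16 lane shifted right by 3: Scale = 1 << (16 - 3) = 0x2000
  // and mulhu(x, 0x2000) = (x * 0x2000) >> 16 = x >> 3; zero shift amounts
  // are handled by the select below.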
30808 if (Opc == ISD::SRL && ConstantAmt &&
30809 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30810 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30811 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30812 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30813 SDValue Zero = DAG.getConstant(0, dl, VT);
30814 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30815 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30816 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30817 }
30818 }
30819
30820 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30821 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30822 // TODO: Special case handling for shift by 0/1, really we can afford either
30823 // of these cases in pre-SSE41/XOP/AVX512 but not both.
30824 if (Opc == ISD::SRA && ConstantAmt &&
30825 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30826 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30827 !Subtarget.hasAVX512()) ||
30828 DAG.isKnownNeverZero(Amt))) {
30829 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30830 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30831 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30832 SDValue Amt0 =
30833 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30834 SDValue Amt1 =
30835 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30836 SDValue Sra1 =
30837 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30838 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30839 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30840 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30841 }
30842 }
30843
30844 // v4i32 Non Uniform Shifts.
30845 // If the shift amount is constant we can shift each lane using the SSE2
30846 // immediate shifts, else we need to zero-extend each lane to the lower i64
30847 // and shift using the SSE2 variable shifts.
30848 // The separate results can then be blended together.
30849 if (VT == MVT::v4i32) {
30850 SDValue Amt0, Amt1, Amt2, Amt3;
30851 if (ConstantAmt) {
30852 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30853 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30854 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30855 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30856 } else {
30857 // The SSE2 shifts use the lower i64 as the same shift amount for
30858 // all lanes and the upper i64 is ignored. On AVX we're better off
30859 // just zero-extending, but for SSE just duplicating the top 16-bits is
30860 // cheaper and has the same effect for out of range values.
30861 if (Subtarget.hasAVX()) {
30862 SDValue Z = DAG.getConstant(0, dl, VT);
30863 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30864 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30865 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30866 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30867 } else {
30868 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30869 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30870 {4, 5, 6, 7, -1, -1, -1, -1});
30871 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30872 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30873 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30874 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30875 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30876 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30877 }
30878 }
30879
30880 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30881 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30882 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30883 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30884 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30885
30886 // Merge the shifted lane results optimally with/without PBLENDW.
30887 // TODO - ideally shuffle combining would handle this.
30888 if (Subtarget.hasSSE41()) {
30889 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30890 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30891 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30892 }
30893 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30894 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30895 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30896 }
30897
30898 // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30899 // look up the pre-computed shift values.
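  // e.g. if a 128-bit lane of R is the splat constant C, its 16-byte table
  // holds C shifted by 0..7 followed by zeros, and PSHUFB then picks the
  // pre-shifted value using each byte of Amt as the index (out-of-range
  // amounts would be poison anyway).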
30900 if ((VT == MVT::v16i8 && Subtarget.hasSSSE3()) ||
30901 (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30902 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30903 unsigned NumLanes = VT.getSizeInBits() / 128u;
30904 unsigned NumEltsPerLane = NumElts / NumLanes;
30905    SmallVector<APInt, 16> LUT;
30906    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
30907 unsigned LoElt = Lane * NumEltsPerLane;
30908 APInt EltMask = APInt::getBitsSet(NumElts, LoElt, LoElt + NumEltsPerLane);
30909 KnownBits KnownLane = DAG.computeKnownBits(R, EltMask);
30910 if (!KnownLane.isConstant())
30911 break;
30912 const APInt &LaneSplat = KnownLane.getConstant();
30913 for (unsigned I = 0; I != 8; ++I) {
30914 if (Opc == ISD::SHL)
30915 LUT.push_back(LaneSplat.shl(I));
30916 else if (Opc == ISD::SRL)
30917 LUT.push_back(LaneSplat.lshr(I));
30918 else if (Opc == ISD::SRA)
30919 LUT.push_back(LaneSplat.ashr(I));
30920 }
30921 LUT.append(8, APInt::getZero(8));
30922 }
30923 if (LUT.size() == NumElts) {
30924 APInt Undefs = APInt::getSplat(NumElts, APInt(16, 0xFF00));
30925 SDValue Mask = getConstVector(LUT, Undefs, VT, DAG, dl);
30926 return DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
30927 }
30928 }
30929
30930 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30931 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30932 // make the existing SSE solution better.
30933  // NOTE: We honor the preferred vector width before promoting to 512-bits.
30934 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30935 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30936 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30937 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30938 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30939 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30940 "Unexpected vector type");
30941 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30942 MVT ExtVT = MVT::getVectorVT(EvtSVT, NumElts);
30943 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30944 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30945 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30946 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30947 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30948 }
30949
30950 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30951 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
30952 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30953 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30954 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30955 !Subtarget.hasXOP()) {
30956 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30957 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30958
30959 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30960 // isn't legal).
30961 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30962 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30963 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30964 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30965    assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
30966           "Constant build vector expected");
30967
30968 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30969 bool IsSigned = Opc == ISD::SRA;
30970 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
30971 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30972 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30973 return DAG.getZExtOrTrunc(R, dl, VT);
30974 }
30975
30976 SmallVector<SDValue, 16> LoAmt, HiAmt;
30977 for (unsigned i = 0; i != NumElts; i += 16) {
30978 for (int j = 0; j != 8; ++j) {
30979 LoAmt.push_back(Amt.getOperand(i + j));
30980 HiAmt.push_back(Amt.getOperand(i + j + 8));
30981 }
30982 }
30983
30984 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30985 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
30986
30987 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
30988 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
30989 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
30990 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
30991 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
30992 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
30993 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
30994 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
30995 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
30996 }
30997
30998 if (VT == MVT::v16i8 ||
30999 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
31000 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
31001 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31002
31003 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31004 if (VT.is512BitVector()) {
31005 // On AVX512BW targets we make use of the fact that VSELECT lowers
31006 // to a masked blend which selects bytes based just on the sign bit
31007 // extracted to a mask.
31008 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31009 V0 = DAG.getBitcast(VT, V0);
31010 V1 = DAG.getBitcast(VT, V1);
31011 Sel = DAG.getBitcast(VT, Sel);
31012 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31013 ISD::SETGT);
31014 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31015 } else if (Subtarget.hasSSE41()) {
31016 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31017 // on the sign bit.
31018 V0 = DAG.getBitcast(VT, V0);
31019 V1 = DAG.getBitcast(VT, V1);
31020 Sel = DAG.getBitcast(VT, Sel);
31021 return DAG.getBitcast(SelVT,
31022 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31023 }
31024 // On pre-SSE41 targets we test for the sign bit by comparing to
31025 // zero - a negative value will set all bits of the lanes to true
31026 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31027 SDValue Z = DAG.getConstant(0, dl, SelVT);
31028 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31029 return DAG.getSelect(dl, SelVT, C, V0, V1);
31030 };
31031
31032 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31033 // We can safely do this using i16 shifts as we're only interested in
31034 // the 3 lower bits of each byte.
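    // After the << 5, bit 2 of each shift amount sits in its byte's sign bit,
    // so the first blend applies the shift-by-4 step exactly where that bit
    // is set; each 'a += a' below moves the next lower bit into the sign bit
    // for the shift-by-2 and shift-by-1 steps.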
31035 Amt = DAG.getBitcast(ExtVT, Amt);
31036 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31037 Amt = DAG.getBitcast(VT, Amt);
31038
31039 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31040 // r = VSELECT(r, shift(r, 4), a);
31041 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31042 R = SignBitSelect(VT, Amt, M, R);
31043
31044 // a += a
31045 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31046
31047 // r = VSELECT(r, shift(r, 2), a);
31048 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31049 R = SignBitSelect(VT, Amt, M, R);
31050
31051 // a += a
31052 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31053
31054 // return VSELECT(r, shift(r, 1), a);
31055 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31056 R = SignBitSelect(VT, Amt, M, R);
31057 return R;
31058 }
31059
31060 if (Opc == ISD::SRA) {
31061 // For SRA we need to unpack each byte to the higher byte of a i16 vector
31062 // so we can correctly sign extend. We don't care what happens to the
31063 // lower byte.
31064 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31065 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31066 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31067 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31068 ALo = DAG.getBitcast(ExtVT, ALo);
31069 AHi = DAG.getBitcast(ExtVT, AHi);
31070 RLo = DAG.getBitcast(ExtVT, RLo);
31071 RHi = DAG.getBitcast(ExtVT, RHi);
31072
31073 // r = VSELECT(r, shift(r, 4), a);
31074 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31075 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31076 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31077 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31078
31079 // a += a
31080 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31081 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31082
31083 // r = VSELECT(r, shift(r, 2), a);
31084 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31085 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31086 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31087 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31088
31089 // a += a
31090 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31091 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31092
31093 // r = VSELECT(r, shift(r, 1), a);
31094 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31095 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31096 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31097 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31098
31099 // Logical shift the result back to the lower byte, leaving a zero upper
31100 // byte meaning that we can safely pack with PACKUSWB.
31101 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31102 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31103 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31104 }
31105 }
31106
31107 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31108 MVT ExtVT = MVT::v8i32;
31109 SDValue Z = DAG.getConstant(0, dl, VT);
31110 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31111 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31112 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31113 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31114 ALo = DAG.getBitcast(ExtVT, ALo);
31115 AHi = DAG.getBitcast(ExtVT, AHi);
31116 RLo = DAG.getBitcast(ExtVT, RLo);
31117 RHi = DAG.getBitcast(ExtVT, RHi);
31118 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31119 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31120 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31121 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31122 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31123 }
31124
31125 if (VT == MVT::v8i16) {
31126 // If we have a constant shift amount, the non-SSE41 path is best as
31127    // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
31128 bool UseSSE41 = Subtarget.hasSSE41() &&
31129                    !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31130
31131 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31132 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31133 // the sign bit.
31134 if (UseSSE41) {
31135 MVT ExtVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
31136 V0 = DAG.getBitcast(ExtVT, V0);
31137 V1 = DAG.getBitcast(ExtVT, V1);
31138 Sel = DAG.getBitcast(ExtVT, Sel);
31139 return DAG.getBitcast(
31140 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31141 }
31142 // On pre-SSE41 targets we splat the sign bit - a negative value will
31143 // set all bits of the lanes to true and VSELECT uses that in
31144 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31145 SDValue C =
31146 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31147 return DAG.getSelect(dl, VT, C, V0, V1);
31148 };
31149
31150 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31151 if (UseSSE41) {
31152 // On SSE41 targets we need to replicate the shift mask in both
31153 // bytes for PBLENDVB.
31154 Amt = DAG.getNode(
31155 ISD::OR, dl, VT,
31156 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31157 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31158 } else {
31159 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31160 }
31161
31162 // r = VSELECT(r, shift(r, 8), a);
31163 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31164 R = SignBitSelect(Amt, M, R);
31165
31166 // a += a
31167 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31168
31169 // r = VSELECT(r, shift(r, 4), a);
31170 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31171 R = SignBitSelect(Amt, M, R);
31172
31173 // a += a
31174 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31175
31176 // r = VSELECT(r, shift(r, 2), a);
31177 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31178 R = SignBitSelect(Amt, M, R);
31179
31180 // a += a
31181 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31182
31183 // return VSELECT(r, shift(r, 1), a);
31184 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31185 R = SignBitSelect(Amt, M, R);
31186 return R;
31187 }
31188
31189 // Decompose 256-bit shifts into 128-bit shifts.
31190 if (VT.is256BitVector())
31191 return splitVectorIntBinary(Op, DAG, dl);
31192
31193 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31194 return splitVectorIntBinary(Op, DAG, dl);
31195
31196 return SDValue();
31197}
31198
31199 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31200                                 SelectionDAG &DAG) {
31201 MVT VT = Op.getSimpleValueType();
31202 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31203 "Unexpected funnel shift opcode!");
31204
31205 SDLoc DL(Op);
31206 SDValue Op0 = Op.getOperand(0);
31207 SDValue Op1 = Op.getOperand(1);
31208 SDValue Amt = Op.getOperand(2);
31209 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31210 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31211
31212 if (VT.isVector()) {
31213 APInt APIntShiftAmt;
31214 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31215 unsigned NumElts = VT.getVectorNumElements();
31216
31217 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31218
31219 if (IsCstSplat) {
31220 if (IsFSHR)
31221 std::swap(Op0, Op1);
31222 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31223 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31224 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31225 {Op0, Op1, Imm}, DAG, Subtarget);
31226 }
31227 return getAVX512Node(IsFSHR ? ISD::FSHR : ISD::FSHL, DL, VT,
31228 {Op0, Op1, Amt}, DAG, Subtarget);
31229 }
31230 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31231 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31232 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31233 "Unexpected funnel shift type!");
31234
31235    // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
31236    // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
31237 if (IsCstSplat) {
31238 // TODO: Can't use generic expansion as UNDEF amt elements can be
31239 // converted to other values when folded to shift amounts, losing the
31240 // splat.
31241 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31242 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
31243 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
31244 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
31245 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31246
31247 if (EltSizeInBits == 8 &&
31248 (Subtarget.hasXOP() ||
31249 (useVPTERNLOG(Subtarget, VT) &&
31250 supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
31251 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
31252 // bit-select - lower using vXi16 shifts and then perform the bitmask at
31253 // the original vector width to handle cases where we split.
31254 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
31255 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
31256 SDValue ShX =
31257 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
31258 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
31259 SDValue ShY =
31260 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
31261 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
31262 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
31263 DAG.getConstant(MaskX, DL, VT));
31264 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
31265 DAG.getConstant(MaskY, DL, VT));
31266 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31267 }
31268
31269 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
31270 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
31271 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
31272 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
31273 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31274 }
31275
31276 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31277 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31278 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31279
31280 // Constant vXi16 funnel shifts can be efficiently handled by default.
31281 if (IsCst && EltSizeInBits == 16)
31282 return SDValue();
31283
31284 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31285 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31286 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31287
31288 // Split 256-bit integers on XOP/pre-AVX2 targets.
31289 // Split 512-bit integers on non 512-bit BWI targets.
31290 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31291 !Subtarget.hasAVX2())) ||
31292 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31293 EltSizeInBits < 32)) {
31294 // Pre-mask the amount modulo using the wider vector.
31295 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31296 return splitVectorOp(Op, DAG, DL);
31297 }
31298
31299 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31300 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31301 int ScalarAmtIdx = -1;
31302 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31303 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31304 if (EltSizeInBits == 16)
31305 return SDValue();
31306
31307 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31308 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31309 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31310 ScalarAmtIdx, Subtarget, DAG);
31311 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31312 ScalarAmtIdx, Subtarget, DAG);
31313 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31314 }
31315 }
31316
31317 MVT WideSVT = MVT::getIntegerVT(
31318 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31319 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31320
31321 // If per-element shifts are legal, fallback to generic expansion.
31322 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31323 return SDValue();
31324
31325 // Attempt to fold as:
31326 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31327 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31328 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31329 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31330 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31331 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31332 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31333 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31334 EltSizeInBits, DAG);
31335 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31336 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31337 if (!IsFSHR)
31338 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31339 EltSizeInBits, DAG);
31340 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31341 }
31342
31343 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31344 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31345 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31346 SDValue Z = DAG.getConstant(0, DL, VT);
31347 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31348 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31349 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31350 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31351 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31352 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31353 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31354 }
31355
31356 // Fallback to generic expansion.
31357 return SDValue();
31358 }
31359 assert(
31360 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31361 "Unexpected funnel shift type!");
31362
31363 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31364 bool OptForSize = DAG.shouldOptForSize();
31365 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31366
31367 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31368 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31369 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31370 !isa<ConstantSDNode>(Amt)) {
31371 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31372 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31373 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31374 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31375 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31376 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31377 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31378 if (IsFSHR) {
31379 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31380 } else {
31381 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31382 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31383 }
31384 return DAG.getZExtOrTrunc(Res, DL, VT);
31385 }
31386
31387 if (VT == MVT::i8 || ExpandFunnel)
31388 return SDValue();
31389
31390 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
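  // (Hardware SHLD/SHRD masks the count to 5 (or 6) bits, which is not the
  // same as modulo 16 for 16-bit operands, hence the explicit mask here.)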
31391 if (VT == MVT::i16) {
31392 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31393 DAG.getConstant(15, DL, Amt.getValueType()));
31394 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31395 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31396 }
31397
31398 return Op;
31399}
31400
31401static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31402 SelectionDAG &DAG) {
31403 MVT VT = Op.getSimpleValueType();
31404 assert(VT.isVector() && "Custom lowering only for vector rotates!");
31405
31406 SDLoc DL(Op);
31407 SDValue R = Op.getOperand(0);
31408 SDValue Amt = Op.getOperand(1);
31409 unsigned Opcode = Op.getOpcode();
31410 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31411 int NumElts = VT.getVectorNumElements();
31412 bool IsROTL = Opcode == ISD::ROTL;
31413
31414 // Check for constant splat rotation amount.
31415 APInt CstSplatValue;
31416 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31417
31418 // Check for splat rotate by zero.
31419 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31420 return R;
31421
31422 // AVX512 implicitly uses modulo rotation amounts.
31423 if ((Subtarget.hasVLX() || Subtarget.hasAVX512()) && 32 <= EltSizeInBits) {
31424 // Attempt to rotate by immediate.
31425 if (IsCstSplat) {
31426 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31427 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31428 return DAG.getNode(RotOpc, DL, VT, R,
31429 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31430 }
31431
31432 // Else, fall-back on VPROLV/VPRORV.
31433 return Op;
31434 }
31435
31436 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31437 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31438 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31439 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31440 }
31441
31442 SDValue Z = DAG.getConstant(0, DL, VT);
31443
31444 if (!IsROTL) {
31445 // If the ISD::ROTR amount is constant, we're always better off converting
31446 // to ISD::ROTL.
31447 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31448 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31449
31450 // XOP targets always prefer ISD::ROTL.
31451 if (Subtarget.hasXOP())
31452 return DAG.getNode(ISD::ROTL, DL, VT, R,
31453 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31454 }
31455
31456 // Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
31457 if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
31458 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
31459 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31460 SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt);
31461 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
31462 DAG.getTargetConstant(0, DL, MVT::i8));
31463 }
31464
31465 // Split 256-bit integers on XOP/pre-AVX2 targets.
31466 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31467 return splitVectorIntBinary(Op, DAG, DL);
31468
31469 // XOP has 128-bit vector variable + immediate rotates.
31470 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31471 // XOP implicitly uses modulo rotation amounts.
31472 if (Subtarget.hasXOP()) {
31473 assert(IsROTL && "Only ROTL expected");
31474 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31475
31476 // Attempt to rotate by immediate.
31477 if (IsCstSplat) {
31478 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31479 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31480 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31481 }
31482
31483 // Use general rotate by variable (per-element).
31484 return Op;
31485 }
31486
31487 // Rotate by a uniform constant - expand back to shifts.
31488 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
31489 // to other values when folded to shift amounts, losing the splat.
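// e.g. a vXi8 rotl by 3 expands to (x << 3) | (x >> 5).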
31490 if (IsCstSplat) {
31491 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31492 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
31493 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
31494 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
31495 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
31496 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
31497 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
31498 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
31499 }
31500
31501 // Split 512-bit integers on non 512-bit BWI targets.
31502 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31503 return splitVectorIntBinary(Op, DAG, DL);
31504
31505 assert(
31506 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31507 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31508 Subtarget.hasAVX2()) ||
31509 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31510 "Only vXi32/vXi16/vXi8 vector rotates supported");
31511
31512 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31513 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31514
31515 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31516 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31517
31518 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31519 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31520 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31521 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31522 int BaseRotAmtIdx = -1;
31523 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31524 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31525 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31526 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31527 }
31528 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31529 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31530 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31531 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31532 BaseRotAmtIdx, Subtarget, DAG);
31533 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31534 BaseRotAmtIdx, Subtarget, DAG);
31535 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31536 }
31537 }
31538
31539 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31540 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31541
31542 // Attempt to fold as unpack(x,x) << zext(y):
31543 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31544 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31545 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
31546 if (!(ConstantAmt && EltSizeInBits != 8) &&
31547 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
31548 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
31549 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31550 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31551 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31552 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31553 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31554 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31555 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31556 }
31557
31558 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31559 // the amount bit.
31560 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
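// e.g. a rotate amount of 6 (0b110) takes the rot4 and rot2 stages and skips
// the rot1 stage, giving a total rotation of 6.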
31561 if (EltSizeInBits == 8) {
31562 MVT WideVT =
31563 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31564
31565 // Attempt to fold as:
31566 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31567 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31568 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31569 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31570 // If we're rotating by constant, just use default promotion.
31571 if (ConstantAmt)
31572 return SDValue();
31573 // See if we can perform this by widening to vXi16 or vXi32.
31574 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31575 R = DAG.getNode(
31576 ISD::OR, DL, WideVT, R,
31577 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31578 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31579 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31580 if (IsROTL)
31581 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31582 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31583 }
31584
31585 // We don't need ModuloAmt here as we just peek at individual bits.
31586 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31587 if (Subtarget.hasSSE41()) {
31588 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31589 // on the sign bit.
31590 V0 = DAG.getBitcast(VT, V0);
31591 V1 = DAG.getBitcast(VT, V1);
31592 Sel = DAG.getBitcast(VT, Sel);
31593 return DAG.getBitcast(SelVT,
31594 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31595 }
31596 // On pre-SSE41 targets we test for the sign bit by comparing to
31597 // zero - a negative value will set all bits of the lanes to true
31598 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31599 SDValue Z = DAG.getConstant(0, DL, SelVT);
31600 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31601 return DAG.getSelect(DL, SelVT, C, V0, V1);
31602 };
31603
31604 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31605 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31606 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31607 IsROTL = true;
31608 }
31609
31610 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31611 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31612
31613 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31614 // We can safely do this using i16 shifts as we're only interested in
31615 // the 3 lower bits of each byte.
31616 Amt = DAG.getBitcast(ExtVT, Amt);
31617 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31618 Amt = DAG.getBitcast(VT, Amt);
31619
31620 // r = VSELECT(r, rot(r, 4), a);
31621 SDValue M;
31622 M = DAG.getNode(
31623 ISD::OR, DL, VT,
31624 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31625 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31626 R = SignBitSelect(VT, Amt, M, R);
31627
31628 // a += a
31629 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31630
31631 // r = VSELECT(r, rot(r, 2), a);
31632 M = DAG.getNode(
31633 ISD::OR, DL, VT,
31634 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31635 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31636 R = SignBitSelect(VT, Amt, M, R);
31637
31638 // a += a
31639 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31640
31641 // return VSELECT(r, rot(r, 1), a);
31642 M = DAG.getNode(
31643 ISD::OR, DL, VT,
31644 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31645 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31646 return SignBitSelect(VT, Amt, M, R);
31647 }
31648
31649 bool IsSplatAmt = DAG.isSplatValue(Amt);
31650 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31651 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31652
31653 // Fallback for splats + all supported variable shifts.
31654 // Fallback for non-constant AVX2 vXi16 as well.
31655 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31656 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31657 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31658 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31659 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31660 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31661 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31662 }
31663
31664 // Everything below assumes ISD::ROTL.
31665 if (!IsROTL) {
31666 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31667 IsROTL = true;
31668 }
31669
31670 // ISD::ROT* uses modulo rotate amounts.
31671 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31672
31673 assert(IsROTL && "Only ROTL supported");
31674
31675 // As with shifts, attempt to convert the rotation amount to a multiplication
31676 // factor, fallback to general expansion.
31677 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31678 if (!Scale)
31679 return SDValue();
31680
31681 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
31682 if (EltSizeInBits == 16) {
31683 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31684 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31685 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31686 }
31687
31688 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31689 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31690 // that can then be OR'd with the lower 32-bits.
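// e.g. for rotl(x,7) the scale is 128; the low 32 bits of each 64-bit product
// are x << 7 and the upper 32 bits are x >> 25, which OR to the rotate.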
31691 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31692 static const int OddMask[] = {1, 1, 3, 3};
31693 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31694 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31695
31696 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31697 DAG.getBitcast(MVT::v2i64, R),
31698 DAG.getBitcast(MVT::v2i64, Scale));
31699 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31700 DAG.getBitcast(MVT::v2i64, R13),
31701 DAG.getBitcast(MVT::v2i64, Scale13));
31702 Res02 = DAG.getBitcast(VT, Res02);
31703 Res13 = DAG.getBitcast(VT, Res13);
31704
31705 return DAG.getNode(ISD::OR, DL, VT,
31706 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31707 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31708}
31709
31710/// Returns true if the operand type is exactly twice the native width, and
31711/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31712/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31713/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31714bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31715 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31716
31717 if (OpWidth == 64)
31718 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31719 if (OpWidth == 128)
31720 return Subtarget.canUseCMPXCHG16B();
31721
31722 return false;
31723}
31724
31725TargetLowering::AtomicExpansionKind
31726X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31727 Type *MemType = SI->getValueOperand()->getType();
31728
31729 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31730 !Subtarget.useSoftFloat()) {
31731 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31732 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31733 return AtomicExpansionKind::None;
31734
31735 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31736 Subtarget.hasAVX())
31737 return AtomicExpansionKind::None;
31738 }
31739
31740 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31741 : AtomicExpansionKind::None;
31742}
31743
31744// Note: this turns large loads into lock cmpxchg8b/16b.
31745TargetLowering::AtomicExpansionKind
31746X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31747 Type *MemType = LI->getType();
31748
31749 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31750 !Subtarget.useSoftFloat()) {
31751 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
31752 // can use movq to do the load. If we have X87 we can load into an 80-bit
31753 // X87 register and store it to a stack temporary.
31754 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31755 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31756 return AtomicExpansionKind::None;
31757
31758 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
31759 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31760 Subtarget.hasAVX())
31761 return AtomicExpansionKind::None;
31762 }
31763
31764 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31765 : AtomicExpansionKind::None;
31766}
31767
31768enum BitTestKind : unsigned {
31769 UndefBit,
31770 ConstantBit,
31771 NotConstantBit,
31772 ShiftBit,
31773 NotShiftBit
31774};
31775
31776static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31777 using namespace llvm::PatternMatch;
31778 BitTestKind BTK = UndefBit;
31779 if (auto *C = dyn_cast<ConstantInt>(V)) {
31780 // Check if V is a power of 2 or NOT power of 2.
31781 if (isPowerOf2_64(C->getZExtValue()))
31782 BTK = ConstantBit;
31783 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31784 BTK = NotConstantBit;
31785 return {V, BTK};
31786 }
31787
31788 // Check if V is some power of 2 pattern known to be non-zero
31789 if (auto *I = dyn_cast<Instruction>(V)) {
31790 bool Not = false;
31791 // Check if we have a NOT
31792 Value *PeekI;
31793 if (match(I, m_Not(m_Value(PeekI))) ||
31794 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31795 Not = true;
31796 I = dyn_cast<Instruction>(PeekI);
31797
31798 // If I is constant, it will fold and we can evaluate later. If it's an
31799 // argument or something of that nature, we can't analyze.
31800 if (I == nullptr)
31801 return {nullptr, UndefBit};
31802 }
31803 // We can only use 1 << X without more sophisticated analysis. C << X where
31804 // C is a power of 2 but not 1 can result in zero which cannot be translated
31805 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
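// e.g. (1 << n) always has exactly one bit set, but (4 << 30) on i32 shifts
// the bit out entirely and yields 0, which has no bit left to test.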
31806 if (I->getOpcode() == Instruction::Shl) {
31807 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31808 // -X` and some other provable power of 2 patterns that we can use CTZ on
31809 // may be profitable.
31810 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31811 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31812 // be provably a non-zero power of 2.
31813 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31814 // transformable to bittest.
31815 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31816 if (!ShiftVal)
31817 return {nullptr, UndefBit};
31818 if (ShiftVal->equalsInt(1))
31819 BTK = Not ? NotShiftBit : ShiftBit;
31820
31821 if (BTK == UndefBit)
31822 return {nullptr, UndefBit};
31823
31824 Value *BitV = I->getOperand(1);
31825
31826 // Read past a shiftmask instruction to find count
31827 Value *AndOp;
31828 uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1;
31829 if (match(BitV, m_c_And(m_Value(AndOp), m_SpecificInt(ShiftMask))))
31830 BitV = AndOp;
31831
31832 return {BitV, BTK};
31833 }
31834 }
31835 return {nullptr, UndefBit};
31836}
31837
31838TargetLowering::AtomicExpansionKind
31839X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31840 using namespace llvm::PatternMatch;
31841 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31842 // prefix to a normal instruction for these operations.
31843 if (AI->use_empty())
31844 return AtomicExpansionKind::None;
31845
31846 if (AI->getOperation() == AtomicRMWInst::Xor) {
31847 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
31848 // preferable to both `cmpxchg` and `btc`.
31849 if (match(AI->getOperand(1), m_SignMask()))
31850 return AtomicExpansionKind::None;
31851 }
31852
31853 // If the atomicrmw's result is used by a single-bit AND, we may use a
31854 // bts/btr/btc instruction for these operations.
31855 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31856 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31857 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31858 // detect it.
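// e.g. an atomicrmw or of (1 << n) whose result is only ANDed with (1 << n)
// can become a lock bts that returns the original value of that bit.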
31859 Instruction *I = AI->user_back();
31860 auto BitChange = FindSingleBitChange(AI->getValOperand());
31861 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31862 I->getOpcode() != Instruction::And ||
31863 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31864 AI->getParent() != I->getParent())
31865 return AtomicExpansionKind::CmpXChg;
31866
31867 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31868
31869 // This is a redundant AND, it should get cleaned up elsewhere.
31870 if (AI == I->getOperand(OtherIdx))
31871 return AtomicExpansionKind::CmpXChg;
31872
31873 // The following instruction must be an AND with a single bit.
31874 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31875 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31876 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31877 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31878 return AtomicExpansionKind::CmpXChg;
31879 }
31880 if (AI->getOperation() == AtomicRMWInst::And) {
31881 return ~C1->getValue() == C2->getValue()
31882 ? AtomicExpansionKind::BitTestIntrinsic
31883 : AtomicExpansionKind::CmpXChg;
31884 }
31885 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31886 : AtomicExpansionKind::CmpXChg;
31887 }
31888
31889 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31890
31891 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31892 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31893 return AtomicExpansionKind::CmpXChg;
31894
31895 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31896
31897 // If shift amounts are not the same we can't use BitTestIntrinsic.
31898 if (BitChange.first != BitTested.first)
31899 return AtomicExpansionKind::CmpXChg;
31900
31901 // For an atomic AND, the value operand must mask all but one bit and the
31902 // user AND must test the one bit that is unset in the mask.
31903 if (AI->getOperation() == AtomicRMWInst::And)
31904 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31905 ? AtomicExpansionKind::BitTestIntrinsic
31906 : AtomicExpansionKind::CmpXChg;
31907
31908 // For an atomic XOR/OR, the RMW must set and the user AND must test the same bit.
31909 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31910 ? AtomicExpansionKind::BitTestIntrinsic
31911 : AtomicExpansionKind::CmpXChg;
31912}
31913
31914void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31915 IRBuilder<> Builder(AI);
31916 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31917 Intrinsic::ID IID_C;
31918 Intrinsic::ID IID_I;
31919 switch (AI->getOperation()) {
31920 default:
31921 llvm_unreachable("Unknown atomic operation");
31922 case AtomicRMWInst::Or:
31923 IID_C = Intrinsic::x86_atomic_bts;
31924 IID_I = Intrinsic::x86_atomic_bts_rm;
31925 break;
31926 case AtomicRMWInst::Xor:
31927 IID_C = Intrinsic::x86_atomic_btc;
31928 IID_I = Intrinsic::x86_atomic_btc_rm;
31929 break;
31930 case AtomicRMWInst::And:
31931 IID_C = Intrinsic::x86_atomic_btr;
31932 IID_I = Intrinsic::x86_atomic_btr_rm;
31933 break;
31934 }
31935 Instruction *I = AI->user_back();
31936 LLVMContext &Ctx = AI->getContext();
31937 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31938 PointerType::getUnqual(Ctx));
31939 Value *Result = nullptr;
31940 auto BitTested = FindSingleBitChange(AI->getValOperand());
31941 assert(BitTested.first != nullptr);
31942
31943 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31944 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31945
31946 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31947 Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
31948 {Addr, Builder.getInt8(Imm)});
31949 } else {
31950 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31951
31952 Value *SI = BitTested.first;
31953 assert(SI != nullptr);
31954
31955 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we
31956 // need to mask it.
31957 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31958 Value *BitPos =
31959 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31960 // Todo(1): In many cases it may be provable that SI is less than
31961 // ShiftBits in which case this mask is unnecessary
31962 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31963 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31964 // favor of just a raw BT{S|R|C}.
31965
31966 Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
31967 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31968
31969 // If the result is only used for zero/non-zero status then we don't need
31970 // to shift the value back. Otherwise do so.
31971 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31972 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31973 if (ICmp->isEquality()) {
31974 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31975 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31976 if (C0 || C1) {
31977 assert(C0 == nullptr || C1 == nullptr);
31978 if ((C0 ? C0 : C1)->isZero())
31979 continue;
31980 }
31981 }
31982 }
31983 Result = Builder.CreateShl(Result, BitPos);
31984 break;
31985 }
31986 }
31987
31988 I->replaceAllUsesWith(Result);
31989 I->eraseFromParent();
31990 AI->eraseFromParent();
31991}
31992
31993static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
31994 using namespace llvm::PatternMatch;
31995 if (!AI->hasOneUse())
31996 return false;
31997
31998 Value *Op = AI->getOperand(1);
31999 CmpPredicate Pred;
32000 Instruction *I = AI->user_back();
32001 AtomicRMWInst::BinOp Opc = AI->getOperation();
32002 if (Opc == AtomicRMWInst::Add) {
32003 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
32004 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32005 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
32006 if (match(I->user_back(),
32007 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32008 return true;
32009 if (match(I->user_back(),
32010 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32011 return true;
32012 }
32013 return false;
32014 }
32015 if (Opc == AtomicRMWInst::Sub) {
32016 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32017 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32018 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32019 if (match(I->user_back(),
32020 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32021 return true;
32022 if (match(I->user_back(),
32023 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32024 return true;
32025 }
32026 return false;
32027 }
32028 if ((Opc == AtomicRMWInst::Or &&
32029 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
32030 (Opc == AtomicRMWInst::And &&
32031 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
32032 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32033 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32034 Pred == CmpInst::ICMP_SLT;
32035 if (match(I->user_back(),
32036 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32037 return true;
32038 return false;
32039 }
32040 if (Opc == AtomicRMWInst::Xor) {
32041 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32042 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32043 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32044 if (match(I->user_back(),
32045 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32046 return true;
32047 if (match(I->user_back(),
32048 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32049 return true;
32050 }
32051 return false;
32052 }
32053
32054 return false;
32055}
32056
32057void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32058 AtomicRMWInst *AI) const {
32059 IRBuilder<> Builder(AI);
32060 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32061 Instruction *TempI = nullptr;
32062 LLVMContext &Ctx = AI->getContext();
32063 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32064 if (!ICI) {
32065 TempI = AI->user_back();
32066 assert(TempI->hasOneUse() && "Must have one use");
32067 ICI = cast<ICmpInst>(TempI->user_back());
32068 }
32069 X86::CondCode CC = X86::COND_INVALID;
32070 ICmpInst::Predicate Pred = ICI->getPredicate();
32071 switch (Pred) {
32072 default:
32073 llvm_unreachable("Not supported Pred");
32074 case CmpInst::ICMP_EQ:
32075 CC = X86::COND_E;
32076 break;
32077 case CmpInst::ICMP_NE:
32078 CC = X86::COND_NE;
32079 break;
32080 case CmpInst::ICMP_SLT:
32081 CC = X86::COND_S;
32082 break;
32083 case CmpInst::ICMP_SGT:
32084 CC = X86::COND_NS;
32085 break;
32086 }
32087 Intrinsic::ID IID = Intrinsic::not_intrinsic;
32088 switch (AI->getOperation()) {
32089 default:
32090 llvm_unreachable("Unknown atomic operation");
32091 case AtomicRMWInst::Add:
32092 IID = Intrinsic::x86_atomic_add_cc;
32093 break;
32094 case AtomicRMWInst::Sub:
32095 IID = Intrinsic::x86_atomic_sub_cc;
32096 break;
32097 case AtomicRMWInst::Or:
32098 IID = Intrinsic::x86_atomic_or_cc;
32099 break;
32100 case AtomicRMWInst::And:
32101 IID = Intrinsic::x86_atomic_and_cc;
32102 break;
32103 case AtomicRMWInst::Xor:
32104 IID = Intrinsic::x86_atomic_xor_cc;
32105 break;
32106 }
32107 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32108 PointerType::getUnqual(Ctx));
32109 Value *Call = Builder.CreateIntrinsic(
32110 IID, AI->getType(),
32111 {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32112 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32113 ICI->replaceAllUsesWith(Result);
32114 ICI->eraseFromParent();
32115 if (TempI)
32116 TempI->eraseFromParent();
32117 AI->eraseFromParent();
32118}
32119
32120TargetLowering::AtomicExpansionKind
32121X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32122 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32123 Type *MemType = AI->getType();
32124
32125 // If the operand is too big, we must see if cmpxchg8/16b is available
32126 // and default to library calls otherwise.
32127 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32128 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32129 : AtomicExpansionKind::None;
32130 }
32131
32132 AtomicRMWInst::BinOp Op = AI->getOperation();
32133 switch (Op) {
32134 case AtomicRMWInst::Xchg:
32135 return AtomicExpansionKind::None;
32136 case AtomicRMWInst::Add:
32137 case AtomicRMWInst::Sub:
32138 if (shouldExpandCmpArithRMWInIR(AI))
32139 return AtomicExpansionKind::CmpArithIntrinsic;
32140 // It's better to use xadd, xsub or xchg for these in other cases.
32141 return AtomicExpansionKind::None;
32142 case AtomicRMWInst::Or:
32143 case AtomicRMWInst::And:
32144 case AtomicRMWInst::Xor:
32145 if (shouldExpandCmpArithRMWInIR(AI))
32146 return AtomicExpansionKind::CmpArithIntrinsic;
32147 return shouldExpandLogicAtomicRMWInIR(AI);
32148 case AtomicRMWInst::Nand:
32149 case AtomicRMWInst::Max:
32150 case AtomicRMWInst::Min:
32161 default:
32162 // These always require a non-trivial set of data operations on x86. We must
32163 // use a cmpxchg loop.
32164 return AtomicExpansionKind::CmpXChg;
32165 }
32166}
32167
32168LoadInst *
32169X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32170 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32171 Type *MemType = AI->getType();
32172 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32173 // there is no benefit in turning such RMWs into loads, and it is actually
32174 // harmful as it introduces an mfence.
32175 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32176 return nullptr;
32177
32178 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32179 // lowering available in lowerAtomicArith.
32180 // TODO: push more cases through this path.
32181 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32182 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32183 AI->use_empty())
32184 return nullptr;
32185
32186 IRBuilder<> Builder(AI);
32187 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32188 auto SSID = AI->getSyncScopeID();
32189 // We must restrict the ordering to avoid generating loads with Release or
32190 // ReleaseAcquire orderings.
32191 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32192
32193 // Before the load we need a fence. Here is an example lifted from
32194 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32195 // is required:
32196 // Thread 0:
32197 // x.store(1, relaxed);
32198 // r1 = y.fetch_add(0, release);
32199 // Thread 1:
32200 // y.fetch_add(42, acquire);
32201 // r2 = x.load(relaxed);
32202 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32203 // lowered to just a load without a fence. A mfence flushes the store buffer,
32204 // making the optimization clearly correct.
32205 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
32206 // otherwise, we might be able to be more aggressive on relaxed idempotent
32207 // rmw. In practice, they do not look useful, so we don't try to be
32208 // especially clever.
32209
32210 // Use `fence seq_cst` over `llvm.x64.sse2.mfence` here to get the correct
32211 // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
32212 Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
32213
32214 // Finally we can emit the atomic load.
32215 LoadInst *Loaded = Builder.CreateAlignedLoad(
32216 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32217 Loaded->setAtomic(Order, SSID);
32218 AI->replaceAllUsesWith(Loaded);
32219 AI->eraseFromParent();
32220 return Loaded;
32221}
32222
32223/// Emit a locked operation on a stack location which does not change any
32224/// memory location, but does involve a lock prefix. Location is chosen to be
32225/// a) very likely accessed only by a single thread to minimize cache traffic,
32226/// and b) definitely dereferenceable. Returns the new Chain result.
32227static SDValue emitLockedStackOp(SelectionDAG &DAG,
32228 const X86Subtarget &Subtarget, SDValue Chain,
32229 const SDLoc &DL) {
32230 // Implementation notes:
32231 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32232 // operations issued by the current processor. As such, the location
32233 // referenced is not relevant for the ordering properties of the instruction.
32234 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32235 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32236 // 2) Using an immediate operand appears to be the best encoding choice
32237 // here since it doesn't require an extra register.
32238 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32239 // is small enough it might just be measurement noise.)
32240 // 4) When choosing offsets, there are several contributing factors:
32241 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32242 // line aligned stack object to improve this case.)
32243 // b) To minimize our chances of introducing a false dependence, we prefer
32244 // to offset the stack usage from TOS slightly.
32245 // c) To minimize concerns about cross thread stack usage - in particular,
32246 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32247 // captures state in the TOS frame and accesses it from many threads -
32248 // we want to use an offset such that the offset is in a distinct cache
32249 // line from the TOS frame.
32250 //
32251 // For a general discussion of the tradeoffs and benchmark results, see:
32252 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
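// e.g. on x86-64 with a 128-byte red zone this lowers to "lock orl $0, -64(%rsp)";
// without one it targets the top of stack ((%rsp) / (%esp)) directly.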
32253
32254 auto &MF = DAG.getMachineFunction();
32255 auto &TFL = *Subtarget.getFrameLowering();
32256 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32257
32258 if (Subtarget.is64Bit()) {
32259 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32260 SDValue Ops[] = {
32261 DAG.getRegister(X86::RSP, MVT::i64), // Base
32262 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32263 DAG.getRegister(0, MVT::i64), // Index
32264 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32265 DAG.getRegister(0, MVT::i16), // Segment.
32266 Zero,
32267 Chain};
32268 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32269 MVT::Other, Ops);
32270 return SDValue(Res, 1);
32271 }
32272
32273 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32274 SDValue Ops[] = {
32275 DAG.getRegister(X86::ESP, MVT::i32), // Base
32276 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32277 DAG.getRegister(0, MVT::i32), // Index
32278 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32279 DAG.getRegister(0, MVT::i16), // Segment.
32280 Zero,
32281 Chain
32282 };
32283 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32284 MVT::Other, Ops);
32285 return SDValue(Res, 1);
32286}
32287
32288static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32289 SelectionDAG &DAG) {
32290 SDLoc dl(Op);
32291 AtomicOrdering FenceOrdering =
32292 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32293 SyncScope::ID FenceSSID =
32294 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32295
32296 // The only fence that needs an instruction is a sequentially-consistent
32297 // cross-thread fence.
32298 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32299 FenceSSID == SyncScope::System) {
32300 if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
32301 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32302
32303 SDValue Chain = Op.getOperand(0);
32304 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32305 }
32306
32307 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32308 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32309}
32310
32311static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32312 SelectionDAG &DAG) {
32313 MVT T = Op.getSimpleValueType();
32314 SDLoc DL(Op);
32315 unsigned Reg = 0;
32316 unsigned size = 0;
32317 switch(T.SimpleTy) {
32318 default: llvm_unreachable("Invalid value type!");
32319 case MVT::i8: Reg = X86::AL; size = 1; break;
32320 case MVT::i16: Reg = X86::AX; size = 2; break;
32321 case MVT::i32: Reg = X86::EAX; size = 4; break;
32322 case MVT::i64:
32323 assert(Subtarget.is64Bit() && "Node not type legal!");
32324 Reg = X86::RAX; size = 8;
32325 break;
32326 }
32327 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32328 Op.getOperand(2), SDValue());
32329 SDValue Ops[] = { cpIn.getValue(0),
32330 Op.getOperand(1),
32331 Op.getOperand(3),
32332 DAG.getTargetConstant(size, DL, MVT::i8),
32333 cpIn.getValue(1) };
32334 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32335 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32336 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32337 Ops, T, MMO);
32338
32339 SDValue cpOut =
32340 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32341 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32342 MVT::i32, cpOut.getValue(2));
32343 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32344
32345 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32346 cpOut, Success, EFLAGS.getValue(1));
32347}
32348
32349// Create MOVMSKB, taking into account whether we need to split for AVX1.
32350static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32351 const X86Subtarget &Subtarget) {
32352 MVT InVT = V.getSimpleValueType();
32353
32354 if (InVT == MVT::v64i8) {
32355 SDValue Lo, Hi;
32356 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32357 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32358 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32359 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32360 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32361 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32362 DAG.getConstant(32, DL, MVT::i8));
32363 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32364 }
32365 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32366 SDValue Lo, Hi;
32367 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32368 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32369 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32370 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32371 DAG.getConstant(16, DL, MVT::i8));
32372 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32373 }
32374
32375 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32376}
32377
32378static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32379 SelectionDAG &DAG) {
32380 SDValue Src = Op.getOperand(0);
32381 MVT SrcVT = Src.getSimpleValueType();
32382 MVT DstVT = Op.getSimpleValueType();
32383
32384 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32385 // half to v32i1 and concatenating the result.
32386 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32387 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32388 assert(Subtarget.hasBWI() && "Expected BWI target");
32389 SDLoc dl(Op);
32390 SDValue Lo, Hi;
32391 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32392 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32393 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32394 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32395 }
32396
32397 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32398 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32399 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32400 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32401 SDLoc DL(Op);
32402 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32403 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32404 return DAG.getZExtOrTrunc(V, DL, DstVT);
32405 }
32406
32407 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32408 SrcVT == MVT::i64) && "Unexpected VT!");
32409
32410 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32411 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32412 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32413 // This conversion needs to be expanded.
32414 return SDValue();
32415
32416 SDLoc dl(Op);
32417 if (SrcVT.isVector()) {
32418 // Widen the input vector in the case of MVT::v2i32.
32419 // Example: from MVT::v2i32 to MVT::v4i32.
32420 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32421 SrcVT.getVectorNumElements() * 2);
32422 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32423 DAG.getUNDEF(SrcVT));
32424 } else {
32425 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32426 "Unexpected source type in LowerBITCAST");
32427 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32428 }
32429
32430 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32431 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32432
32433 if (DstVT == MVT::x86mmx)
32434 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32435
32436 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32437 DAG.getVectorIdxConstant(0, dl));
32438}
32439
32440/// Compute the horizontal sum of bytes in V for the elements of VT.
32441///
32442/// Requires V to be a byte vector and VT to be an integer vector type with
32443/// wider elements than V's type. The width of the elements of VT determines
32444/// how many bytes of V are summed horizontally to produce each element of the
32445/// result.
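/// For example, summing a v16i8 value into a v4i32 result adds each group of
/// four adjacent bytes into one i32 element.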
32446static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32447 const X86Subtarget &Subtarget,
32448 SelectionDAG &DAG) {
32449 SDLoc DL(V);
32450 MVT ByteVecVT = V.getSimpleValueType();
32451 MVT EltVT = VT.getVectorElementType();
32452 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32453 "Expected value to have byte element type.");
32454 assert(EltVT != MVT::i8 &&
32455 "Horizontal byte sum only makes sense for wider elements!");
32456 unsigned VecSize = VT.getSizeInBits();
32457 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32458
32459 // The PSADBW instruction horizontally adds all bytes and leaves the result
32460 // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
32461 if (EltVT == MVT::i64) {
32462 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32463 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32464 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32465 return DAG.getBitcast(VT, V);
32466 }
32467
32468 if (EltVT == MVT::i32) {
32469 // We unpack the low half and high half into i32s interleaved with zeros so
32470 // that we can use PSADBW to horizontally sum them. The most useful part of
32471 // this is that it lines up the results of two PSADBW instructions to be
32472 // two v2i64 vectors which concatenated are the 4 population counts. We can
32473 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32474 SDValue Zeros = DAG.getConstant(0, DL, VT);
32475 SDValue V32 = DAG.getBitcast(VT, V);
32476 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32477 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32478
32479 // Do the horizontal sums into two v2i64s.
32480 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32481 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32482 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32483 DAG.getBitcast(ByteVecVT, Low), Zeros);
32484 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32485 DAG.getBitcast(ByteVecVT, High), Zeros);
32486
32487 // Merge them together.
32488 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32489 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32490 DAG.getBitcast(ShortVecVT, Low),
32491 DAG.getBitcast(ShortVecVT, High));
32492
32493 return DAG.getBitcast(VT, V);
32494 }
32495
32496 // The only element type left is i16.
32497 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32498
32499 // To obtain pop count for each i16 element starting from the pop count for
32500 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
32501 // right by 8. It is important to shift as i16s as i8 vector shift isn't
32502 // directly supported.
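// e.g. an i16 lane holding the byte counts 0x02:0x03 becomes 0x0300 after the
// shift, 0x0503 after the i8 add, and 0x0005 after the final i16 shift.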
32503 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32504 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32505 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32506 DAG.getBitcast(ByteVecVT, V));
32507 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32508}
32509
32510static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32511 const X86Subtarget &Subtarget,
32512 SelectionDAG &DAG) {
32513 MVT VT = Op.getSimpleValueType();
32514 MVT EltVT = VT.getVectorElementType();
32515 int NumElts = VT.getVectorNumElements();
32516 (void)EltVT;
32517 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32518
32519 // Implement a lookup table in register by using an algorithm based on:
32520 // http://wm.ite.pl/articles/sse-popcount.html
32521 //
32522 // The general idea is that every lower byte nibble in the input vector is an
32523 // index into an in-register pre-computed pop count table. We then split up
32524 // the input vector into two new ones: (1) a vector with only the shifted-right
32525 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32526 // masked out higher ones) for each byte. PSHUFB is used separately with both
32527 // to index the in-register table. Next, both are added and the result is an
32528 // i8 vector where each element contains the pop count for its input byte.
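// e.g. for the input byte 0xA7 the high nibble 0xA looks up 2, the low nibble
// 0x7 looks up 3, and the sum 5 equals popcount(0xA7).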
32529 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32530 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32531 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32532 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32533
32534 SmallVector<SDValue, 64> LUTVec;
32535 for (int i = 0; i < NumElts; ++i)
32536 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32537 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32538 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32539
32540 // High nibbles
32541 SDValue FourV = DAG.getConstant(4, DL, VT);
32542 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32543
32544 // Low nibbles
32545 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32546
32547 // The input vector is used as the shuffle mask that indexes elements into the
32548 // LUT. After counting low and high nibbles, add the vector to obtain the
32549 // final pop count per i8 element.
32550 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32551 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32552 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32553}
32554
32555// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32556// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32557static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
32558 const X86Subtarget &Subtarget,
32559 SelectionDAG &DAG) {
32560 MVT VT = Op.getSimpleValueType();
32561 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32562 "Unknown CTPOP type to handle");
32563 SDValue Op0 = Op.getOperand(0);
32564
32565 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32566 if (Subtarget.hasVPOPCNTDQ()) {
32567 unsigned NumElems = VT.getVectorNumElements();
32568 assert((VT.getVectorElementType() == MVT::i8 ||
32569 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32570 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32571 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32572 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32573 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32574 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32575 }
32576 }
32577
32578 // Decompose 256-bit ops into smaller 128-bit ops.
32579 if (VT.is256BitVector() && !Subtarget.hasInt256())
32580 return splitVectorIntUnary(Op, DAG, DL);
32581
32582 // Decompose 512-bit ops into smaller 256-bit ops.
32583 if (VT.is512BitVector() && !Subtarget.hasBWI())
32584 return splitVectorIntUnary(Op, DAG, DL);
32585
32586 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32587 if (VT.getScalarType() != MVT::i8) {
32588 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32589 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32590 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32591 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32592 }
32593
32594 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32595 if (!Subtarget.hasSSSE3())
32596 return SDValue();
32597
32598 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32599}
32600
32601static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
32602 SelectionDAG &DAG) {
32603 MVT VT = N.getSimpleValueType();
32604 SDValue Op = N.getOperand(0);
32605 SDLoc DL(N);
32606
32607 if (VT.isScalarInteger()) {
32608 // Compute the lower/upper bounds of the active bits of the value,
32609 // allowing us to shift the active bits down if necessary to fit into the
32610 // special cases below.
32611 KnownBits Known = DAG.computeKnownBits(Op);
32612 if (Known.isConstant())
32613 return DAG.getConstant(Known.getConstant().popcount(), DL, VT);
32614 unsigned LZ = Known.countMinLeadingZeros();
32615 unsigned TZ = Known.countMinTrailingZeros();
32616 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
32617 unsigned ActiveBits = Known.getBitWidth() - LZ;
32618 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
32619
32620 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
32621 if (ShiftedActiveBits <= 2) {
32622 if (ActiveBits > 2)
32623 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32624 DAG.getShiftAmountConstant(TZ, VT, DL));
32625 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32626 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
32627 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32628 DAG.getShiftAmountConstant(1, VT, DL)));
32629 return DAG.getZExtOrTrunc(Op, DL, VT);
32630 }
32631
32632 // i3 CTPOP - perform LUT into i32 integer.
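// e.g. the 16-bit constant packs the popcounts of 0..7 as 2-bit fields, so for
// x = 5 the lookup (LUT >> 10) & 0x3 returns 2.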
32633 if (ShiftedActiveBits <= 3) {
32634 if (ActiveBits > 3)
32635 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32636 DAG.getShiftAmountConstant(TZ, VT, DL));
32637 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32638 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
32639 DAG.getShiftAmountConstant(1, VT, DL));
32640 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
32641 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
32642 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
32643 DAG.getConstant(0x3, DL, MVT::i32));
32644 return DAG.getZExtOrTrunc(Op, DL, VT);
32645 }
32646
32647 // i4 CTPOP - perform LUT into i64 integer.
32648 if (ShiftedActiveBits <= 4 &&
32649 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
32650 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
32651 if (ActiveBits > 4)
32652 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32653 DAG.getShiftAmountConstant(TZ, VT, DL));
32654 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32655 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32656 DAG.getConstant(4, DL, MVT::i32));
32657 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
32658 DAG.getShiftAmountOperand(MVT::i64, Op));
32659 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
32660 DAG.getConstant(0x7, DL, MVT::i64));
32661 return DAG.getZExtOrTrunc(Op, DL, VT);
32662 }
32663
32664 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
32665 if (ShiftedActiveBits <= 8) {
32666 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
32667 if (ActiveBits > 8)
32668 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32669 DAG.getShiftAmountConstant(TZ, VT, DL));
32670 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32671 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32672 DAG.getConstant(0x08040201U, DL, MVT::i32));
32673 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32674 DAG.getShiftAmountConstant(3, MVT::i32, DL));
32675 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
32676 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
32677 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32678 DAG.getShiftAmountConstant(28, MVT::i32, DL));
32679 return DAG.getZExtOrTrunc(Op, DL, VT);
32680 }
32681
32682 return SDValue(); // fallback to generic expansion.
32683 }
32684
32685 assert(VT.isVector() &&
32686 "We only do custom lowering for vector population count.");
32687 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
32688}
32689
32690static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32691 MVT VT = Op.getSimpleValueType();
32692 SDValue In = Op.getOperand(0);
32693 SDLoc DL(Op);
32694
32695 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32696 // perform the BITREVERSE.
32697 if (!VT.isVector()) {
32698 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32699 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32700 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32701 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32702 DAG.getVectorIdxConstant(0, DL));
32703 }
32704
32705 int NumElts = VT.getVectorNumElements();
32706 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32707
32708 // Decompose 256-bit ops into smaller 128-bit ops.
32709 if (VT.is256BitVector())
32710 return splitVectorIntUnary(Op, DAG, DL);
32711
32712 assert(VT.is128BitVector() &&
32713 "Only 128-bit vector bitreverse lowering supported.");
32714
32715 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32716 // perform the BSWAP in the shuffle.
32717 // It's best to shuffle using the second operand as this will implicitly allow
32718 // memory folding for multiple vectors.
32719 SmallVector<SDValue, 16> MaskElts;
32720 for (int i = 0; i != NumElts; ++i) {
32721 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32722 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32723 int PermuteByte = SourceByte | (2 << 5);
32724 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32725 }
32726 }
32727
32728 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32729 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32730 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32731 Res, Mask);
32732 return DAG.getBitcast(VT, Res);
32733}
32734
32735static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
32736 SelectionDAG &DAG) {
32737 MVT VT = Op.getSimpleValueType();
32738
32739 if (Subtarget.hasXOP() && !VT.is512BitVector())
32740 return LowerBITREVERSE_XOP(Op, DAG);
32741
32742 assert((Subtarget.hasSSSE3() || Subtarget.hasGFNI()) &&
32743 "SSSE3 or GFNI required for BITREVERSE");
32744
32745 SDValue In = Op.getOperand(0);
32746 SDLoc DL(Op);
32747
32748 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
32749 if (VT.is512BitVector() && !Subtarget.hasBWI())
32750 return splitVectorIntUnary(Op, DAG, DL);
32751
32752 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32753 if (VT.is256BitVector() && !Subtarget.hasInt256())
32754 return splitVectorIntUnary(Op, DAG, DL);
32755
32756 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
32757 if (!VT.isVector()) {
32758 assert(
32759 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
32760 "Only tested for i8/i16/i32/i64");
32761 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32762 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32763 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
32764 DAG.getBitcast(MVT::v16i8, Res));
32765 Res =
32766 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(VecVT, Res),
32767 DAG.getVectorIdxConstant(0, DL));
32768 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
32769 }
32770
32771 assert(VT.isVector() && VT.getSizeInBits() >= 128);
32772
32773 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
32774 if (VT.getScalarType() != MVT::i8) {
32775 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32776 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
32777 Res = DAG.getBitcast(ByteVT, Res);
32778 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
32779 return DAG.getBitcast(VT, Res);
32780 }
32781 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
32782 "Only byte vector BITREVERSE supported");
32783
32784 unsigned NumElts = VT.getVectorNumElements();
32785
32786 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32787 if (Subtarget.hasGFNI()) {
32788 SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT);
32789 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32790 DAG.getTargetConstant(0, DL, MVT::i8));
32791 }
32792
32793 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
32794 // two nibbles and a PSHUFB lookup to find the bitreverse of each
32795 // 0-15 value (moved to the other nibble).
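// e.g. for the byte 0xD5 the low-nibble table gives 0xA0, the high-nibble
// table gives 0x0B, and their OR is the bit-reversed value 0xAB.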
32796 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32797 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32798 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32799
32800 const int LoLUT[16] = {
32801 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32802 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32803 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32804 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32805 const int HiLUT[16] = {
32806 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32807 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32808 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32809 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
32810
32811 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32812 for (unsigned i = 0; i < NumElts; ++i) {
32813 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32814 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32815 }
32816
32817 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32818 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32819 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32820 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32821 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32822}
32823
32824static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32825 SelectionDAG &DAG) {
32826 SDLoc DL(Op);
32827 SDValue X = Op.getOperand(0);
32828 MVT VT = Op.getSimpleValueType();
32829
32830 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32831 if (VT == MVT::i8 ||
32832 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
32833 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32834 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32835 DAG.getConstant(0, DL, MVT::i8));
32836 // Copy the inverse of the parity flag into a register with setcc.
32837 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32838 // Extend to the original type.
32839 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32840 }
32841
32842 // If we have POPCNT, use the default expansion.
32843 if (Subtarget.hasPOPCNT())
32844 return SDValue();
32845
32846 if (VT == MVT::i64) {
32847 // Xor the high and low 32-bit halves together using a 32-bit operation.
32848 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32849 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32850 DAG.getConstant(32, DL, MVT::i8)));
32851 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32852 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32853 }
32854
32855 if (VT != MVT::i16) {
32856 // Xor the high and low 16-bits together using a 32-bit operation.
32857 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32858 DAG.getConstant(16, DL, MVT::i8));
32859 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32860 } else {
32861 // If the input is 16-bits, we need to extend to use an i32 shift below.
32862 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32863 }
32864
32865 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
32866 // This should allow an h-reg to be used to save a shift.
32867 SDValue Hi = DAG.getNode(
32868 ISD::TRUNCATE, DL, MVT::i8,
32869 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32870 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32871 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32872 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32873
32874 // Copy the inverse of the parity flag into a register with setcc.
32875 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32876 // Extend to the original type.
32877 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32878}
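
// Illustrative sketch (not part of the lowering above; helper name is made up):
// the xor-folding used when POPCNT is unavailable. Folding a value onto itself
// with shrinking shifts preserves parity; the lowering stops at the 8-bit
// flag-setting XOR and reads the hardware parity flag via SETNP.
[[maybe_unused]] static unsigned parityByXorFold(uint64_t X) {
  X ^= X >> 32; // fold i64 -> i32
  X ^= X >> 16; // fold i32 -> i16
  X ^= X >> 8;  // the lowering does this with an 8-bit flag-setting XOR
  // Portable stand-in for reading PF with SETNP:
  X ^= X >> 4;
  X ^= X >> 2;
  X ^= X >> 1;
  return unsigned(X) & 1; // 1 if the original value had odd parity
}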
32879
32880 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
32881 const X86Subtarget &Subtarget) {
32882 unsigned NewOpc = 0;
32883 switch (N->getOpcode()) {
32884 case ISD::ATOMIC_LOAD_ADD:
32885 NewOpc = X86ISD::LADD;
32886 break;
32887 case ISD::ATOMIC_LOAD_SUB:
32888 NewOpc = X86ISD::LSUB;
32889 break;
32890 case ISD::ATOMIC_LOAD_OR:
32891 NewOpc = X86ISD::LOR;
32892 break;
32893 case ISD::ATOMIC_LOAD_XOR:
32894 NewOpc = X86ISD::LXOR;
32895 break;
32896 case ISD::ATOMIC_LOAD_AND:
32897 NewOpc = X86ISD::LAND;
32898 break;
32899 default:
32900 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32901 }
32902
32903 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32904
32905 return DAG.getMemIntrinsicNode(
32906 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32907 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32908 /*MemVT=*/N->getSimpleValueType(0), MMO);
32909}
32910
32911/// Lower atomic_load_ops into LOCK-prefixed operations.
32912 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
32913 const X86Subtarget &Subtarget) {
32914 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32915 SDValue Chain = N->getOperand(0);
32916 SDValue LHS = N->getOperand(1);
32917 SDValue RHS = N->getOperand(2);
32918 unsigned Opc = N->getOpcode();
32919 MVT VT = N->getSimpleValueType(0);
32920 SDLoc DL(N);
32921
32922 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32923 // can only be lowered when the result is unused. They should have already
32924 // been transformed into a cmpxchg loop in AtomicExpand.
32925 if (N->hasAnyUseOfValue(0)) {
32926 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32927 // select LXADD if LOCK_SUB can't be selected.
32928 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
32929 // can use LXADD as opposed to cmpxchg.
32930 if (Opc == ISD::ATOMIC_LOAD_SUB ||
32931 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
32932 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32933 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
32934
32935 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
32936 "Used AtomicRMW ops other than Add should have been expanded!");
32937 return N;
32938 }
32939
32940 // Specialized lowering for the canonical form of an idempotent atomicrmw.
32941 // The core idea here is that since the memory location isn't actually
32942 // changing, all we need is a lowering for the *ordering* impacts of the
32943 // atomicrmw. As such, we can choose a different operation and memory
32944 // location to minimize impact on other code.
32945 // The above holds unless the node is marked volatile in which
32946 // case it needs to be preserved according to the langref.
32947 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
32948 // On X86, the only ordering which actually requires an instruction is
32949 // seq_cst which isn't SingleThread, everything just needs to be preserved
32950 // during codegen and then dropped. Note that we expect (but don't assume)
32951 // that orderings other than seq_cst and acq_rel have been canonicalized to
32952 // a store or load.
32953 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
32954 AN->getSyncScopeID() == SyncScope::System) {
32955 // Prefer a locked operation against a stack location to minimize cache
32956 // traffic. This assumes that stack locations are very likely to be
32957 // accessed only by the owning thread.
32958 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32959 assert(!N->hasAnyUseOfValue(0));
32960 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32961 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32962 DAG.getUNDEF(VT), NewChain);
32963 }
32964 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32965 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32966 assert(!N->hasAnyUseOfValue(0));
32967 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32968 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32969 DAG.getUNDEF(VT), NewChain);
32970 }
32971
32972 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32973 // RAUW the chain, but don't worry about the result, as it's unused.
32974 assert(!N->hasAnyUseOfValue(0));
32975 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32976 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32977 DAG.getUNDEF(VT), LockOp.getValue(1));
32978}
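
// Illustrative sketch (helper name is made up, not LLVM API) of the
// idempotent-RMW decision above: when the result of `atomicrmw or p, 0` is
// unused and the node is not volatile, only the *ordering* matters. Only
// seq_cst at system scope emits a real instruction, implemented as a locked
// operation against a stack slot (emitLockedStackOp); anything weaker is kept
// only as a compiler barrier (ISD::MEMBARRIER).
[[maybe_unused]] static bool idempotentRMWNeedsLockedStackOp(bool IsSeqCst,
                                                             bool IsSystemScope) {
  return IsSeqCst && IsSystemScope;
}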
32979
32980 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
32981 const X86Subtarget &Subtarget) {
32982 auto *Node = cast<AtomicSDNode>(Op.getNode());
32983 SDLoc dl(Node);
32984 EVT VT = Node->getMemoryVT();
32985
32986 bool IsSeqCst =
32987 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32988 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
32989
32990 // If this store is not sequentially consistent and the type is legal
32991 // we can just keep it.
32992 if (!IsSeqCst && IsTypeLegal)
32993 return Op;
32994
32995 if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
32996 !DAG.getMachineFunction().getFunction().hasFnAttribute(
32997 Attribute::NoImplicitFloat)) {
32998 SDValue Chain;
32999 // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
33000 // vector store.
33001 if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
33002 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
33003 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
33004 Node->getMemOperand());
33005 }
33006
33007 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
33008 // is enabled.
33009 if (VT == MVT::i64) {
33010 if (Subtarget.hasSSE1()) {
33011 SDValue SclToVec =
33012 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
33013 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33014 SclToVec = DAG.getBitcast(StVT, SclToVec);
33015 SDVTList Tys = DAG.getVTList(MVT::Other);
33016 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33017 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33018 MVT::i64, Node->getMemOperand());
33019 } else if (Subtarget.hasX87()) {
33020 // First load this into an 80-bit X87 register using a stack temporary.
33021 // This will put the whole integer into the significand.
33022 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33023 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33024 MachinePointerInfo MPI =
33025 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33026 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
33027 MPI, MaybeAlign(), MachineMemOperand::MOStore);
33028 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33029 SDValue LdOps[] = {Chain, StackPtr};
33030 SDValue Value = DAG.getMemIntrinsicNode(
33031 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33032 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33033 Chain = Value.getValue(1);
33034
33035 // Now use an FIST to do the atomic store.
33036 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33037 Chain =
33038 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33039 StoreOps, MVT::i64, Node->getMemOperand());
33040 }
33041 }
33042
33043 if (Chain) {
33044 // If this is a sequentially consistent store, also emit an appropriate
33045 // barrier.
33046 if (IsSeqCst)
33047 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33048
33049 return Chain;
33050 }
33051 }
33052
33053 // Convert seq_cst store -> xchg
33054 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33055 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
33056 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
33057 Node->getOperand(0), Node->getOperand(2),
33058 Node->getOperand(1), Node->getMemOperand());
33059 return Swap.getValue(1);
33060}
33061
33062 static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
33063 SDNode *N = Op.getNode();
33064 MVT VT = N->getSimpleValueType(0);
33065 unsigned Opc = Op.getOpcode();
33066
33067 // Let legalize expand this if it isn't a legal type yet.
33068 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33069 return SDValue();
33070
33071 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33072 SDLoc DL(N);
33073
33074 // Set the carry flag.
33075 SDValue Carry = Op.getOperand(2);
33076 EVT CarryVT = Carry.getValueType();
33077 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33078 Carry, DAG.getAllOnesConstant(DL, CarryVT));
33079
33080 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
33081 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33082 Op.getOperand(0), Op.getOperand(1),
33083 Carry.getValue(1));
33084
33085 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33086 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33087 Sum.getValue(1), DL, DAG);
33088 if (N->getValueType(1) == MVT::i1)
33089 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33090
33091 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33092}
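
// Illustrative sketch (helper name is made up) of the carry trick used by
// LowerADDSUBO_CARRY above: adding all-ones to a 0/1 carry value sets the
// hardware carry flag exactly when the incoming carry was 1, after which a
// single ADC/SBB produces the sum/difference and X86::COND_B (or COND_O for
// the signed forms) yields the outgoing carry/overflow.
[[maybe_unused]] static uint32_t uaddoCarryScalar(uint32_t A, uint32_t B,
                                                  unsigned CarryIn,
                                                  unsigned &CarryOut) {
  uint64_t Wide = uint64_t(A) + uint64_t(B) + (CarryIn & 1);
  CarryOut = unsigned(Wide >> 32); // what the ADC leaves in the carry flag
  return uint32_t(Wide);
}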
33093
33094static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33095 SelectionDAG &DAG) {
33096 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33097
33098 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33099 // which returns the values as { float, float } (in XMM0) or
33100 // { double, double } (which is returned in XMM0, XMM1).
33101 SDLoc dl(Op);
33102 SDValue Arg = Op.getOperand(0);
33103 EVT ArgVT = Arg.getValueType();
33104 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33105
33106 TargetLowering::ArgListTy Args;
33107 Args.emplace_back(Arg, ArgTy);
33108
33109 bool isF64 = ArgVT == MVT::f64;
33110 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33111 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33112 // the results are returned via SRet in memory.
33113 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33114 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33115 const char *LibcallName = TLI.getLibcallName(LC);
33116 SDValue Callee =
33117 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33118
33119 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33120 : (Type *)FixedVectorType::get(ArgTy, 4);
33121
33122 TargetLowering::CallLoweringInfo CLI(DAG);
33123 CLI.setDebugLoc(dl)
33124 .setChain(DAG.getEntryNode())
33125 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33126
33127 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33128
33129 if (isF64)
33130 // Returned in xmm0 and xmm1.
33131 return CallResult.first;
33132
33133 // Returned in bits 0:31 and 32:63 of xmm0.
33134 SDValue SinVal =
33135 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33136 DAG.getVectorIdxConstant(0, dl));
33137 SDValue CosVal =
33138 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33139 DAG.getVectorIdxConstant(1, dl));
33140 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33141 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33142}
33143
33144/// Widen a vector input to a vector of NVT. The
33145/// input vector must have the same element type as NVT.
33146 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
33147 bool FillWithZeroes = false) {
33148 // Check if InOp already has the right width.
33149 MVT InVT = InOp.getSimpleValueType();
33150 if (InVT == NVT)
33151 return InOp;
33152
33153 if (InOp.isUndef())
33154 return DAG.getUNDEF(NVT);
33155
33156 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
33157 "input and widen element type must match");
33158
33159 unsigned InNumElts = InVT.getVectorNumElements();
33160 unsigned WidenNumElts = NVT.getVectorNumElements();
33161 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33162 "Unexpected request for vector widening");
33163
33164 SDLoc dl(InOp);
33165 if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
33166 SDValue N1 = InOp.getOperand(1);
33167 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33168 N1.isUndef()) {
33169 InOp = InOp.getOperand(0);
33170 InVT = InOp.getSimpleValueType();
33171 InNumElts = InVT.getVectorNumElements();
33172 }
33173 }
33174 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
33175 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
33176 EVT EltVT = InOp.getOperand(0).getValueType();
33177 SDValue FillVal =
33178 FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT);
33179 SmallVector<SDValue, 16> Ops(InOp->op_begin(), InOp->op_end());
33180 Ops.append(WidenNumElts - InNumElts, FillVal);
33181 return DAG.getBuildVector(NVT, dl, Ops);
33182 }
33183 SDValue FillVal =
33184 FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT);
33185 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp,
33186 DAG.getVectorIdxConstant(0, dl));
33187}
33188
33189 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
33190 SelectionDAG &DAG) {
33191 assert(Subtarget.hasAVX512() &&
33192 "MGATHER/MSCATTER are supported on AVX-512 arch only");
33193
33194 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
33195 SDValue Src = N->getValue();
33196 MVT VT = Src.getSimpleValueType();
33197 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33198 SDLoc dl(Op);
33199
33200 SDValue Scale = N->getScale();
33201 SDValue Index = N->getIndex();
33202 SDValue Mask = N->getMask();
33203 SDValue Chain = N->getChain();
33204 SDValue BasePtr = N->getBasePtr();
33205
33206 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33207 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33208 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33209 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33210 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33211 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33212 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33213 SDVTList VTs = DAG.getVTList(MVT::Other);
33214 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33215 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33216 N->getMemoryVT(), N->getMemOperand());
33217 }
33218 return SDValue();
33219 }
33220
33221 MVT IndexVT = Index.getSimpleValueType();
33222
33223 // If the index is v2i32, we're being called by type legalization and we
33224 // should just let the default handling take care of it.
33225 if (IndexVT == MVT::v2i32)
33226 return SDValue();
33227
33228 // If we don't have VLX and neither the passthru nor the index is 512 bits, we
33229 // need to widen until one is.
33230 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33231 !Index.getSimpleValueType().is512BitVector()) {
33232 // Determine how much we need to widen by to get a 512-bit type.
33233 unsigned Factor = std::min(512/VT.getSizeInBits(),
33234 512/IndexVT.getSizeInBits());
33235 unsigned NumElts = VT.getVectorNumElements() * Factor;
33236
33237 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33238 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33239 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33240
33241 Src = ExtendToType(Src, VT, DAG);
33242 Index = ExtendToType(Index, IndexVT, DAG);
33243 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33244 }
33245
33246 SDVTList VTs = DAG.getVTList(MVT::Other);
33247 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33248 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33249 N->getMemoryVT(), N->getMemOperand());
33250}
33251
33252static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33253 SelectionDAG &DAG) {
33254
33255 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33256 MVT VT = Op.getSimpleValueType();
33257 MVT ScalarVT = VT.getScalarType();
33258 SDValue Mask = N->getMask();
33259 MVT MaskVT = Mask.getSimpleValueType();
33260 SDValue PassThru = N->getPassThru();
33261 SDLoc dl(Op);
33262
33263 // Handle AVX masked loads which don't support passthru other than 0.
33264 if (MaskVT.getVectorElementType() != MVT::i1) {
33265 // We also allow undef in the isel pattern.
33266 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33267 return Op;
33268
33269 SDValue NewLoad = DAG.getMaskedLoad(
33270 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33271 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33272 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33273 N->isExpandingLoad());
33274 // Emit a blend.
33275 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33276 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33277 }
33278
33279 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33280 "Expanding masked load is supported on AVX-512 target only!");
33281
33282 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33283 "Expanding masked load is supported for 32 and 64-bit types only!");
33284
33285 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33286 "Cannot lower masked load op.");
33287
33288 assert((ScalarVT.getSizeInBits() >= 32 ||
33289 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33290 ScalarVT == MVT::f16))) &&
33291 "Unsupported masked load op.");
33292
33293 // This operation is legal for targets with VLX, but without
33294 // VLX the vector should be widened to 512 bits.
33295 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33296 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33297 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33298
33299 // Mask element has to be i1.
33300 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33301 "Unexpected mask type");
33302
33303 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33304
33305 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33306 SDValue NewLoad = DAG.getMaskedLoad(
33307 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33308 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33309 N->getExtensionType(), N->isExpandingLoad());
33310
33311 SDValue Extract =
33312 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33313 DAG.getVectorIdxConstant(0, dl));
33314 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33315 return DAG.getMergeValues(RetOps, dl);
33316}
33317
33318static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33319 SelectionDAG &DAG) {
33320 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33321 SDValue DataToStore = N->getValue();
33322 MVT VT = DataToStore.getSimpleValueType();
33323 MVT ScalarVT = VT.getScalarType();
33324 SDValue Mask = N->getMask();
33325 SDLoc dl(Op);
33326
33327 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33328 "Compressing masked store is supported on AVX-512 target only!");
33329
33330 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33331 "Compressing masked store is supported for 32 and 64-bit types only!");
33332
33333 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33334 "Cannot lower masked store op.");
33335
33336 assert((ScalarVT.getSizeInBits() >= 32 ||
33337 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33338 ScalarVT == MVT::f16))) &&
33339 "Unsupported masked store op.");
33340
33341 // This operation is legal for targets with VLX, but without
33342 // VLX the vector should be widened to 512 bits.
33343 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33344 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33345
33346 // Mask element has to be i1.
33347 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33348 "Unexpected mask type");
33349
33350 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33351
33352 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33353 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33354 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33355 N->getOffset(), Mask, N->getMemoryVT(),
33356 N->getMemOperand(), N->getAddressingMode(),
33357 N->isTruncatingStore(), N->isCompressingStore());
33358}
33359
33360static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33361 SelectionDAG &DAG) {
33362 assert(Subtarget.hasAVX2() &&
33363 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33364
33365 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33366 SDLoc dl(Op);
33367 MVT VT = Op.getSimpleValueType();
33368 SDValue Index = N->getIndex();
33369 SDValue Mask = N->getMask();
33370 SDValue PassThru = N->getPassThru();
33371 MVT IndexVT = Index.getSimpleValueType();
33372
33373 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33374
33375 // If the index is v2i32, we're being called by type legalization.
33376 if (IndexVT == MVT::v2i32)
33377 return SDValue();
33378
33379 // If we don't have VLX and neither the passthru nor the index is 512 bits, we
33380 // need to widen until one is.
33381 MVT OrigVT = VT;
33382 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33383 !IndexVT.is512BitVector()) {
33384 // Determine how much we need to widen by to get a 512-bit type.
33385 unsigned Factor = std::min(512/VT.getSizeInBits(),
33386 512/IndexVT.getSizeInBits());
33387
33388 unsigned NumElts = VT.getVectorNumElements() * Factor;
33389
33390 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33391 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33392 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33393
33394 PassThru = ExtendToType(PassThru, VT, DAG);
33395 Index = ExtendToType(Index, IndexVT, DAG);
33396 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33397 }
33398
33399 // Break dependency on the data register.
33400 if (PassThru.isUndef())
33401 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33402
33403 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33404 N->getScale() };
33405 SDValue NewGather = DAG.getMemIntrinsicNode(
33406 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33407 N->getMemOperand());
33408 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather,
33409 DAG.getVectorIdxConstant(0, dl));
33410 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33411}
33412
33413 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33414 SDLoc dl(Op);
33415 SDValue Src = Op.getOperand(0);
33416 MVT DstVT = Op.getSimpleValueType();
33417
33418 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33419 unsigned SrcAS = N->getSrcAddressSpace();
33420
33421 assert(SrcAS != N->getDestAddressSpace() &&
33422 "addrspacecast must be between different address spaces");
33423
33424 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33425 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33426 } else if (DstVT == MVT::i64) {
33427 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33428 } else if (DstVT == MVT::i32) {
33429 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33430 } else {
33431 report_fatal_error("Bad address space in addrspacecast");
33432 }
33433 return Op;
33434}
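
// Illustrative sketch (helper name is made up) of the extension rules in
// LowerADDRSPACECAST above for x86 mixed pointer sizes: a 32->64 bit cast from
// the unsigned PTR32_UPTR space zero extends, other 32->64 bit casts sign
// extend, and 64->32 bit casts truncate.
[[maybe_unused]] static uint64_t extendPtr32To64(uint32_t Ptr,
                                                 bool IsUnsignedPtr32) {
  return IsUnsignedPtr32 ? uint64_t(Ptr)                    // zero extend
                         : uint64_t(int64_t(int32_t(Ptr))); // sign extend
}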
33435
33436SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33437 SelectionDAG &DAG) const {
33438 // TODO: Eventually, the lowering of these nodes should be informed by or
33439 // deferred to the GC strategy for the function in which they appear. For
33440 // now, however, they must be lowered to something. Since they are logically
33441 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33442 // require special handling for these nodes), lower them as literal NOOPs for
33443 // the time being.
33444 SmallVector<SDValue, 2> Ops;
33445 Ops.push_back(Op.getOperand(0));
33446 if (Op->getGluedNode())
33447 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33448
33449 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33450 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33451}
33452
33453// Custom split CVTPS2PH with wide types.
33454 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33455 SDLoc dl(Op);
33456 EVT VT = Op.getValueType();
33457 SDValue Lo, Hi;
33458 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33459 EVT LoVT, HiVT;
33460 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33461 SDValue RC = Op.getOperand(1);
33462 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33463 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33464 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33465}
33466
33467 static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
33468 SelectionDAG &DAG) {
33469 unsigned IsData = Op.getConstantOperandVal(4);
33470
33471 // We don't support non-data prefetch without PREFETCHI.
33472 // Just preserve the chain.
33473 if (!IsData && !Subtarget.hasPREFETCHI())
33474 return Op.getOperand(0);
33475
33476 return Op;
33477}
33478
33479 static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG) {
33480 SDNode *N = Op.getNode();
33481 SDValue Operand = N->getOperand(0);
33482 EVT VT = Operand.getValueType();
33483 SDLoc dl(N);
33484
33485 SDValue One = DAG.getConstantFP(1.0, dl, VT);
33486
33487 // TODO: Fix crash for bf16 when generating strict_fmul, as it
33488 // leads to an error: SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
33489 // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
33490 // promote this operator's result!
33491 SDValue Chain = DAG.getEntryNode();
33492 SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
33493 {Chain, Operand, One});
33494 return StrictFmul;
33495}
33496
33497 static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33498 unsigned OpNo) {
33499 const APInt Operand(32, OpNo);
33500 std::string OpNoStr = llvm::toString(Operand, 10, false);
33501 std::string Str(" $");
33502
33503 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33504 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33505
33506 auto I = StringRef::npos;
33507 for (auto &AsmStr : AsmStrs) {
33508 // Match the OpNo string. We should match exactly to avoid matching a
33509 // sub-string, e.g. "$12" contains "$1".
33510 if (AsmStr.ends_with(OpNoStr1))
33511 I = AsmStr.size() - OpNoStr1.size();
33512
33513 // Get the index of operand in AsmStr.
33514 if (I == StringRef::npos)
33515 I = AsmStr.find(OpNoStr1 + ",");
33516 if (I == StringRef::npos)
33517 I = AsmStr.find(OpNoStr2);
33518
33519 if (I == StringRef::npos)
33520 continue;
33521
33522 assert(I > 0 && "Unexpected inline asm string!");
33523 // Remove the operand string and label (if they exist).
33524 // For example:
33525 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33526 // ==>
33527 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33528 // ==>
33529 // "call dword ptr "
33530 auto TmpStr = AsmStr.substr(0, I);
33531 I = TmpStr.rfind(':');
33532 if (I != StringRef::npos)
33533 TmpStr = TmpStr.substr(I + 1);
33534 return TmpStr.take_while(llvm::isAlpha);
33535 }
33536
33537 return StringRef();
33538}
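
// Illustrative sketch (helper and token names are made up) of the string
// surgery above, using std::string instead of StringRef: cut the asm string
// before the operand token (e.g. " ${0:"), drop any leading "label:" prefix at
// the last ':', then take the leading alphabetic run as the mnemonic. For
// ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}" this yields "call".
[[maybe_unused]] static std::string
mnemonicBeforeOperand(const std::string &AsmStr, const std::string &OpToken) {
  size_t I = AsmStr.find(OpToken);        // position of the operand reference
  std::string Head = AsmStr.substr(0, I); // text before the operand
  size_t Colon = Head.rfind(':');         // strip "label:" if present
  if (Colon != std::string::npos)
    Head = Head.substr(Colon + 1);
  std::string Mnemonic;
  for (char C : Head) {
    if (!isalpha(static_cast<unsigned char>(C)))
      break;
    Mnemonic += C;
  }
  return Mnemonic;
}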
33539
33540 bool X86TargetLowering::isInlineAsmTargetBranch(
33541 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33542 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
33543 // changed from indirect TargetLowering::C_Memory to direct
33544 // TargetLowering::C_Address.
33545 // We don't need to special case LOOP* and Jcc, which cannot target a memory
33546 // location.
33547 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
33548 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
33549}
33550
33551 static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL,
33552 SDValue Mask) {
33553 EVT Ty = MVT::i8;
33554 auto V = DAG.getBitcast(MVT::i1, Mask);
33555 auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
33556 auto Zero = DAG.getConstant(0, DL, Ty);
33557 SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
33558 auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
33559 return SDValue(CmpZero.getNode(), 1);
33560}
33561
33562 SDValue X86TargetLowering::visitMaskedLoad(
33563 SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
33564 SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
33565 // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
33566 // ->
33567 // _, flags = SUB 0, mask
33568 // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
33569 // bit_cast_to_vector<res>
33570 EVT VTy = PassThru.getValueType();
33571 EVT Ty = VTy.getVectorElementType();
33572 SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
33573 auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
33574 : DAG.getBitcast(Ty, PassThru);
33575 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33576 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33577 SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
33578 NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
33579 return DAG.getBitcast(VTy, NewLoad);
33580}
33581
33582 SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
33583 SDValue Chain,
33584 MachineMemOperand *MMO, SDValue Ptr,
33585 SDValue Val, SDValue Mask) const {
33586 // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
33587 // ->
33588 // _, flags = SUB 0, mask
33589 // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
33590 EVT Ty = Val.getValueType().getVectorElementType();
33591 SDVTList Tys = DAG.getVTList(MVT::Other);
33592 auto ScalarVal = DAG.getBitcast(Ty, Val);
33593 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33594 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33595 SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
33596 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
33597}
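
// Illustrative sketch (helper names are made up): a scalar model of the
// single-element masked load/store that visitMaskedLoad/visitMaskedStore above
// turn into flag-guarded X86ISD::CLOAD/CSTORE nodes. The v1i1 mask is tested
// against zero and the memory access happens only when the mask is set.
template <typename T>
static T maskedLoad1(const T *Ptr, bool Mask, T PassThru) {
  return Mask ? *Ptr : PassThru;
}
template <typename T> static void maskedStore1(T *Ptr, bool Mask, T Val) {
  if (Mask)
    *Ptr = Val;
}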
33598
33599/// Provide custom lowering hooks for some operations.
33600 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
33601 switch (Op.getOpcode()) {
33602 // clang-format off
33603 default: llvm_unreachable("Should not custom lower this!");
33604 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33605 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
33606 return LowerCMP_SWAP(Op, Subtarget, DAG);
33607 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33608 case ISD::ATOMIC_LOAD_ADD:
33609 case ISD::ATOMIC_LOAD_SUB:
33610 case ISD::ATOMIC_LOAD_OR:
33611 case ISD::ATOMIC_LOAD_XOR:
33612 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33613 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33614 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33615 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33616 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33617 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33618 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33619 case ISD::VECTOR_COMPRESS: return lowerVECTOR_COMPRESS(Op, Subtarget, DAG);
33620 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33621 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33622 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33623 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33624 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33625 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33626 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33627 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33628 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33629 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33630 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33631 case ISD::SHL_PARTS:
33632 case ISD::SRA_PARTS:
33633 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33634 case ISD::FSHL:
33635 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33636 case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG);
33637 case ISD::STRICT_SINT_TO_FP:
33638 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33639 case ISD::STRICT_UINT_TO_FP:
33640 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33641 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33642 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33643 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33644 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33645 case ISD::ZERO_EXTEND_VECTOR_INREG:
33646 case ISD::SIGN_EXTEND_VECTOR_INREG:
33647 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33648 case ISD::FP_TO_SINT:
33649 case ISD::STRICT_FP_TO_SINT:
33650 case ISD::FP_TO_UINT:
33651 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33652 case ISD::FP_TO_SINT_SAT:
33653 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33654 case ISD::FP_EXTEND:
33655 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33656 case ISD::FP_ROUND:
33657 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33658 case ISD::FP16_TO_FP:
33659 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33660 case ISD::FP_TO_FP16:
33661 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33662 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33663 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33664 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33665 case ISD::FADD:
33666 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33667 case ISD::FROUND: return LowerFROUND(Op, DAG);
33668 case ISD::FABS:
33669 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33670 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33671 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33672 case ISD::LRINT:
33673 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33674 case ISD::SETCC:
33675 case ISD::STRICT_FSETCC:
33676 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33677 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33678 case ISD::SELECT: return LowerSELECT(Op, DAG);
33679 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33680 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33681 case ISD::VASTART: return LowerVASTART(Op, DAG);
33682 case ISD::VAARG: return LowerVAARG(Op, DAG);
33683 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33684 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33685 case ISD::INTRINSIC_VOID:
33686 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33687 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33688 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33689 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33690 case ISD::FRAME_TO_ARGS_OFFSET:
33691 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33692 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33693 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33694 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33695 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33696 case ISD::EH_SJLJ_SETUP_DISPATCH:
33697 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33698 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33699 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33700 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33701 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33702 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
33703 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
33704 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
33705 case ISD::CTLZ:
33706 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33707 case ISD::CTTZ:
33708 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33709 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33710 case ISD::MULHS:
33711 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33712 case ISD::ROTL:
33713 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33714 case ISD::SRA:
33715 case ISD::SRL:
33716 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33717 case ISD::SADDO:
33718 case ISD::UADDO:
33719 case ISD::SSUBO:
33720 case ISD::USUBO: return LowerXALUO(Op, DAG);
33721 case ISD::SMULO:
33722 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33723 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33724 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33725 case ISD::SADDO_CARRY:
33726 case ISD::SSUBO_CARRY:
33727 case ISD::UADDO_CARRY:
33728 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
33729 case ISD::ADD:
33730 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33731 case ISD::UADDSAT:
33732 case ISD::SADDSAT:
33733 case ISD::USUBSAT:
33734 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33735 case ISD::SMAX:
33736 case ISD::SMIN:
33737 case ISD::UMAX:
33738 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33739 case ISD::FMINIMUM:
33740 case ISD::FMAXIMUM:
33741 case ISD::FMINIMUMNUM:
33742 case ISD::FMAXIMUMNUM:
33743 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
33744 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33745 case ISD::ABDS:
33746 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33747 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33748 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33749 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33750 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33751 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33752 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33753 case ISD::GC_TRANSITION_START:
33754 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33755 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33756 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33757 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
33758 // clang-format on
33759 }
33760}
33761
33762/// Replace a node with an illegal result type with a new node built out of
33763/// custom code.
33764 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
33765 SmallVectorImpl<SDValue> &Results,
33766 SelectionDAG &DAG) const {
33767 SDLoc dl(N);
33768 unsigned Opc = N->getOpcode();
33769 switch (Opc) {
33770 default:
33771#ifndef NDEBUG
33772 dbgs() << "ReplaceNodeResults: ";
33773 N->dump(&DAG);
33774#endif
33775 llvm_unreachable("Do not know how to custom type legalize this operation!");
33776 case X86ISD::CVTPH2PS: {
33777 EVT VT = N->getValueType(0);
33778 SDValue Lo, Hi;
33779 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33780 EVT LoVT, HiVT;
33781 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33782 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33783 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33784 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33785 Results.push_back(Res);
33786 return;
33787 }
33788 case X86ISD::STRICT_CVTPH2PS: {
33789 EVT VT = N->getValueType(0);
33790 SDValue Lo, Hi;
33791 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33792 EVT LoVT, HiVT;
33793 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33794 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33795 {N->getOperand(0), Lo});
33796 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33797 {N->getOperand(0), Hi});
33798 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33799 Lo.getValue(1), Hi.getValue(1));
33800 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33801 Results.push_back(Res);
33802 Results.push_back(Chain);
33803 return;
33804 }
33805 case X86ISD::CVTPS2PH:
33806 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33807 return;
33808 case ISD::CTPOP: {
33809 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33810 // If we have at most 32 active bits, then perform as i32 CTPOP.
33811 // TODO: Perform this in generic legalizer?
33812 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
33813 unsigned LZ = Known.countMinLeadingZeros();
33814 unsigned TZ = Known.countMinTrailingZeros();
33815 if ((LZ + TZ) >= 32) {
33816 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
33817 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
33818 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
33819 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
33820 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
33821 Results.push_back(Op);
33822 return;
33823 }
33824 // Use a v2i64 if possible.
33825 bool NoImplicitFloatOps =
33826 DAG.getMachineFunction().getFunction().hasFnAttribute(
33827 Attribute::NoImplicitFloat);
33828 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33829 SDValue Wide =
33830 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33831 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33832 // Bit count should fit in 32-bits, extract it as that and then zero
33833 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33834 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33835 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33836 DAG.getVectorIdxConstant(0, dl));
33837 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33838 Results.push_back(Wide);
33839 }
33840 return;
33841 }
33842 case ISD::MUL: {
33843 EVT VT = N->getValueType(0);
33844 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33845 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33846 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33847 // elements are needed.
33848 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33849 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33850 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33851 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33852 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33853 unsigned NumConcats = 16 / VT.getVectorNumElements();
33854 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33855 ConcatOps[0] = Res;
33856 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33857 Results.push_back(Res);
33858 return;
33859 }
33860 case ISD::SMULO:
33861 case ISD::UMULO: {
33862 EVT VT = N->getValueType(0);
33863 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33864 VT == MVT::v2i32 && "Unexpected VT!");
33865 bool IsSigned = Opc == ISD::SMULO;
33866 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
33867 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33868 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33869 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33870 // Extract the high 32 bits from each result using PSHUFD.
33871 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
33872 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33873 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33874 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33875 DAG.getVectorIdxConstant(0, dl));
33876
33877 // Truncate the low bits of the result. This will become PSHUFD.
33878 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33879
33880 SDValue HiCmp;
33881 if (IsSigned) {
33882 // SMULO overflows if the high bits don't match the sign of the low.
33883 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33884 } else {
33885 // UMULO overflows if the high bits are non-zero.
33886 HiCmp = DAG.getConstant(0, dl, VT);
33887 }
33888 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33889
33890 // Widen the result by padding with undef.
33891 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33892 DAG.getUNDEF(VT));
33893 Results.push_back(Res);
33894 Results.push_back(Ovf);
33895 return;
33896 }
33897 case X86ISD::VPMADDWD: {
33898 // Legalize types for X86ISD::VPMADDWD by widening.
33899 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33900
33901 EVT VT = N->getValueType(0);
33902 EVT InVT = N->getOperand(0).getValueType();
33903 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33904 "Expected a VT that divides into 128 bits.");
33905 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
33906 "Unexpected type action!");
33907 unsigned NumConcat = 128 / InVT.getSizeInBits();
33908
33909 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33910 InVT.getVectorElementType(),
33911 NumConcat * InVT.getVectorNumElements());
33912 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33913 VT.getVectorElementType(),
33914 NumConcat * VT.getVectorNumElements());
33915
33916 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33917 Ops[0] = N->getOperand(0);
33918 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33919 Ops[0] = N->getOperand(1);
33920 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33921
33922 SDValue Res = DAG.getNode(Opc, dl, WideVT, InVec0, InVec1);
33923 Results.push_back(Res);
33924 return;
33925 }
33926 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33927 case X86ISD::FMINC:
33928 case X86ISD::FMIN:
33929 case X86ISD::FMAXC:
33930 case X86ISD::FMAX:
33931 case X86ISD::STRICT_FMIN:
33932 case X86ISD::STRICT_FMAX: {
33933 EVT VT = N->getValueType(0);
33934 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33935 bool IsStrict = Opc == X86ISD::STRICT_FMIN || Opc == X86ISD::STRICT_FMAX;
33936 SDValue UNDEF = DAG.getUNDEF(VT);
33937 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33938 N->getOperand(IsStrict ? 1 : 0), UNDEF);
33939 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33940 N->getOperand(IsStrict ? 2 : 1), UNDEF);
33941 SDValue Res;
33942 if (IsStrict)
33943 Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33944 {N->getOperand(0), LHS, RHS});
33945 else
33946 Res = DAG.getNode(Opc, dl, MVT::v4f32, LHS, RHS);
33947 Results.push_back(Res);
33948 if (IsStrict)
33949 Results.push_back(Res.getValue(1));
33950 return;
33951 }
33952 case ISD::SDIV:
33953 case ISD::UDIV:
33954 case ISD::SREM:
33955 case ISD::UREM: {
33956 EVT VT = N->getValueType(0);
33957 if (VT.isVector()) {
33958 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33959 "Unexpected type action!");
33960 // If this RHS is a constant splat vector we can widen this and let
33961 // division/remainder by constant optimize it.
33962 // TODO: Can we do something for non-splat?
33963 APInt SplatVal;
33964 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33965 unsigned NumConcats = 128 / VT.getSizeInBits();
33966 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33967 Ops0[0] = N->getOperand(0);
33968 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33969 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33970 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33971 SDValue Res = DAG.getNode(Opc, dl, ResVT, N0, N1);
33972 Results.push_back(Res);
33973 }
33974 return;
33975 }
33976
33977 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33978 Results.push_back(V);
33979 return;
33980 }
33981 case ISD::TRUNCATE: {
33982 MVT VT = N->getSimpleValueType(0);
33983 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33984 return;
33985
33986 // The generic legalizer will try to widen the input type to the same
33987 // number of elements as the widened result type. But this isn't always
33988 // the best thing so do some custom legalization to avoid some cases.
33989 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
33990 SDValue In = N->getOperand(0);
33991 EVT InVT = In.getValueType();
33992 EVT InEltVT = InVT.getVectorElementType();
33993 EVT EltVT = VT.getVectorElementType();
33994 unsigned MinElts = VT.getVectorNumElements();
33995 unsigned WidenNumElts = WidenVT.getVectorNumElements();
33996 unsigned InBits = InVT.getSizeInBits();
33997
33998 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
33999 unsigned PackOpcode;
34000 if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG,
34001 Subtarget, N->getFlags())) {
34002 if (SDValue Res =
34003 truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) {
34004 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
34005 Results.push_back(Res);
34006 return;
34007 }
34008 }
34009
34010 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
34011 // 128 bit and smaller inputs should avoid truncation altogether and
34012 // use a shuffle.
34013 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
34014 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
34015 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
34016 for (unsigned I = 0; I < MinElts; ++I)
34017 TruncMask[I] = Scale * I;
34018 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
34019 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
34020 "Illegal vector type in truncation");
34021 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
34022 Results.push_back(
34023 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
34024 return;
34025 }
34026 }
34027
34028 // With AVX512 there are some cases that can use a target specific
34029 // truncate node to go from 256/512 to less than 128 with zeros in the
34030 // upper elements of the 128 bit result.
34031 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34032 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
34033 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34034 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34035 return;
34036 }
34037 // There's one case we can widen to 512 bits and use VTRUNC.
34038 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34039 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34040 DAG.getUNDEF(MVT::v4i64));
34041 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34042 return;
34043 }
34044 }
34045 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34046 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34047 isTypeLegal(MVT::v4i64)) {
34048 // Input needs to be split and output needs to widened. Let's use two
34049 // VTRUNCs, and shuffle their results together into the wider type.
34050 SDValue Lo, Hi;
34051 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34052
34053 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34054 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34055 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34056 { 0, 1, 2, 3, 16, 17, 18, 19,
34057 -1, -1, -1, -1, -1, -1, -1, -1 });
34058 Results.push_back(Res);
34059 return;
34060 }
34061
34062 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
34063 // this via type legalization.
34064 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
34065 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
34066 (!Subtarget.hasSSSE3() ||
34067 (!isTypeLegal(InVT) &&
34068 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
34069 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
34070 InEltVT.getSizeInBits() * WidenNumElts);
34071 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
34072 return;
34073 }
34074
34075 return;
34076 }
34077 case ISD::ANY_EXTEND:
34078 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34079 // It's intended to custom handle the input type.
34080 assert(N->getValueType(0) == MVT::v8i8 &&
34081 "Do not know how to legalize this Node");
34082 return;
34083 case ISD::SIGN_EXTEND:
34084 case ISD::ZERO_EXTEND: {
34085 EVT VT = N->getValueType(0);
34086 SDValue In = N->getOperand(0);
34087 EVT InVT = In.getValueType();
34088 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34089 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34090 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
34091 "Unexpected type action!");
34092 assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
34093 // Custom split this so we can extend i8/i16->i32 invec. This is better
34094 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
34095 // sra, then an extend from i32 to i64 using pcmpgt. By custom splitting,
34096 // we allow the sra from the extend to i32 to be shared by the split halves.
34097 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34098
34099 // Fill a vector with sign bits for each element.
34100 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34101 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34102
34103 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34104 // to v2i64.
34105 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34106 {0, 4, 1, 5});
34107 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34108 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34109 {2, 6, 3, 7});
34110 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34111
34112 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34113 Results.push_back(Res);
34114 return;
34115 }
34116
34117 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34118 if (!InVT.is128BitVector()) {
34119 // Not a 128 bit vector, but maybe type legalization will promote
34120 // it to 128 bits.
34121 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34122 return;
34123 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34124 if (!InVT.is128BitVector())
34125 return;
34126
34127 // Promote the input to 128 bits. Type legalization will turn this into
34128 // zext_inreg/sext_inreg.
34129 In = DAG.getNode(Opc, dl, InVT, In);
34130 }
34131
34132 // Perform custom splitting instead of the two stage extend we would get
34133 // by default.
34134 EVT LoVT, HiVT;
34135 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34136 assert(isTypeLegal(LoVT) && "Split VT not legal?");
34137
34138 SDValue Lo = getEXTEND_VECTOR_INREG(Opc, dl, LoVT, In, DAG);
34139
34140 // We need to shift the input over by half the number of elements.
34141 unsigned NumElts = InVT.getVectorNumElements();
34142 unsigned HalfNumElts = NumElts / 2;
34143 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34144 for (unsigned i = 0; i != HalfNumElts; ++i)
34145 ShufMask[i] = i + HalfNumElts;
34146
34147 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34148 Hi = getEXTEND_VECTOR_INREG(Opc, dl, HiVT, Hi, DAG);
34149
34150 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34151 Results.push_back(Res);
34152 }
34153 return;
34154 }
34155 case ISD::FP_TO_SINT_SAT:
34156 case ISD::FP_TO_UINT_SAT: {
34157 if (!Subtarget.hasAVX10_2())
34158 return;
34159
34160 bool IsSigned = Opc == ISD::FP_TO_SINT_SAT;
34161 EVT VT = N->getValueType(0);
34162 SDValue Op = N->getOperand(0);
34163 EVT OpVT = Op.getValueType();
34164 SDValue Res;
34165
34166 if (VT == MVT::v2i32 && OpVT == MVT::v2f64) {
34167 if (IsSigned)
34168 Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op);
34169 else
34170 Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op);
34171 Results.push_back(Res);
34172 }
34173 return;
34174 }
34175 case ISD::FP_TO_SINT:
34176 case ISD::STRICT_FP_TO_SINT:
34177 case ISD::FP_TO_UINT:
34178 case ISD::STRICT_FP_TO_UINT: {
34179 bool IsStrict = N->isStrictFPOpcode();
34180 bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
34181 EVT VT = N->getValueType(0);
34182 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34183 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34184 EVT SrcVT = Src.getValueType();
34185
34186 SDValue Res;
34187 if (isSoftF16(SrcVT, Subtarget)) {
34188 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34189 if (IsStrict) {
34190 Res =
34191 DAG.getNode(Opc, dl, {VT, MVT::Other},
34192 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34193 {NVT, MVT::Other}, {Chain, Src})});
34194 Chain = Res.getValue(1);
34195 } else {
34196 Res =
34197 DAG.getNode(Opc, dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34198 }
34199 Results.push_back(Res);
34200 if (IsStrict)
34201 Results.push_back(Chain);
34202
34203 return;
34204 }
34205
34206 if (VT.isVector() && Subtarget.hasFP16() && Subtarget.hasVLX() &&
34207 SrcVT.getVectorElementType() == MVT::f16) {
34208 EVT EleVT = VT.getVectorElementType();
34209 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34210
34211 if (SrcVT != MVT::v8f16) {
34212 SDValue Tmp =
34213 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34214 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34215 Ops[0] = Src;
34216 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34217 }
34218
34219 if (IsStrict) {
34220 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34221 Res =
34222 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34223 Chain = Res.getValue(1);
34224 } else {
34225 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34226 Res = DAG.getNode(Opc, dl, ResVT, Src);
34227 }
34228
34229 // TODO: Need to add exception check code for strict FP.
34230 if (EleVT.getSizeInBits() < 16) {
34231 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34232 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34233
34234 // Now widen to 128 bits.
34235 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34236 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34237 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34238 ConcatOps[0] = Res;
34239 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34240 }
34241
34242 Results.push_back(Res);
34243 if (IsStrict)
34244 Results.push_back(Chain);
34245
34246 return;
34247 }
34248
34249 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34251 "Unexpected type action!");
34252
34253 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34254 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34255 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34256 VT.getVectorNumElements());
34257 SDValue Res;
34258 SDValue Chain;
34259 if (IsStrict) {
34260 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34261 {N->getOperand(0), Src});
34262 Chain = Res.getValue(1);
34263 } else
34264 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34265
34266 // Preserve what we know about the size of the original result. If the
34267 // result is v2i32, we have to manually widen the assert.
34268 if (PromoteVT == MVT::v2i32)
34269 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34270 DAG.getUNDEF(MVT::v2i32));
34271
34272 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34273 Res.getValueType(), Res,
34274 DAG.getValueType(VT.getVectorElementType()));
34275
34276 if (PromoteVT == MVT::v2i32)
34277 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34278 DAG.getVectorIdxConstant(0, dl));
34279
34280 // Truncate back to the original width.
34281 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34282
34283 // Now widen to 128 bits.
34284 unsigned NumConcats = 128 / VT.getSizeInBits();
34285 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
34286 VT.getVectorNumElements() * NumConcats);
34287 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34288 ConcatOps[0] = Res;
34289 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34290 Results.push_back(Res);
34291 if (IsStrict)
34292 Results.push_back(Chain);
34293 return;
34294 }
34295
34296
34297 if (VT == MVT::v2i32) {
34298 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34299 "Strict unsigned conversion requires AVX512");
34300 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34302 "Unexpected type action!");
34303 if (Src.getValueType() == MVT::v2f64) {
34304 if (!IsSigned && !Subtarget.hasAVX512()) {
34305 SDValue Res =
34306 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34307 Results.push_back(Res);
34308 return;
34309 }
34310
34311 if (IsStrict)
34312 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34313 else
34314 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34315
34316 // If we have VLX we can emit a target specific FP_TO_UINT node.
34317 if (!IsSigned && !Subtarget.hasVLX()) {
34318 // Otherwise we can defer to the generic legalizer which will widen
34319 // the input as well. This will be further widened during op
34320 // legalization to v8i32<-v8f64.
34321 // For strict nodes we'll need to widen ourselves.
34322 // FIXME: Fix the type legalizer to safely widen strict nodes?
34323 if (!IsStrict)
34324 return;
34325 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34326 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34327 Opc = N->getOpcode();
34328 }
34329 SDValue Res;
34330 SDValue Chain;
34331 if (IsStrict) {
34332 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34333 {N->getOperand(0), Src});
34334 Chain = Res.getValue(1);
34335 } else {
34336 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34337 }
34338 Results.push_back(Res);
34339 if (IsStrict)
34340 Results.push_back(Chain);
34341 return;
34342 }
34343
34344 // Custom widen strict v2f32->v2i32 by padding with zeros.
34345 // FIXME: Should generic type legalizer do this?
34346 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34347 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34348 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34349 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34350 {N->getOperand(0), Src});
34351 Results.push_back(Res);
34352 Results.push_back(Res.getValue(1));
34353 return;
34354 }
34355
34356 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34357 // so early out here.
34358 return;
34359 }
34360
34361 assert(!VT.isVector() && "Vectors should have been handled above!");
34362
34363 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34364 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34365 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34366 assert(!Subtarget.is64Bit() && "i64 should be legal");
34367 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34368 // If we use a 128-bit result we might need to use a target specific node.
34369 unsigned SrcElts =
34370 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34371 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34372 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34373 if (NumElts != SrcElts) {
34374 if (IsStrict)
34375 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34376 else
34377 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34378 }
34379
34380 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
34381 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34382 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34383 ZeroIdx);
34384 SDValue Chain;
34385 if (IsStrict) {
34386 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34387 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34388 Chain = Res.getValue(1);
34389 } else
34390 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34391 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34392 Results.push_back(Res);
34393 if (IsStrict)
34394 Results.push_back(Chain);
34395 return;
34396 }
34397
34398 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34399 SDValue Chain;
34400 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34401 Results.push_back(V);
34402 if (IsStrict)
34403 Results.push_back(Chain);
34404 return;
34405 }
34406
34407 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34408 Results.push_back(V);
34409 if (IsStrict)
34410 Results.push_back(Chain);
34411 }
34412 return;
34413 }
34414 case ISD::LRINT:
34415 if (N->getValueType(0) == MVT::v2i32) {
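// lrint rounds using the current rounding mode, which is exactly what the
// packed CVTP2SI conversions do, so widen the source to a legal vector type
// and convert to v4i32; the upper result lanes are ignored.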
34416 SDValue Src = N->getOperand(0);
34417 if (Subtarget.hasFP16() && Src.getValueType() == MVT::v2f16) {
34418 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src,
34419 DAG.getUNDEF(MVT::v2f16));
34420 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Src,
34421 DAG.getUNDEF(MVT::v4f16));
34422 } else if (Src.getValueType() != MVT::v2f64) {
34423 return;
34424 }
34425 Results.push_back(DAG.getNode(X86ISD::CVTP2SI, dl, MVT::v4i32, Src));
34426 return;
34427 }
34428 [[fallthrough]];
34429 case ISD::LLRINT: {
34430 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34431 Results.push_back(V);
34432 return;
34433 }
34434
34435 case ISD::SINT_TO_FP:
34436 case ISD::STRICT_SINT_TO_FP:
34437 case ISD::UINT_TO_FP:
34438 case ISD::STRICT_UINT_TO_FP: {
34439 bool IsStrict = N->isStrictFPOpcode();
34440 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
34441 EVT VT = N->getValueType(0);
34442 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34443 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34444 Subtarget.hasVLX()) {
34445 if (Src.getValueType().getVectorElementType() == MVT::i16)
34446 return;
34447
34448 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34449 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34450 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34451 : DAG.getUNDEF(MVT::v2i32));
34452 if (IsStrict) {
34453 unsigned Opc =
34454 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34455 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34456 {N->getOperand(0), Src});
34457 Results.push_back(Res);
34458 Results.push_back(Res.getValue(1));
34459 } else {
34460 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34461 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34462 }
34463 return;
34464 }
34465 if (VT != MVT::v2f32)
34466 return;
34467 EVT SrcVT = Src.getValueType();
34468 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34469 if (IsStrict) {
34470 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34471 : X86ISD::STRICT_CVTUI2P;
34472 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34473 {N->getOperand(0), Src});
34474 Results.push_back(Res);
34475 Results.push_back(Res.getValue(1));
34476 } else {
34477 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34478 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34479 }
34480 return;
34481 }
34482 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34483 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
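// The source may have the sign bit set, which a signed conversion mishandles.
// For such inputs, halve the value while folding the shifted-out bit back in
// (so the final rounding is still correct), convert as signed, double the
// result with an FADD, and select between the fast and slow results per lane.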
34484 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34485 SDValue One = DAG.getConstant(1, dl, SrcVT);
34486 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34487 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34488 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34489 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34490 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34491 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34492 for (int i = 0; i != 2; ++i) {
34493 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34494 SignSrc, DAG.getVectorIdxConstant(i, dl));
34495 if (IsStrict)
34496 SignCvts[i] =
34497 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34498 {N->getOperand(0), Elt});
34499 else
34500 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34501 }
34502 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34503 SDValue Slow, Chain;
34504 if (IsStrict) {
34505 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34506 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34507 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34508 {Chain, SignCvt, SignCvt});
34509 Chain = Slow.getValue(1);
34510 } else {
34511 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34512 }
34513 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34514 IsNeg =
34515 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34516 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34517 Results.push_back(Cvt);
34518 if (IsStrict)
34519 Results.push_back(Chain);
34520 return;
34521 }
34522
34523 if (SrcVT != MVT::v2i32)
34524 return;
34525
34526 if (IsSigned || Subtarget.hasAVX512()) {
34527 if (!IsStrict)
34528 return;
34529
34530 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34531 // FIXME: Should generic type legalizer do this?
34532 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34533 DAG.getConstant(0, dl, MVT::v2i32));
34534 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34535 {N->getOperand(0), Src});
34536 Results.push_back(Res);
34537 Results.push_back(Res.getValue(1));
34538 return;
34539 }
34540
34541 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
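// Standard bias trick: zero-extend to i64 and OR the bits into the mantissa
// of 2^52 (0x4330000000000000). Subtracting 2^52 recovers the exact unsigned
// value as a double, and VFPROUND then narrows v2f64 to the requested v2f32.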
34542 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34543 SDValue VBias = DAG.getConstantFP(
34544 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34545 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34546 DAG.getBitcast(MVT::v2i64, VBias));
34547 Or = DAG.getBitcast(MVT::v2f64, Or);
34548 if (IsStrict) {
34549 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34550 {N->getOperand(0), Or, VBias});
34551 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34552 {MVT::v4f32, MVT::Other},
34553 {Sub.getValue(1), Sub});
34554 Results.push_back(Res);
34555 Results.push_back(Res.getValue(1));
34556 } else {
34557 // TODO: Are there any fast-math-flags to propagate here?
34558 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34559 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34560 }
34561 return;
34562 }
34563 case ISD::STRICT_FP_ROUND:
34564 case ISD::FP_ROUND: {
34565 bool IsStrict = N->isStrictFPOpcode();
34566 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34567 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34568 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34569 EVT SrcVT = Src.getValueType();
34570 EVT VT = N->getValueType(0);
34571 SDValue V;
34572 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34573 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34574 : DAG.getUNDEF(MVT::v2f32);
34575 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34576 }
34577 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34578 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34579 if (SrcVT.getVectorElementType() != MVT::f32)
34580 return;
34581
34582 if (IsStrict)
34583 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34584 {Chain, Src, Rnd});
34585 else
34586 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34587
34588 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34589 if (IsStrict)
34590 Results.push_back(V.getValue(1));
34591 return;
34592 }
34593 if (!isTypeLegal(Src.getValueType()))
34594 return;
34595 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34596 if (IsStrict)
34597 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34598 {Chain, Src});
34599 else
34600 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34601 Results.push_back(V);
34602 if (IsStrict)
34603 Results.push_back(V.getValue(1));
34604 return;
34605 }
34606 case ISD::FP_EXTEND:
34607 case ISD::STRICT_FP_EXTEND: {
34608 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34609 // No other ValueType for FP_EXTEND should reach this point.
34610 assert(N->getValueType(0) == MVT::v2f32 &&
34611 "Do not know how to legalize this Node");
34612 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34613 return;
34614 bool IsStrict = N->isStrictFPOpcode();
34615 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34616 if (Src.getValueType().getVectorElementType() != MVT::f16)
34617 return;
34618 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34619 : DAG.getUNDEF(MVT::v2f16);
34620 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34621 if (IsStrict)
34622 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34623 {N->getOperand(0), V});
34624 else
34625 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34626 Results.push_back(V);
34627 if (IsStrict)
34628 Results.push_back(V.getValue(1));
34629 return;
34630 }
34631 case ISD::INTRINSIC_W_CHAIN: {
34632 unsigned IntNo = N->getConstantOperandVal(1);
34633 switch (IntNo) {
34634 default : llvm_unreachable("Do not know how to custom type "
34635 "legalize this intrinsic operation!");
34636 case Intrinsic::x86_rdtsc:
34637 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34638 Results);
34639 case Intrinsic::x86_rdtscp:
34640 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34641 Results);
34642 case Intrinsic::x86_rdpmc:
34643 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34644 Results);
34645 return;
34646 case Intrinsic::x86_rdpru:
34647 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34648 Results);
34649 return;
34650 case Intrinsic::x86_xgetbv:
34651 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34652 Results);
34653 return;
34654 }
34655 }
34656 case ISD::READCYCLECOUNTER: {
34657 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34658 }
34659 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34660 EVT T = N->getValueType(0);
34661 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34662 bool Regs64bit = T == MVT::i128;
34663 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34664 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34665 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
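// CMPXCHG8B/CMPXCHG16B compare against EDX:EAX (RDX:RAX), take the new value
// in ECX:EBX (RCX:RBX), return the old value in EDX:EAX (RDX:RAX) and report
// success in ZF, hence the explicit register copies below.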
34666 SDValue cpInL, cpInH;
34667 std::tie(cpInL, cpInH) =
34668 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34669 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34670 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34671 cpInH =
34672 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34673 cpInH, cpInL.getValue(1));
34674 SDValue swapInL, swapInH;
34675 std::tie(swapInL, swapInH) =
34676 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34677 swapInH =
34678 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34679 swapInH, cpInH.getValue(1));
34680
34681 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34682 // until later. So we keep the RBX input in a vreg and use a custom
34683 // inserter.
34684 // Since RBX will be a reserved register, the register allocator will not
34685 // make sure its value is properly saved and restored around this
34686 // live-range.
34687 SDValue Result;
34688 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34689 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34690 if (Regs64bit) {
34691 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34692 swapInH.getValue(1)};
34693 Result =
34694 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34695 } else {
34696 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34697 swapInH.getValue(1));
34698 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34699 swapInL.getValue(1)};
34700 Result =
34701 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34702 }
34703
34704 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34705 Regs64bit ? X86::RAX : X86::EAX,
34706 HalfT, Result.getValue(1));
34707 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34708 Regs64bit ? X86::RDX : X86::EDX,
34709 HalfT, cpOutL.getValue(2));
34710 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34711
34712 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34713 MVT::i32, cpOutH.getValue(2));
34714 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34715 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34716
34717 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34718 Results.push_back(Success);
34719 Results.push_back(EFLAGS.getValue(1));
34720 return;
34721 }
34722 case ISD::ATOMIC_LOAD: {
34723 assert(
34724 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
34725 "Unexpected VT!");
34726 bool NoImplicitFloatOps =
34727 DAG.getMachineFunction().getFunction().hasFnAttribute(
34728 Attribute::NoImplicitFloat);
34729 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34730 auto *Node = cast<AtomicSDNode>(N);
34731
34732 if (N->getValueType(0) == MVT::i128) {
34733 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
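// With AVX, an aligned 16-byte vector load is performed atomically, so the
// i128 atomic load can be lowered to a plain v2i64 load and the two i64
// halves extracted afterwards.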
34734 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
34735 Node->getBasePtr(), Node->getMemOperand());
34736 SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34737 DAG.getVectorIdxConstant(0, dl));
34738 SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34739 DAG.getVectorIdxConstant(1, dl));
34740 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
34741 {ResL, ResH}));
34742 Results.push_back(Ld.getValue(1));
34743 return;
34744 }
34745 break;
34746 }
34747 if (Subtarget.hasSSE1()) {
34748 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34749 // Then extract the lower 64-bits.
34750 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34751 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34752 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34753 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34754 MVT::i64, Node->getMemOperand());
34755 if (Subtarget.hasSSE2()) {
34756 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34757 DAG.getVectorIdxConstant(0, dl));
34758 Results.push_back(Res);
34759 Results.push_back(Ld.getValue(1));
34760 return;
34761 }
34762 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34763 // then casts to i64. This avoids a 128-bit stack temporary being
34764 // created by type legalization if we were to cast v4f32->v2i64.
34765 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34766 DAG.getVectorIdxConstant(0, dl));
34767 Res = DAG.getBitcast(MVT::i64, Res);
34768 Results.push_back(Res);
34769 Results.push_back(Ld.getValue(1));
34770 return;
34771 }
34772 if (Subtarget.hasX87()) {
34773 // First load this into an 80-bit X87 register. This will put the whole
34774 // integer into the significand.
34775 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34776 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34777 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34778 dl, Tys, Ops, MVT::i64,
34779 Node->getMemOperand());
34780 SDValue Chain = Result.getValue(1);
34781
34782 // Now store the X87 register to a stack temporary and convert to i64.
34783 // This store is not atomic and doesn't need to be.
34784 // FIXME: We don't need a stack temporary if the result of the load
34785 // is already being stored. We could just directly store there.
34786 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34787 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34788 MachinePointerInfo MPI =
34789 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34790 SDValue StoreOps[] = { Chain, Result, StackPtr };
34791 Chain = DAG.getMemIntrinsicNode(
34792 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34793 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34794
34795 // Finally load the value back from the stack temporary and return it.
34796 // This load is not atomic and doesn't need to be.
34797 // This load will be further type legalized.
34798 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34799 Results.push_back(Result);
34800 Results.push_back(Result.getValue(1));
34801 return;
34802 }
34803 }
34804 // TODO: Use MOVLPS when SSE1 is available?
34805 // Delegate to generic TypeLegalization. Situations we can really handle
34806 // should have already been dealt with by AtomicExpandPass.cpp.
34807 break;
34808 }
34809 case ISD::ATOMIC_SWAP:
34810 case ISD::ATOMIC_LOAD_ADD:
34811 case ISD::ATOMIC_LOAD_SUB:
34812 case ISD::ATOMIC_LOAD_AND:
34813 case ISD::ATOMIC_LOAD_OR:
34814 case ISD::ATOMIC_LOAD_XOR:
34815 case ISD::ATOMIC_LOAD_NAND:
34816 case ISD::ATOMIC_LOAD_MIN:
34817 case ISD::ATOMIC_LOAD_MAX:
34818 case ISD::ATOMIC_LOAD_UMIN:
34819 case ISD::ATOMIC_LOAD_UMAX:
34820 // Delegate to generic TypeLegalization. Situations we can really handle
34821 // should have already been dealt with by AtomicExpandPass.cpp.
34822 break;
34823
34824 case ISD::BITCAST: {
34825 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34826 EVT DstVT = N->getValueType(0);
34827 EVT SrcVT = N->getOperand(0).getValueType();
34828
34829 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34830 // we can split using the k-register rather than memory.
34831 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34832 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34833 SDValue Lo, Hi;
34834 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34835 Lo = DAG.getBitcast(MVT::i32, Lo);
34836 Hi = DAG.getBitcast(MVT::i32, Hi);
34837 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34838 Results.push_back(Res);
34839 return;
34840 }
34841
34842 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34843 // FIXME: Use v4f32 for SSE1?
34844 assert(Subtarget.hasSSE2() && "Requires SSE2");
34845 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34846 "Unexpected type action!");
34847 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34848 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34849 N->getOperand(0));
34850 Res = DAG.getBitcast(WideVT, Res);
34851 Results.push_back(Res);
34852 return;
34853 }
34854
34855 return;
34856 }
34857 case ISD::MGATHER: {
34858 EVT VT = N->getValueType(0);
34859 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34860 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34861 auto *Gather = cast<MaskedGatherSDNode>(N);
34862 SDValue Index = Gather->getIndex();
34863 if (Index.getValueType() != MVT::v2i64)
34864 return;
34866 "Unexpected type action!");
34867 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34868 SDValue Mask = Gather->getMask();
34869 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34870 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34871 Gather->getPassThru(),
34872 DAG.getUNDEF(VT));
34873 if (!Subtarget.hasVLX()) {
34874 // We need to widen the mask, but the instruction will only use 2
34875 // of its elements. So we can use undef.
34876 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34877 DAG.getUNDEF(MVT::v2i1));
34878 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34879 }
34880 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34881 Gather->getBasePtr(), Index, Gather->getScale() };
34882 SDValue Res = DAG.getMemIntrinsicNode(
34883 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34884 Gather->getMemoryVT(), Gather->getMemOperand());
34885 Results.push_back(Res);
34886 Results.push_back(Res.getValue(1));
34887 return;
34888 }
34889 return;
34890 }
34891 case ISD::LOAD: {
34892 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34893 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
34894 // cast since type legalization will try to use an i64 load.
34895 MVT VT = N->getSimpleValueType(0);
34896 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34898 "Unexpected type action!");
34899 if (!ISD::isNON_EXTLoad(N))
34900 return;
34901 auto *Ld = cast<LoadSDNode>(N);
34902 if (Subtarget.hasSSE2()) {
34903 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34904 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34905 Ld->getPointerInfo(), Ld->getBaseAlign(),
34906 Ld->getMemOperand()->getFlags());
34907 SDValue Chain = Res.getValue(1);
34908 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34909 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34910 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34911 Res = DAG.getBitcast(WideVT, Res);
34912 Results.push_back(Res);
34913 Results.push_back(Chain);
34914 return;
34915 }
34916 assert(Subtarget.hasSSE1() && "Expected SSE");
34917 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34918 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34919 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34920 MVT::i64, Ld->getMemOperand());
34921 Results.push_back(Res);
34922 Results.push_back(Res.getValue(1));
34923 return;
34924 }
34925 case ISD::ADDRSPACECAST: {
34926 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34927 Results.push_back(V);
34928 return;
34929 }
34930 case ISD::BITREVERSE: {
34931 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34932 assert((Subtarget.hasXOP() || Subtarget.hasGFNI()) && "Expected XOP/GFNI");
34933 // We can use VPPERM/GF2P8AFFINEQB by copying to a vector register and back.
34934 // We'll need to move the scalar in two i32 pieces.
34935 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34936 return;
34937 }
34938 case ISD::EXTRACT_VECTOR_ELT: {
34939 // f16 = extract vXf16 %vec, i64 %idx
34940 assert(N->getSimpleValueType(0) == MVT::f16 &&
34941 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34942 assert(Subtarget.hasFP16() && "Expected FP16");
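// There is no scalar f16 extract, so reinterpret the vector with i16
// elements, extract an i16, and bitcast the scalar back to f16.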
34943 SDValue VecOp = N->getOperand(0);
34944 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
34945 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34946 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34947 N->getOperand(1));
34948 Split = DAG.getBitcast(MVT::f16, Split);
34949 Results.push_back(Split);
34950 return;
34951 }
34952 }
34953}
34954
34955const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
34956 switch ((X86ISD::NodeType)Opcode) {
34957 case X86ISD::FIRST_NUMBER: break;
34958#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
34959 NODE_NAME_CASE(BSF)
34960 NODE_NAME_CASE(BSR)
34961 NODE_NAME_CASE(FSHL)
34962 NODE_NAME_CASE(FSHR)
34963 NODE_NAME_CASE(FAND)
34964 NODE_NAME_CASE(FANDN)
34965 NODE_NAME_CASE(FOR)
34966 NODE_NAME_CASE(FXOR)
34967 NODE_NAME_CASE(FILD)
34968 NODE_NAME_CASE(FIST)
34969 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
34970 NODE_NAME_CASE(FLD)
34971 NODE_NAME_CASE(FST)
34972 NODE_NAME_CASE(CALL)
34973 NODE_NAME_CASE(CALL_RVMARKER)
34974 NODE_NAME_CASE(IMP_CALL)
34975 NODE_NAME_CASE(BT)
34976 NODE_NAME_CASE(CMP)
34977 NODE_NAME_CASE(FCMP)
34978 NODE_NAME_CASE(STRICT_FCMP)
34979 NODE_NAME_CASE(STRICT_FCMPS)
34980 NODE_NAME_CASE(COMI)
34981 NODE_NAME_CASE(UCOMI)
34982 NODE_NAME_CASE(COMX)
34983 NODE_NAME_CASE(UCOMX)
34984 NODE_NAME_CASE(CMPM)
34985 NODE_NAME_CASE(CMPMM)
34986 NODE_NAME_CASE(STRICT_CMPM)
34987 NODE_NAME_CASE(CMPMM_SAE)
34988 NODE_NAME_CASE(SETCC)
34989 NODE_NAME_CASE(SETCC_CARRY)
34990 NODE_NAME_CASE(FSETCC)
34991 NODE_NAME_CASE(FSETCCM)
34992 NODE_NAME_CASE(FSETCCM_SAE)
34993 NODE_NAME_CASE(CMOV)
34994 NODE_NAME_CASE(BRCOND)
34995 NODE_NAME_CASE(RET_GLUE)
34996 NODE_NAME_CASE(IRET)
34997 NODE_NAME_CASE(REP_STOS)
34998 NODE_NAME_CASE(REP_MOVS)
34999 NODE_NAME_CASE(GlobalBaseReg)
35000 NODE_NAME_CASE(Wrapper)
35001 NODE_NAME_CASE(WrapperRIP)
35002 NODE_NAME_CASE(MOVQ2DQ)
35003 NODE_NAME_CASE(MOVDQ2Q)
35004 NODE_NAME_CASE(MMX_MOVD2W)
35005 NODE_NAME_CASE(MMX_MOVW2D)
35006 NODE_NAME_CASE(PEXTRB)
35007 NODE_NAME_CASE(PEXTRW)
35008 NODE_NAME_CASE(INSERTPS)
35009 NODE_NAME_CASE(PINSRB)
35010 NODE_NAME_CASE(PINSRW)
35011 NODE_NAME_CASE(PSHUFB)
35012 NODE_NAME_CASE(ANDNP)
35013 NODE_NAME_CASE(BLENDI)
35014 NODE_NAME_CASE(BLENDV)
35015 NODE_NAME_CASE(HADD)
35016 NODE_NAME_CASE(HSUB)
35017 NODE_NAME_CASE(FHADD)
35018 NODE_NAME_CASE(FHSUB)
35019 NODE_NAME_CASE(CONFLICT)
35020 NODE_NAME_CASE(FMAX)
35021 NODE_NAME_CASE(FMAXS)
35022 NODE_NAME_CASE(FMAX_SAE)
35023 NODE_NAME_CASE(FMAXS_SAE)
35024 NODE_NAME_CASE(STRICT_FMAX)
35025 NODE_NAME_CASE(FMIN)
35026 NODE_NAME_CASE(FMINS)
35027 NODE_NAME_CASE(FMIN_SAE)
35028 NODE_NAME_CASE(FMINS_SAE)
35029 NODE_NAME_CASE(STRICT_FMIN)
35030 NODE_NAME_CASE(FMAXC)
35031 NODE_NAME_CASE(FMINC)
35032 NODE_NAME_CASE(FRSQRT)
35033 NODE_NAME_CASE(FRCP)
35034 NODE_NAME_CASE(EXTRQI)
35035 NODE_NAME_CASE(INSERTQI)
35036 NODE_NAME_CASE(TLSADDR)
35037 NODE_NAME_CASE(TLSBASEADDR)
35038 NODE_NAME_CASE(TLSCALL)
35039 NODE_NAME_CASE(TLSDESC)
35040 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35041 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35042 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35043 NODE_NAME_CASE(EH_RETURN)
35044 NODE_NAME_CASE(TC_RETURN)
35045 NODE_NAME_CASE(FNSTCW16m)
35046 NODE_NAME_CASE(FLDCW16m)
35047 NODE_NAME_CASE(FNSTENVm)
35048 NODE_NAME_CASE(FLDENVm)
35049 NODE_NAME_CASE(LCMPXCHG_DAG)
35050 NODE_NAME_CASE(LCMPXCHG8_DAG)
35051 NODE_NAME_CASE(LCMPXCHG16_DAG)
35052 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35053 NODE_NAME_CASE(LADD)
35054 NODE_NAME_CASE(LSUB)
35055 NODE_NAME_CASE(LOR)
35056 NODE_NAME_CASE(LXOR)
35057 NODE_NAME_CASE(LAND)
35058 NODE_NAME_CASE(LBTS)
35059 NODE_NAME_CASE(LBTC)
35060 NODE_NAME_CASE(LBTR)
35061 NODE_NAME_CASE(LBTS_RM)
35062 NODE_NAME_CASE(LBTC_RM)
35063 NODE_NAME_CASE(LBTR_RM)
35064 NODE_NAME_CASE(AADD)
35065 NODE_NAME_CASE(AOR)
35066 NODE_NAME_CASE(AXOR)
35067 NODE_NAME_CASE(AAND)
35068 NODE_NAME_CASE(VZEXT_MOVL)
35069 NODE_NAME_CASE(VZEXT_LOAD)
35070 NODE_NAME_CASE(VEXTRACT_STORE)
35071 NODE_NAME_CASE(VTRUNC)
35072 NODE_NAME_CASE(VTRUNCS)
35073 NODE_NAME_CASE(VTRUNCUS)
35074 NODE_NAME_CASE(VMTRUNC)
35075 NODE_NAME_CASE(VMTRUNCS)
35076 NODE_NAME_CASE(VMTRUNCUS)
35077 NODE_NAME_CASE(VTRUNCSTORES)
35078 NODE_NAME_CASE(VTRUNCSTOREUS)
35079 NODE_NAME_CASE(VMTRUNCSTORES)
35080 NODE_NAME_CASE(VMTRUNCSTOREUS)
35081 NODE_NAME_CASE(VFPEXT)
35082 NODE_NAME_CASE(STRICT_VFPEXT)
35083 NODE_NAME_CASE(VFPEXT_SAE)
35084 NODE_NAME_CASE(VFPEXTS)
35085 NODE_NAME_CASE(VFPEXTS_SAE)
35086 NODE_NAME_CASE(VFPROUND)
35087 NODE_NAME_CASE(VFPROUND2)
35088 NODE_NAME_CASE(VFPROUND2_RND)
35089 NODE_NAME_CASE(STRICT_VFPROUND)
35090 NODE_NAME_CASE(VMFPROUND)
35091 NODE_NAME_CASE(VFPROUND_RND)
35092 NODE_NAME_CASE(VFPROUNDS)
35093 NODE_NAME_CASE(VFPROUNDS_RND)
35094 NODE_NAME_CASE(VSHLDQ)
35095 NODE_NAME_CASE(VSRLDQ)
35096 NODE_NAME_CASE(VSHL)
35097 NODE_NAME_CASE(VSRL)
35098 NODE_NAME_CASE(VSRA)
35099 NODE_NAME_CASE(VSHLI)
35100 NODE_NAME_CASE(VSRLI)
35101 NODE_NAME_CASE(VSRAI)
35102 NODE_NAME_CASE(VSHLV)
35103 NODE_NAME_CASE(VSRLV)
35104 NODE_NAME_CASE(VSRAV)
35105 NODE_NAME_CASE(VROTLI)
35106 NODE_NAME_CASE(VROTRI)
35107 NODE_NAME_CASE(VPPERM)
35108 NODE_NAME_CASE(CMPP)
35109 NODE_NAME_CASE(STRICT_CMPP)
35110 NODE_NAME_CASE(PCMPEQ)
35111 NODE_NAME_CASE(PCMPGT)
35112 NODE_NAME_CASE(PHMINPOS)
35113 NODE_NAME_CASE(ADD)
35114 NODE_NAME_CASE(SUB)
35115 NODE_NAME_CASE(ADC)
35116 NODE_NAME_CASE(SBB)
35117 NODE_NAME_CASE(SMUL)
35118 NODE_NAME_CASE(UMUL)
35119 NODE_NAME_CASE(OR)
35120 NODE_NAME_CASE(XOR)
35121 NODE_NAME_CASE(AND)
35122 NODE_NAME_CASE(BEXTR)
35123 NODE_NAME_CASE(BEXTRI)
35124 NODE_NAME_CASE(BZHI)
35125 NODE_NAME_CASE(PDEP)
35126 NODE_NAME_CASE(PEXT)
35127 NODE_NAME_CASE(MUL_IMM)
35128 NODE_NAME_CASE(MOVMSK)
35129 NODE_NAME_CASE(PTEST)
35130 NODE_NAME_CASE(TESTP)
35131 NODE_NAME_CASE(KORTEST)
35132 NODE_NAME_CASE(KTEST)
35133 NODE_NAME_CASE(KADD)
35134 NODE_NAME_CASE(KSHIFTL)
35135 NODE_NAME_CASE(KSHIFTR)
35136 NODE_NAME_CASE(PACKSS)
35137 NODE_NAME_CASE(PACKUS)
35138 NODE_NAME_CASE(PALIGNR)
35139 NODE_NAME_CASE(VALIGN)
35140 NODE_NAME_CASE(VSHLD)
35141 NODE_NAME_CASE(VSHRD)
35142 NODE_NAME_CASE(PSHUFD)
35143 NODE_NAME_CASE(PSHUFHW)
35144 NODE_NAME_CASE(PSHUFLW)
35145 NODE_NAME_CASE(SHUFP)
35146 NODE_NAME_CASE(SHUF128)
35147 NODE_NAME_CASE(MOVLHPS)
35148 NODE_NAME_CASE(MOVHLPS)
35149 NODE_NAME_CASE(MOVDDUP)
35150 NODE_NAME_CASE(MOVSHDUP)
35151 NODE_NAME_CASE(MOVSLDUP)
35152 NODE_NAME_CASE(MOVSD)
35153 NODE_NAME_CASE(MOVSS)
35154 NODE_NAME_CASE(MOVSH)
35155 NODE_NAME_CASE(UNPCKL)
35156 NODE_NAME_CASE(UNPCKH)
35157 NODE_NAME_CASE(VBROADCAST)
35158 NODE_NAME_CASE(VBROADCAST_LOAD)
35159 NODE_NAME_CASE(VBROADCASTM)
35160 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35161 NODE_NAME_CASE(VPERMILPV)
35162 NODE_NAME_CASE(VPERMILPI)
35163 NODE_NAME_CASE(VPERM2X128)
35164 NODE_NAME_CASE(VPERMV)
35165 NODE_NAME_CASE(VPERMV3)
35166 NODE_NAME_CASE(VPERMI)
35167 NODE_NAME_CASE(VPTERNLOG)
35168 NODE_NAME_CASE(FP_TO_SINT_SAT)
35169 NODE_NAME_CASE(FP_TO_UINT_SAT)
35170 NODE_NAME_CASE(VFIXUPIMM)
35171 NODE_NAME_CASE(VFIXUPIMM_SAE)
35172 NODE_NAME_CASE(VFIXUPIMMS)
35173 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35174 NODE_NAME_CASE(VRANGE)
35175 NODE_NAME_CASE(VRANGE_SAE)
35176 NODE_NAME_CASE(VRANGES)
35177 NODE_NAME_CASE(VRANGES_SAE)
35178 NODE_NAME_CASE(PMULUDQ)
35179 NODE_NAME_CASE(PMULDQ)
35180 NODE_NAME_CASE(PSADBW)
35181 NODE_NAME_CASE(DBPSADBW)
35182 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35183 NODE_NAME_CASE(VAARG_64)
35184 NODE_NAME_CASE(VAARG_X32)
35185 NODE_NAME_CASE(DYN_ALLOCA)
35186 NODE_NAME_CASE(MFENCE)
35187 NODE_NAME_CASE(SEG_ALLOCA)
35188 NODE_NAME_CASE(PROBED_ALLOCA)
35189 NODE_NAME_CASE(RDRAND)
35190 NODE_NAME_CASE(RDSEED)
35191 NODE_NAME_CASE(RDPKRU)
35192 NODE_NAME_CASE(WRPKRU)
35193 NODE_NAME_CASE(VPMADDUBSW)
35194 NODE_NAME_CASE(VPMADDWD)
35195 NODE_NAME_CASE(VPSHA)
35196 NODE_NAME_CASE(VPSHL)
35197 NODE_NAME_CASE(VPCOM)
35198 NODE_NAME_CASE(VPCOMU)
35199 NODE_NAME_CASE(VPERMIL2)
35200 NODE_NAME_CASE(FMSUB)
35201 NODE_NAME_CASE(STRICT_FMSUB)
35202 NODE_NAME_CASE(FNMADD)
35203 NODE_NAME_CASE(STRICT_FNMADD)
35204 NODE_NAME_CASE(FNMSUB)
35205 NODE_NAME_CASE(STRICT_FNMSUB)
35206 NODE_NAME_CASE(FMADDSUB)
35207 NODE_NAME_CASE(FMSUBADD)
35208 NODE_NAME_CASE(FMADD_RND)
35209 NODE_NAME_CASE(FNMADD_RND)
35210 NODE_NAME_CASE(FMSUB_RND)
35211 NODE_NAME_CASE(FNMSUB_RND)
35212 NODE_NAME_CASE(FMADDSUB_RND)
35213 NODE_NAME_CASE(FMSUBADD_RND)
35214 NODE_NAME_CASE(VFMADDC)
35215 NODE_NAME_CASE(VFMADDC_RND)
35216 NODE_NAME_CASE(VFCMADDC)
35217 NODE_NAME_CASE(VFCMADDC_RND)
35218 NODE_NAME_CASE(VFMULC)
35219 NODE_NAME_CASE(VFMULC_RND)
35220 NODE_NAME_CASE(VFCMULC)
35221 NODE_NAME_CASE(VFCMULC_RND)
35222 NODE_NAME_CASE(VFMULCSH)
35223 NODE_NAME_CASE(VFMULCSH_RND)
35224 NODE_NAME_CASE(VFCMULCSH)
35225 NODE_NAME_CASE(VFCMULCSH_RND)
35226 NODE_NAME_CASE(VFMADDCSH)
35227 NODE_NAME_CASE(VFMADDCSH_RND)
35228 NODE_NAME_CASE(VFCMADDCSH)
35229 NODE_NAME_CASE(VFCMADDCSH_RND)
35230 NODE_NAME_CASE(VPMADD52H)
35231 NODE_NAME_CASE(VPMADD52L)
35232 NODE_NAME_CASE(VRNDSCALE)
35233 NODE_NAME_CASE(STRICT_VRNDSCALE)
35234 NODE_NAME_CASE(VRNDSCALE_SAE)
35235 NODE_NAME_CASE(VRNDSCALES)
35236 NODE_NAME_CASE(VRNDSCALES_SAE)
35237 NODE_NAME_CASE(VREDUCE)
35238 NODE_NAME_CASE(VREDUCE_SAE)
35239 NODE_NAME_CASE(VREDUCES)
35240 NODE_NAME_CASE(VREDUCES_SAE)
35241 NODE_NAME_CASE(VGETMANT)
35242 NODE_NAME_CASE(VGETMANT_SAE)
35243 NODE_NAME_CASE(VGETMANTS)
35244 NODE_NAME_CASE(VGETMANTS_SAE)
35245 NODE_NAME_CASE(PCMPESTR)
35246 NODE_NAME_CASE(PCMPISTR)
35247 NODE_NAME_CASE(XTEST)
35248 NODE_NAME_CASE(COMPRESS)
35249 NODE_NAME_CASE(EXPAND)
35250 NODE_NAME_CASE(SELECTS)
35251 NODE_NAME_CASE(ADDSUB)
35252 NODE_NAME_CASE(RCP14)
35253 NODE_NAME_CASE(RCP14S)
35254 NODE_NAME_CASE(RSQRT14)
35255 NODE_NAME_CASE(RSQRT14S)
35256 NODE_NAME_CASE(FADD_RND)
35257 NODE_NAME_CASE(FADDS)
35258 NODE_NAME_CASE(FADDS_RND)
35259 NODE_NAME_CASE(FSUB_RND)
35260 NODE_NAME_CASE(FSUBS)
35261 NODE_NAME_CASE(FSUBS_RND)
35262 NODE_NAME_CASE(FMUL_RND)
35263 NODE_NAME_CASE(FMULS)
35264 NODE_NAME_CASE(FMULS_RND)
35265 NODE_NAME_CASE(FDIV_RND)
35266 NODE_NAME_CASE(FDIVS)
35267 NODE_NAME_CASE(FDIVS_RND)
35268 NODE_NAME_CASE(FSQRT_RND)
35269 NODE_NAME_CASE(FSQRTS)
35270 NODE_NAME_CASE(FSQRTS_RND)
35271 NODE_NAME_CASE(FGETEXP)
35272 NODE_NAME_CASE(FGETEXP_SAE)
35273 NODE_NAME_CASE(FGETEXPS)
35274 NODE_NAME_CASE(FGETEXPS_SAE)
35275 NODE_NAME_CASE(SCALEF)
35276 NODE_NAME_CASE(SCALEF_RND)
35277 NODE_NAME_CASE(SCALEFS)
35278 NODE_NAME_CASE(SCALEFS_RND)
35279 NODE_NAME_CASE(MULHRS)
35280 NODE_NAME_CASE(SINT_TO_FP_RND)
35281 NODE_NAME_CASE(UINT_TO_FP_RND)
35282 NODE_NAME_CASE(CVTTP2SI)
35283 NODE_NAME_CASE(CVTTP2UI)
35284 NODE_NAME_CASE(STRICT_CVTTP2SI)
35285 NODE_NAME_CASE(STRICT_CVTTP2UI)
35286 NODE_NAME_CASE(MCVTTP2SI)
35287 NODE_NAME_CASE(MCVTTP2UI)
35288 NODE_NAME_CASE(CVTTP2SI_SAE)
35289 NODE_NAME_CASE(CVTTP2UI_SAE)
35290 NODE_NAME_CASE(CVTTS2SI)
35291 NODE_NAME_CASE(CVTTS2UI)
35292 NODE_NAME_CASE(CVTTS2SI_SAE)
35293 NODE_NAME_CASE(CVTTS2UI_SAE)
35294 NODE_NAME_CASE(CVTSI2P)
35295 NODE_NAME_CASE(CVTUI2P)
35296 NODE_NAME_CASE(STRICT_CVTSI2P)
35297 NODE_NAME_CASE(STRICT_CVTUI2P)
35298 NODE_NAME_CASE(MCVTSI2P)
35299 NODE_NAME_CASE(MCVTUI2P)
35300 NODE_NAME_CASE(VFPCLASS)
35301 NODE_NAME_CASE(VFPCLASSS)
35302 NODE_NAME_CASE(MULTISHIFT)
35303 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35304 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35305 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35306 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35307 NODE_NAME_CASE(CVTPS2PH)
35308 NODE_NAME_CASE(STRICT_CVTPS2PH)
35309 NODE_NAME_CASE(CVTPS2PH_SAE)
35310 NODE_NAME_CASE(MCVTPS2PH)
35311 NODE_NAME_CASE(MCVTPS2PH_SAE)
35312 NODE_NAME_CASE(CVTPH2PS)
35313 NODE_NAME_CASE(STRICT_CVTPH2PS)
35314 NODE_NAME_CASE(CVTPH2PS_SAE)
35315 NODE_NAME_CASE(CVTP2SI)
35316 NODE_NAME_CASE(CVTP2UI)
35317 NODE_NAME_CASE(MCVTP2SI)
35318 NODE_NAME_CASE(MCVTP2UI)
35319 NODE_NAME_CASE(CVTP2SI_RND)
35320 NODE_NAME_CASE(CVTP2UI_RND)
35321 NODE_NAME_CASE(CVTS2SI)
35322 NODE_NAME_CASE(CVTS2UI)
35323 NODE_NAME_CASE(CVTS2SI_RND)
35324 NODE_NAME_CASE(CVTS2UI_RND)
35325 NODE_NAME_CASE(CVTNEPS2BF16)
35326 NODE_NAME_CASE(MCVTNEPS2BF16)
35327 NODE_NAME_CASE(DPBF16PS)
35328 NODE_NAME_CASE(DPFP16PS)
35329 NODE_NAME_CASE(MPSADBW)
35330 NODE_NAME_CASE(LWPINS)
35331 NODE_NAME_CASE(MGATHER)
35332 NODE_NAME_CASE(MSCATTER)
35333 NODE_NAME_CASE(VPDPBUSD)
35334 NODE_NAME_CASE(VPDPBUSDS)
35335 NODE_NAME_CASE(VPDPWSSD)
35336 NODE_NAME_CASE(VPDPWSSDS)
35337 NODE_NAME_CASE(VPSHUFBITQMB)
35338 NODE_NAME_CASE(GF2P8MULB)
35339 NODE_NAME_CASE(GF2P8AFFINEQB)
35340 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35341 NODE_NAME_CASE(NT_CALL)
35342 NODE_NAME_CASE(NT_BRIND)
35343 NODE_NAME_CASE(UMWAIT)
35344 NODE_NAME_CASE(TPAUSE)
35345 NODE_NAME_CASE(ENQCMD)
35346 NODE_NAME_CASE(ENQCMDS)
35347 NODE_NAME_CASE(VP2INTERSECT)
35348 NODE_NAME_CASE(VPDPBSUD)
35349 NODE_NAME_CASE(VPDPBSUDS)
35350 NODE_NAME_CASE(VPDPBUUD)
35351 NODE_NAME_CASE(VPDPBUUDS)
35352 NODE_NAME_CASE(VPDPBSSD)
35353 NODE_NAME_CASE(VPDPBSSDS)
35354 NODE_NAME_CASE(VPDPWSUD)
35355 NODE_NAME_CASE(VPDPWSUDS)
35356 NODE_NAME_CASE(VPDPWUSD)
35357 NODE_NAME_CASE(VPDPWUSDS)
35358 NODE_NAME_CASE(VPDPWUUD)
35359 NODE_NAME_CASE(VPDPWUUDS)
35360 NODE_NAME_CASE(VMINMAX)
35361 NODE_NAME_CASE(VMINMAX_SAE)
35362 NODE_NAME_CASE(VMINMAXS)
35363 NODE_NAME_CASE(VMINMAXS_SAE)
35364 NODE_NAME_CASE(CVTP2IBS)
35365 NODE_NAME_CASE(CVTP2IUBS)
35366 NODE_NAME_CASE(CVTP2IBS_RND)
35367 NODE_NAME_CASE(CVTP2IUBS_RND)
35368 NODE_NAME_CASE(CVTTP2IBS)
35369 NODE_NAME_CASE(CVTTP2IUBS)
35370 NODE_NAME_CASE(CVTTP2IBS_SAE)
35371 NODE_NAME_CASE(CVTTP2IUBS_SAE)
35372 NODE_NAME_CASE(VCVT2PH2BF8)
35373 NODE_NAME_CASE(VCVT2PH2BF8S)
35374 NODE_NAME_CASE(VCVT2PH2HF8)
35375 NODE_NAME_CASE(VCVT2PH2HF8S)
35376 NODE_NAME_CASE(VCVTBIASPH2BF8)
35377 NODE_NAME_CASE(VCVTBIASPH2BF8S)
35378 NODE_NAME_CASE(VCVTBIASPH2HF8)
35379 NODE_NAME_CASE(VCVTBIASPH2HF8S)
35380 NODE_NAME_CASE(VCVTPH2BF8)
35381 NODE_NAME_CASE(VCVTPH2BF8S)
35382 NODE_NAME_CASE(VCVTPH2HF8)
35383 NODE_NAME_CASE(VCVTPH2HF8S)
35384 NODE_NAME_CASE(VMCVTBIASPH2BF8)
35385 NODE_NAME_CASE(VMCVTBIASPH2BF8S)
35386 NODE_NAME_CASE(VMCVTBIASPH2HF8)
35387 NODE_NAME_CASE(VMCVTBIASPH2HF8S)
35388 NODE_NAME_CASE(VMCVTPH2BF8)
35389 NODE_NAME_CASE(VMCVTPH2BF8S)
35390 NODE_NAME_CASE(VMCVTPH2HF8)
35391 NODE_NAME_CASE(VMCVTPH2HF8S)
35392 NODE_NAME_CASE(VCVTHF82PH)
35393 NODE_NAME_CASE(AESENC128KL)
35394 NODE_NAME_CASE(AESDEC128KL)
35395 NODE_NAME_CASE(AESENC256KL)
35396 NODE_NAME_CASE(AESDEC256KL)
35397 NODE_NAME_CASE(AESENCWIDE128KL)
35398 NODE_NAME_CASE(AESDECWIDE128KL)
35399 NODE_NAME_CASE(AESENCWIDE256KL)
35400 NODE_NAME_CASE(AESDECWIDE256KL)
35401 NODE_NAME_CASE(CMPCCXADD)
35402 NODE_NAME_CASE(TESTUI)
35403 NODE_NAME_CASE(FP80_ADD)
35404 NODE_NAME_CASE(STRICT_FP80_ADD)
35405 NODE_NAME_CASE(CCMP)
35406 NODE_NAME_CASE(CTEST)
35407 NODE_NAME_CASE(CLOAD)
35408 NODE_NAME_CASE(CSTORE)
35409 NODE_NAME_CASE(CVTTS2SIS)
35410 NODE_NAME_CASE(CVTTS2UIS)
35411 NODE_NAME_CASE(CVTTS2SIS_SAE)
35412 NODE_NAME_CASE(CVTTS2UIS_SAE)
35413 NODE_NAME_CASE(CVTTP2SIS)
35414 NODE_NAME_CASE(MCVTTP2SIS)
35415 NODE_NAME_CASE(CVTTP2UIS_SAE)
35416 NODE_NAME_CASE(CVTTP2SIS_SAE)
35417 NODE_NAME_CASE(CVTTP2UIS)
35418 NODE_NAME_CASE(MCVTTP2UIS)
35419 NODE_NAME_CASE(POP_FROM_X87_REG)
35420 }
35421 return nullptr;
35422#undef NODE_NAME_CASE
35423}
35424
35425/// Return true if the addressing mode represented by AM is legal for this
35426/// target, for a load/store of the specified type.
35427 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35428 const AddrMode &AM, Type *Ty,
35429 unsigned AS,
35430 Instruction *I) const {
35431 // X86 supports extremely general addressing modes.
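// The general form is base + index*scale + disp, where scale is 1, 2, 4 or 8
// and disp is a sign-extended 32-bit immediate.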
35432 CodeModel::Model M = getTargetMachine().getCodeModel();
35433
35434 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35435 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35436 return false;
35437
35438 if (AM.BaseGV) {
35439 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35440
35441 // If a reference to this global requires an extra load, we can't fold it.
35442 if (isGlobalStubReference(GVFlags))
35443 return false;
35444
35445 // If BaseGV requires a register for the PIC base, we cannot also have a
35446 // BaseReg specified.
35447 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35448 return false;
35449
35450 // If lower 4G is not available, then we must use rip-relative addressing.
35451 if ((M != CodeModel::Small || isPositionIndependent()) &&
35452 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35453 return false;
35454 }
35455
35456 switch (AM.Scale) {
35457 case 0:
35458 case 1:
35459 case 2:
35460 case 4:
35461 case 8:
35462 // These scales always work.
35463 break;
35464 case 3:
35465 case 5:
35466 case 9:
35467 // These scales are formed with basereg+scalereg. Only accept if there is
35468 // no basereg yet.
35469 if (AM.HasBaseReg)
35470 return false;
35471 break;
35472 default: // Other stuff never works.
35473 return false;
35474 }
35475
35476 return true;
35477}
35478
35479bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35480 switch (Opcode) {
35481 // These are non-commutative binops.
35482 // TODO: Add more X86ISD opcodes once we have test coverage.
35483 case X86ISD::ANDNP:
35484 case X86ISD::PCMPGT:
35485 case X86ISD::FMAX:
35486 case X86ISD::FMIN:
35487 case X86ISD::FANDN:
35488 case X86ISD::VPSHA:
35489 case X86ISD::VPSHL:
35490 case X86ISD::VSHLV:
35491 case X86ISD::VSRLV:
35492 case X86ISD::VSRAV:
35493 return true;
35494 }
35495
35496 return TargetLoweringBase::isBinOp(Opcode);
35497}
35498
35499bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35500 switch (Opcode) {
35501 // TODO: Add more X86ISD opcodes once we have test coverage.
35502 case X86ISD::PCMPEQ:
35503 case X86ISD::PMULDQ:
35504 case X86ISD::PMULUDQ:
35505 case X86ISD::FMAXC:
35506 case X86ISD::FMINC:
35507 case X86ISD::FAND:
35508 case X86ISD::FOR:
35509 case X86ISD::FXOR:
35510 return true;
35511 }
35512
35514}
35515
35516 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35517 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35518 return false;
35519 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35520 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35521 return NumBits1 > NumBits2;
35522}
35523
35524 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35525 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35526 return false;
35527
35528 if (!isTypeLegal(EVT::getEVT(Ty1)))
35529 return false;
35530
35531 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35532
35533 // Assuming the caller doesn't have a zeroext or signext return parameter,
35534 // truncation all the way down to i1 is valid.
35535 return true;
35536}
35537
35538 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35539 return isInt<32>(Imm);
35540}
35541
35542 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35543 // Can also use sub to handle negated immediates.
35544 return isInt<32>(Imm);
35545}
35546
35547 bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35548 return isInt<32>(Imm);
35549}
35550
35551 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35552 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35553 return false;
35554 unsigned NumBits1 = VT1.getSizeInBits();
35555 unsigned NumBits2 = VT2.getSizeInBits();
35556 return NumBits1 > NumBits2;
35557}
35558
35559 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35560 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35561 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35562}
35563
35564 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35565 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35566 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35567}
35568
35569 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35570 EVT VT1 = Val.getValueType();
35571 if (isZExtFree(VT1, VT2))
35572 return true;
35573
35574 if (Val.getOpcode() != ISD::LOAD)
35575 return false;
35576
35577 if (!VT1.isSimple() || !VT1.isInteger() ||
35578 !VT2.isSimple() || !VT2.isInteger())
35579 return false;
35580
35581 switch (VT1.getSimpleVT().SimpleTy) {
35582 default: break;
35583 case MVT::i8:
35584 case MVT::i16:
35585 case MVT::i32:
35586 // X86 has 8, 16, and 32-bit zero-extending loads.
35587 return true;
35588 }
35589
35590 return false;
35591}
35592
35593 bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35594 if (!Subtarget.is64Bit())
35595 return false;
35596 return TargetLowering::shouldConvertPhiType(From, To);
35597}
35598
35599 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35600 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35601 return false;
35602
35603 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35604
35605 // There is no extending load for vXi1.
35606 if (SrcVT.getScalarType() == MVT::i1)
35607 return false;
35608
35609 return true;
35610}
35611
35612 bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35613 EVT VT) const {
35614 if (Subtarget.useSoftFloat())
35615 return false;
35616
35617 if (!Subtarget.hasAnyFMA())
35618 return false;
35619
35620 VT = VT.getScalarType();
35621
35622 if (!VT.isSimple())
35623 return false;
35624
35625 switch (VT.getSimpleVT().SimpleTy) {
35626 case MVT::f16:
35627 return Subtarget.hasFP16();
35628 case MVT::f32:
35629 case MVT::f64:
35630 return true;
35631 default:
35632 break;
35633 }
35634
35635 return false;
35636}
35637
35638 bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
35639 EVT DestVT) const {
35640 // i16 instructions are longer (0x66 prefix) and potentially slower.
35641 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35642}
35643
35644 bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(
35645 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
35646 SDValue Y) const {
35647 if (SelectOpcode == ISD::SELECT) {
35648 if (VT.isVector())
35649 return false;
35650 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
35651 return false;
35652 using namespace llvm::SDPatternMatch;
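// BMI1 has single-instruction forms of these patterns and sets flags itself:
// BLSI is x & -x, BLSR is x & (x - 1), BLSMSK is x ^ (x - 1). Folding the
// select keeps the pattern intact for instruction selection.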
35653 // BLSI
35654 if (BinOpcode == ISD::AND && (sd_match(Y, m_Neg(m_Specific(X))) ||
35655 sd_match(X, m_Neg(m_Specific(Y)))))
35656 return true;
35657 // BLSR
35658 if (BinOpcode == ISD::AND &&
35659 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35660 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35661 return true;
35662 // BLSMSK
35663 if (BinOpcode == ISD::XOR &&
35664 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35665 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35666 return true;
35667
35668 return false;
35669 }
35670 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35671 // benefit. The transform may also be profitable for scalar code.
35672 if (!Subtarget.hasAVX512())
35673 return false;
35674 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35675 return false;
35676 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35677 return false;
35678
35679 return true;
35680}
35681
35682/// Targets can use this to indicate that they only support *some*
35683/// VECTOR_SHUFFLE operations, those with specific masks.
35684/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35685/// are assumed to be legal.
35686 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35687 if (!VT.isSimple())
35688 return false;
35689
35690 // Not for i1 vectors
35691 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35692 return false;
35693
35694 // Very little shuffling can be done for 64-bit vectors right now.
35695 if (VT.getSimpleVT().getSizeInBits() == 64)
35696 return false;
35697
35698 // We only care that the types being shuffled are legal. The lowering can
35699 // handle any possible shuffle mask that results.
35700 return isTypeLegal(VT.getSimpleVT());
35701}
35702
35703 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35704 EVT VT) const {
35705 // Don't convert an 'and' into a shuffle that we don't directly support.
35706 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35707 if (!Subtarget.hasAVX2())
35708 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35709 return false;
35710
35711 // Just delegate to the generic legality, clear masks aren't special.
35712 return isShuffleMaskLegal(Mask, VT);
35713}
35714
35715 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35716 // If the subtarget is using thunks, we must not generate jump tables.
35717 if (Subtarget.useIndirectThunkBranches())
35718 return false;
35719
35720 // Otherwise, fallback on the generic logic.
35722}
35723
35724 MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35725 EVT ConditionVT) const {
35726 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35727 // zero-extensions.
35728 if (ConditionVT.getSizeInBits() < 32)
35729 return MVT::i32;
35730 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35731 ConditionVT);
35732}
35733
35734//===----------------------------------------------------------------------===//
35735// X86 Scheduler Hooks
35736//===----------------------------------------------------------------------===//
35737
35738/// Utility function to emit xbegin specifying the start of an RTM region.
35739 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35740 const TargetInstrInfo *TII) {
35741 const MIMetadata MIMD(MI);
35742
35743 const BasicBlock *BB = MBB->getBasicBlock();
35744 MachineFunction::iterator I = ++MBB->getIterator();
35745
35746 // For the v = xbegin(), we generate
35747 //
35748 // thisMBB:
35749 // xbegin sinkMBB
35750 //
35751 // mainMBB:
35752 // s0 = -1
35753 //
35754 // fallBB:
35755 // eax = # XABORT_DEF
35756 // s1 = eax
35757 //
35758 // sinkMBB:
35759 // v = phi(s0/mainBB, s1/fallBB)
35760
35761 MachineBasicBlock *thisMBB = MBB;
35762 MachineFunction *MF = MBB->getParent();
35763 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35764 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35765 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35766 MF->insert(I, mainMBB);
35767 MF->insert(I, fallMBB);
35768 MF->insert(I, sinkMBB);
35769
35770 if (isPhysRegUsedAfter(X86::EFLAGS, MI)) {
35771 mainMBB->addLiveIn(X86::EFLAGS);
35772 fallMBB->addLiveIn(X86::EFLAGS);
35773 sinkMBB->addLiveIn(X86::EFLAGS);
35774 }
35775
35776 // Transfer the remainder of BB and its successor edges to sinkMBB.
35777 sinkMBB->splice(sinkMBB->begin(), MBB,
35778 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35779 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35780
35781 MachineRegisterInfo &MRI = MF->getRegInfo();
35782 Register DstReg = MI.getOperand(0).getReg();
35783 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35784 Register mainDstReg = MRI.createVirtualRegister(RC);
35785 Register fallDstReg = MRI.createVirtualRegister(RC);
35786
35787 // thisMBB:
35788 // xbegin fallMBB
35789 // # fallthrough to mainMBB
35790 // # abort to fallMBB
35791 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35792 thisMBB->addSuccessor(mainMBB);
35793 thisMBB->addSuccessor(fallMBB);
35794
35795 // mainMBB:
35796 // mainDstReg := -1
35797 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35798 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35799 mainMBB->addSuccessor(sinkMBB);
35800
35801 // fallMBB:
35802 // ; pseudo instruction to model hardware's definition from XABORT
35803 // EAX := XABORT_DEF
35804 // fallDstReg := EAX
35805 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
35806 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
35807 .addReg(X86::EAX);
35808 fallMBB->addSuccessor(sinkMBB);
35809
35810 // sinkMBB:
35811 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35812 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35813 .addReg(mainDstReg).addMBB(mainMBB)
35814 .addReg(fallDstReg).addMBB(fallMBB);
35815
35816 MI.eraseFromParent();
35817 return sinkMBB;
35818}
35819
35820 MachineBasicBlock *
35821 X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35822 MachineBasicBlock *MBB) const {
35823 // Emit va_arg instruction on X86-64.
35824
35825 // Operands to this pseudo-instruction:
35826 // 0 ) Output : destination address (reg)
35827 // 1-5) Input : va_list address (addr, i64mem)
35828 // 6 ) ArgSize : Size (in bytes) of vararg type
35829 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35830 // 8 ) Align : Alignment of type
35831 // 9 ) EFLAGS (implicit-def)
35832
35833 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35834 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35835
35836 Register DestReg = MI.getOperand(0).getReg();
35837 MachineOperand &Base = MI.getOperand(1);
35838 MachineOperand &Scale = MI.getOperand(2);
35839 MachineOperand &Index = MI.getOperand(3);
35840 MachineOperand &Disp = MI.getOperand(4);
35841 MachineOperand &Segment = MI.getOperand(5);
35842 unsigned ArgSize = MI.getOperand(6).getImm();
35843 unsigned ArgMode = MI.getOperand(7).getImm();
35844 Align Alignment = Align(MI.getOperand(8).getImm());
35845
35846 MachineFunction *MF = MBB->getParent();
35847
35848 // Memory Reference
35849 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35850
35851 MachineMemOperand *OldMMO = MI.memoperands().front();
35852
35853 // Clone the MMO into two separate MMOs for loading and storing
35854 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35855 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35856 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35857 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35858
35859 // Machine Information
35860 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35861 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35862 const TargetRegisterClass *AddrRegClass =
35863 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35864 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35865 const MIMetadata MIMD(MI);
35866
35867 // struct va_list {
35868 // i32 gp_offset
35869 // i32 fp_offset
35870 // i64 overflow_area (address)
35871 // i64 reg_save_area (address)
35872 // }
35873 // sizeof(va_list) = 24
35874 // alignment(va_list) = 8
35875
35876 unsigned TotalNumIntRegs = 6;
35877 unsigned TotalNumXMMRegs = 8;
35878 bool UseGPOffset = (ArgMode == 1);
35879 bool UseFPOffset = (ArgMode == 2);
35880 unsigned MaxOffset = TotalNumIntRegs * 8 +
35881 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
35882
35883 /* Align ArgSize to a multiple of 8 */
35884 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
35885 bool NeedsAlign = (Alignment > 8);
35886
35887 MachineBasicBlock *thisMBB = MBB;
35888 MachineBasicBlock *overflowMBB;
35889 MachineBasicBlock *offsetMBB;
35890 MachineBasicBlock *endMBB;
35891
35892 Register OffsetDestReg; // Argument address computed by offsetMBB
35893 Register OverflowDestReg; // Argument address computed by overflowMBB
35894 Register OffsetReg;
35895
35896 if (!UseGPOffset && !UseFPOffset) {
35897 // If we only pull from the overflow region, we don't create a branch.
35898 // We don't need to alter control flow.
35899 OffsetDestReg = Register(); // unused
35900 OverflowDestReg = DestReg;
35901
35902 offsetMBB = nullptr;
35903 overflowMBB = thisMBB;
35904 endMBB = thisMBB;
35905 } else {
35906 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35907 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35908 // If not, pull from overflow_area. (branch to overflowMBB)
35909 //
35910 // thisMBB
35911 // | .
35912 // | .
35913 // offsetMBB overflowMBB
35914 // | .
35915 // | .
35916 // endMBB
35917
35918 // Registers for the PHI in endMBB
35919 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35920 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35921
35922 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35923 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35924 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35925 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35926
35927 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35928
35929 // Insert the new basic blocks
35930 MF->insert(MBBIter, offsetMBB);
35931 MF->insert(MBBIter, overflowMBB);
35932 MF->insert(MBBIter, endMBB);
35933
35934 // Transfer the remainder of MBB and its successor edges to endMBB.
35935 endMBB->splice(endMBB->begin(), thisMBB,
35936 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35937 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35938
35939 // Make offsetMBB and overflowMBB successors of thisMBB
35940 thisMBB->addSuccessor(offsetMBB);
35941 thisMBB->addSuccessor(overflowMBB);
35942
35943 // endMBB is a successor of both offsetMBB and overflowMBB
35944 offsetMBB->addSuccessor(endMBB);
35945 overflowMBB->addSuccessor(endMBB);
35946
35947 // Load the offset value into a register
35948 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35949 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
35950 .add(Base)
35951 .add(Scale)
35952 .add(Index)
35953 .addDisp(Disp, UseFPOffset ? 4 : 0)
35954 .add(Segment)
35955 .setMemRefs(LoadOnlyMMO);
35956
35957 // Check if there is enough room left to pull this argument.
35958 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
35959 .addReg(OffsetReg)
35960 .addImm(MaxOffset + 8 - ArgSizeA8);
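  // For a typical GP argument (ArgSizeA8 == 8, MaxOffset == 48) the threshold
  // is 48, so the overflow path is taken exactly when all six GP argument
  // registers have already been consumed.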
35961
35962 // Branch to "overflowMBB" if offset >= max
35963 // Fall through to "offsetMBB" otherwise
35964 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
35965 .addMBB(overflowMBB).addImm(X86::COND_AE);
35966 }
35967
35968 // In offsetMBB, emit code to use the reg_save_area.
35969 if (offsetMBB) {
35970 assert(OffsetReg != 0);
35971
35972 // Read the reg_save_area address.
35973 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
35974 BuildMI(
35975 offsetMBB, MIMD,
35976 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35977 RegSaveReg)
35978 .add(Base)
35979 .add(Scale)
35980 .add(Index)
35981 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
35982 .add(Segment)
35983 .setMemRefs(LoadOnlyMMO);
35984
35985 if (Subtarget.isTarget64BitLP64()) {
35986 // Zero-extend the offset
35987 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
35988 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
35989 .addImm(0)
35990 .addReg(OffsetReg)
35991 .addImm(X86::sub_32bit);
35992
35993 // Add the offset to the reg_save_area to get the final address.
35994 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
35995 .addReg(OffsetReg64)
35996 .addReg(RegSaveReg);
35997 } else {
35998 // Add the offset to the reg_save_area to get the final address.
35999 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
36000 .addReg(OffsetReg)
36001 .addReg(RegSaveReg);
36002 }
36003
36004 // Compute the offset for the next argument
36005 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36006 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
36007 .addReg(OffsetReg)
36008 .addImm(UseFPOffset ? 16 : 8);
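  // Each GP register occupies 8 bytes in the reg_save_area and each XMM
  // register occupies 16 bytes, so the offset advances by that amount.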
36009
36010 // Store it back into the va_list.
36011 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
36012 .add(Base)
36013 .add(Scale)
36014 .add(Index)
36015 .addDisp(Disp, UseFPOffset ? 4 : 0)
36016 .add(Segment)
36017 .addReg(NextOffsetReg)
36018 .setMemRefs(StoreOnlyMMO);
36019
36020 // Jump to endMBB
36021 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
36022 .addMBB(endMBB);
36023 }
36024
36025 //
36026 // Emit code to use overflow area
36027 //
36028
36029 // Load the overflow_area address into a register.
36030 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36031 BuildMI(overflowMBB, MIMD,
36032 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36033 OverflowAddrReg)
36034 .add(Base)
36035 .add(Scale)
36036 .add(Index)
36037 .addDisp(Disp, 8)
36038 .add(Segment)
36039 .setMemRefs(LoadOnlyMMO);
36040
36041 // If we need to align it, do so. Otherwise, just copy the address
36042 // to OverflowDestReg.
36043 if (NeedsAlign) {
36044 // Align the overflow address
36045 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36046
36047 // aligned_addr = (addr + (align-1)) & ~(align-1)
36048 BuildMI(
36049 overflowMBB, MIMD,
36050 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36051 TmpReg)
36052 .addReg(OverflowAddrReg)
36053 .addImm(Alignment.value() - 1);
36054
36055 BuildMI(
36056 overflowMBB, MIMD,
36057 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36058 OverflowDestReg)
36059 .addReg(TmpReg)
36060 .addImm(~(uint64_t)(Alignment.value() - 1));
36061 } else {
36062 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
36063 .addReg(OverflowAddrReg);
36064 }
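  // Worked example of the align-up above: an overflow address of 0x1004 with a
  // 16-byte-aligned type becomes (0x1004 + 15) & ~15 == 0x1010.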
36065
36066 // Compute the next overflow address after this argument.
36067 // (the overflow address should be kept 8-byte aligned)
36068 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36069 BuildMI(
36070 overflowMBB, MIMD,
36071 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36072 NextAddrReg)
36073 .addReg(OverflowDestReg)
36074 .addImm(ArgSizeA8);
36075
36076 // Store the new overflow address.
36077 BuildMI(overflowMBB, MIMD,
36078 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36079 .add(Base)
36080 .add(Scale)
36081 .add(Index)
36082 .addDisp(Disp, 8)
36083 .add(Segment)
36084 .addReg(NextAddrReg)
36085 .setMemRefs(StoreOnlyMMO);
36086
36087 // If we branched, emit the PHI to the front of endMBB.
36088 if (offsetMBB) {
36089 BuildMI(*endMBB, endMBB->begin(), MIMD,
36090 TII->get(X86::PHI), DestReg)
36091 .addReg(OffsetDestReg).addMBB(offsetMBB)
36092 .addReg(OverflowDestReg).addMBB(overflowMBB);
36093 }
36094
36095 // Erase the pseudo instruction
36096 MI.eraseFromParent();
36097
36098 return endMBB;
36099}
36100
36101// The EFLAGS operand of SelectItr might be missing a kill marker
36102// because there were multiple uses of EFLAGS, and ISel didn't know
36103// which to mark. Figure out whether SelectItr should have had a
36104// kill marker, and set it if it should. Returns the correct kill
36105// marker value.
36106static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36107 MachineBasicBlock* BB,
36108 const TargetRegisterInfo* TRI) {
36109 if (isPhysRegUsedAfter(X86::EFLAGS, SelectItr))
36110 return false;
36111
36112 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36113 // out. SelectMI should have a kill flag on EFLAGS.
36114 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36115 return true;
36116}
36117
36118// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36119// together with other CMOV pseudo-opcodes into a single basic-block with
36120// conditional jump around it.
36121static bool isCMOVPseudo(MachineInstr &MI) {
36122 switch (MI.getOpcode()) {
36123 case X86::CMOV_FR16:
36124 case X86::CMOV_FR16X:
36125 case X86::CMOV_FR32:
36126 case X86::CMOV_FR32X:
36127 case X86::CMOV_FR64:
36128 case X86::CMOV_FR64X:
36129 case X86::CMOV_GR8:
36130 case X86::CMOV_GR16:
36131 case X86::CMOV_GR32:
36132 case X86::CMOV_RFP32:
36133 case X86::CMOV_RFP64:
36134 case X86::CMOV_RFP80:
36135 case X86::CMOV_VR64:
36136 case X86::CMOV_VR128:
36137 case X86::CMOV_VR128X:
36138 case X86::CMOV_VR256:
36139 case X86::CMOV_VR256X:
36140 case X86::CMOV_VR512:
36141 case X86::CMOV_VK1:
36142 case X86::CMOV_VK2:
36143 case X86::CMOV_VK4:
36144 case X86::CMOV_VK8:
36145 case X86::CMOV_VK16:
36146 case X86::CMOV_VK32:
36147 case X86::CMOV_VK64:
36148 return true;
36149
36150 default:
36151 return false;
36152 }
36153}
36154
36155// Helper function, which inserts PHI functions into SinkMBB:
36156// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36157// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
36158// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
36159// the last PHI function inserted.
36160static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36161 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36162 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36163 MachineBasicBlock *SinkMBB) {
36164 MachineFunction *MF = TrueMBB->getParent();
36165 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36166 const MIMetadata MIMD(*MIItBegin);
36167
36168 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36169 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36170
36171 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36172
36173 // As we are creating the PHIs, we have to be careful if there is more than
36174 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36175 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36176 // That also means that PHI construction must work forward from earlier to
36177 // later, and that the code must maintain a mapping from earlier PHI's
36178 // destination registers, and the registers that went into the PHI.
36179 DenseMap<Register, std::pair<Register, Register>> RegRewriteTable;
36180 MachineInstrBuilder MIB;
36181
36182 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36183 Register DestReg = MIIt->getOperand(0).getReg();
36184 Register Op1Reg = MIIt->getOperand(1).getReg();
36185 Register Op2Reg = MIIt->getOperand(2).getReg();
36186
36187 // If this CMOV we are generating is the opposite condition from
36188 // the jump we generated, then we have to swap the operands for the
36189 // PHI that is going to be generated.
36190 if (MIIt->getOperand(3).getImm() == OppCC)
36191 std::swap(Op1Reg, Op2Reg);
36192
36193 if (auto It = RegRewriteTable.find(Op1Reg); It != RegRewriteTable.end())
36194 Op1Reg = It->second.first;
36195
36196 if (auto It = RegRewriteTable.find(Op2Reg); It != RegRewriteTable.end())
36197 Op2Reg = It->second.second;
36198
36199 MIB =
36200 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
36201 .addReg(Op1Reg)
36202 .addMBB(FalseMBB)
36203 .addReg(Op2Reg)
36204 .addMBB(TrueMBB);
36205
36206 // Add this PHI to the rewrite table.
36207 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36208 }
36209
36210 return MIB;
36211}
36212
36213// Lower cascaded selects of the form (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
36214MachineBasicBlock *
36215X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36216 MachineInstr &SecondCascadedCMOV,
36217 MachineBasicBlock *ThisMBB) const {
36218 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36219 const MIMetadata MIMD(FirstCMOV);
36220
36221 // We lower cascaded CMOVs such as
36222 //
36223 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36224 //
36225 // to two successive branches.
36226 //
36227 // Without this, we would add a PHI between the two jumps, which ends up
36228 // creating a few copies all around. For instance, for
36229 //
36230 // (sitofp (zext (fcmp une)))
36231 //
36232 // we would generate:
36233 //
36234 // ucomiss %xmm1, %xmm0
36235 // movss <1.0f>, %xmm0
36236 // movaps %xmm0, %xmm1
36237 // jne .LBB5_2
36238 // xorps %xmm1, %xmm1
36239 // .LBB5_2:
36240 // jp .LBB5_4
36241 // movaps %xmm1, %xmm0
36242 // .LBB5_4:
36243 // retq
36244 //
36245 // because this custom-inserter would have generated:
36246 //
36247 // A
36248 // | \
36249 // | B
36250 // | /
36251 // C
36252 // | \
36253 // | D
36254 // | /
36255 // E
36256 //
36257 // A: X = ...; Y = ...
36258 // B: empty
36259 // C: Z = PHI [X, A], [Y, B]
36260 // D: empty
36261 // E: PHI [X, C], [Z, D]
36262 //
36263 // If we lower both CMOVs in a single step, we can instead generate:
36264 //
36265 // A
36266 // | \
36267 // | C
36268 // | /|
36269 // |/ |
36270 // | |
36271 // | D
36272 // | /
36273 // E
36274 //
36275 // A: X = ...; Y = ...
36276 // D: empty
36277 // E: PHI [X, A], [X, C], [Y, D]
36278 //
36279 // Which, in our sitofp/fcmp example, gives us something like:
36280 //
36281 // ucomiss %xmm1, %xmm0
36282 // movss <1.0f>, %xmm0
36283 // jne .LBB5_4
36284 // jp .LBB5_4
36285 // xorps %xmm0, %xmm0
36286 // .LBB5_4:
36287 // retq
36288 //
36289
36290 // We lower cascaded CMOV into two successive branches to the same block.
36291 // EFLAGS is used by both, so mark it as live in the second.
36292 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36293 MachineFunction *F = ThisMBB->getParent();
36294 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36295 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36296 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36297
36298 MachineFunction::iterator It = ++ThisMBB->getIterator();
36299 F->insert(It, FirstInsertedMBB);
36300 F->insert(It, SecondInsertedMBB);
36301 F->insert(It, SinkMBB);
36302
36303 // For a cascaded CMOV, we lower it to two successive branches to
36304 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36305 // the FirstInsertedMBB.
36306 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36307
36308 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36309 // live into the sink and copy blocks.
36310 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36311 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36312 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36313 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36314 SinkMBB->addLiveIn(X86::EFLAGS);
36315 }
36316
36317 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36318 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36319 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36320 ThisMBB->end());
36321 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36322
36323 // Fallthrough block for ThisMBB.
36324 ThisMBB->addSuccessor(FirstInsertedMBB);
36325 // The true block target of the first branch is always SinkMBB.
36326 ThisMBB->addSuccessor(SinkMBB);
36327 // Fallthrough block for FirstInsertedMBB.
36328 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36329 // The true block for the branch of FirstInsertedMBB.
36330 FirstInsertedMBB->addSuccessor(SinkMBB);
36331 // This is fallthrough.
36332 SecondInsertedMBB->addSuccessor(SinkMBB);
36333
36334 // Create the conditional branch instructions.
36335 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36336 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36337
36338 X86::CondCode SecondCC =
36339 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36340 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
36341 .addMBB(SinkMBB)
36342 .addImm(SecondCC);
36343
36344 // SinkMBB:
36345 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36346 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36347 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36348 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36349 MachineInstrBuilder MIB =
36350 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
36351 .addReg(Op1Reg)
36352 .addMBB(SecondInsertedMBB)
36353 .addReg(Op2Reg)
36354 .addMBB(ThisMBB);
36355
36356 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
36357 // (the True operand of the SELECT_CC/CMOV nodes).
36358 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36359
36360 // Now remove the CMOVs.
36361 FirstCMOV.eraseFromParent();
36362 SecondCascadedCMOV.eraseFromParent();
36363
36364 return SinkMBB;
36365}
36366
36367MachineBasicBlock *
36368X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36369 MachineBasicBlock *ThisMBB) const {
36370 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36371 const MIMetadata MIMD(MI);
36372
36373 // To "insert" a SELECT_CC instruction, we actually have to insert the
36374 // diamond control-flow pattern. The incoming instruction knows the
36375 // destination vreg to set, the condition code register to branch on, the
36376 // true/false values to select between and a branch opcode to use.
36377
36378 // ThisMBB:
36379 // ...
36380 // TrueVal = ...
36381 // cmpTY ccX, r1, r2
36382 // bCC copy1MBB
36383 // fallthrough --> FalseMBB
36384
36385 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36386 // as described above, by inserting a BB, and then making a PHI at the join
36387 // point to select the true and false operands of the CMOV in the PHI.
36388 //
36389 // The code also handles two different cases of multiple CMOV opcodes
36390 // in a row.
36391 //
36392 // Case 1:
36393 // In this case, there are multiple CMOVs in a row, all of which are based on
36394 // the same condition setting (or the exact opposite condition setting).
36395 // In this case we can lower all the CMOVs using a single inserted BB, and
36396 // then make a number of PHIs at the join point to model the CMOVs. The only
36397 // trickiness here is that in a case like:
36398 //
36399 // t2 = CMOV cond1 t1, f1
36400 // t3 = CMOV cond1 t2, f2
36401 //
36402 // when rewriting this into PHIs, we have to perform some renaming on the
36403 // temps since you cannot have a PHI operand refer to a PHI result earlier
36404 // in the same block. The "simple" but wrong lowering would be:
36405 //
36406 // t2 = PHI t1(BB1), f1(BB2)
36407 // t3 = PHI t2(BB1), f2(BB2)
36408 //
36409 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36410 // renaming is to note that on the path through BB1, t2 is really just a
36411 // copy of t1, and do that renaming, properly generating:
36412 //
36413 // t2 = PHI t1(BB1), f1(BB2)
36414 // t3 = PHI t1(BB1), f2(BB2)
36415 //
36416 // Case 2:
36417 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36418 // function - EmitLoweredCascadedSelect.
36419
36420 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36421 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36422 MachineInstr *LastCMOV = &MI;
36423 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
36424
36425 // Check for case 1, where there are multiple CMOVs with the same condition
36426 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36427 // number of jumps the most.
36428
36429 if (isCMOVPseudo(MI)) {
36430 // See if we have a string of CMOVS with the same condition. Skip over
36431 // intervening debug insts.
36432 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36433 (NextMIIt->getOperand(3).getImm() == CC ||
36434 NextMIIt->getOperand(3).getImm() == OppCC)) {
36435 LastCMOV = &*NextMIIt;
36436 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36437 }
36438 }
36439
36440 // Check for case 2, but only if we didn't already find case 1, as
36441 // indicated by LastCMOV == &MI.
36442 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36443 NextMIIt->getOpcode() == MI.getOpcode() &&
36444 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36445 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36446 NextMIIt->getOperand(1).isKill()) {
36447 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36448 }
36449
36450 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36451 MachineFunction *F = ThisMBB->getParent();
36452 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36453 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36454
36455 MachineFunction::iterator It = ++ThisMBB->getIterator();
36456 F->insert(It, FalseMBB);
36457 F->insert(It, SinkMBB);
36458
36459 // Set the call frame size on entry to the new basic blocks.
36460 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
36461 FalseMBB->setCallFrameSize(CallFrameSize);
36462 SinkMBB->setCallFrameSize(CallFrameSize);
36463
36464 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36465 // live into the sink and copy blocks.
36466 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36467 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36468 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36469 FalseMBB->addLiveIn(X86::EFLAGS);
36470 SinkMBB->addLiveIn(X86::EFLAGS);
36471 }
36472
36473 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36474 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36475 MachineBasicBlock::iterator(LastCMOV));
36476 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36477 if (MI.isDebugInstr())
36478 SinkMBB->push_back(MI.removeFromParent());
36479
36480 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36481 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36482 std::next(MachineBasicBlock::iterator(LastCMOV)),
36483 ThisMBB->end());
36484 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36485
36486 // Fallthrough block for ThisMBB.
36487 ThisMBB->addSuccessor(FalseMBB);
36488 // The true block target of the first (or only) branch is always a SinkMBB.
36489 ThisMBB->addSuccessor(SinkMBB);
36490 // Fallthrough block for FalseMBB.
36491 FalseMBB->addSuccessor(SinkMBB);
36492
36493 // Create the conditional branch instruction.
36494 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36495
36496 // SinkMBB:
36497 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36498 // ...
36499 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36500 MachineBasicBlock::iterator MIItEnd =
36501 std::next(MachineBasicBlock::iterator(LastCMOV));
36502 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36503
36504 // Now remove the CMOV(s).
36505 ThisMBB->erase(MIItBegin, MIItEnd);
36506
36507 return SinkMBB;
36508}
36509
36510static unsigned getSUBriOpcode(bool IsLP64) {
36511 if (IsLP64)
36512 return X86::SUB64ri32;
36513 else
36514 return X86::SUB32ri;
36515}
36516
36517MachineBasicBlock *
36518X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36519 MachineBasicBlock *MBB) const {
36520 MachineFunction *MF = MBB->getParent();
36521 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36522 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36523 const MIMetadata MIMD(MI);
36524 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36525
36526 const unsigned ProbeSize = getStackProbeSize(*MF);
36527
36528 MachineRegisterInfo &MRI = MF->getRegInfo();
36529 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36530 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36531 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36532
36533 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36534 MF->insert(MBBIter, testMBB);
36535 MF->insert(MBBIter, blockMBB);
36536 MF->insert(MBBIter, tailMBB);
36537
36538 Register sizeVReg = MI.getOperand(1).getReg();
36539
36540 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36541
36542 Register TmpStackPtr = MRI.createVirtualRegister(
36543 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36544 Register FinalStackPtr = MRI.createVirtualRegister(
36545 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36546
36547 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
36548 .addReg(physSPReg);
36549 {
36550 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36551 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
36552 .addReg(TmpStackPtr)
36553 .addReg(sizeVReg);
36554 }
36555
36556 // test rsp size
36557
36558 BuildMI(testMBB, MIMD,
36559 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36560 .addReg(FinalStackPtr)
36561 .addReg(physSPReg);
36562
36563 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
36564 .addMBB(tailMBB)
36565 .addImm(X86::COND_GE);
36566 testMBB->addSuccessor(blockMBB);
36567 testMBB->addSuccessor(tailMBB);
36568
36569 // Touch the block then extend it. This is done on the opposite side of
36570 // static probe where we allocate then touch, to avoid the need of probing the
36571 // tail of the static alloca. Possible scenarios are:
36572 //
36573 // + ---- <- ------------ <- ------------- <- ------------ +
36574 // | |
36575 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36576 // | |
36577 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36578 //
36579 // The property we want to enforce is to never have more than [page alloc] between two probes.
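  // ProbeSize is typically one page (4096 bytes) unless overridden via the
  // "stack-probe-size" function attribute; blockMBB below touches the page at
  // the current stack pointer with a value-preserving read-modify-write (xor
  // with 0), drops SP by ProbeSize, and loops back to testMBB until SP has
  // reached or passed FinalStackPtr.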
36580
36581 const unsigned XORMIOpc =
36582 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
36583 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
36584 .addImm(0);
36585
36586 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
36587 physSPReg)
36588 .addReg(physSPReg)
36589 .addImm(ProbeSize);
36590
36591 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
36592 blockMBB->addSuccessor(testMBB);
36593
36594 // Replace original instruction by the expected stack ptr
36595 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
36596 MI.getOperand(0).getReg())
36597 .addReg(FinalStackPtr);
36598
36599 tailMBB->splice(tailMBB->end(), MBB,
36600 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36601 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36602 MBB->addSuccessor(testMBB);
36603
36604 // Delete the original pseudo instruction.
36605 MI.eraseFromParent();
36606
36607 // And we're done.
36608 return tailMBB;
36609}
36610
36611MachineBasicBlock *
36612X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36613 MachineBasicBlock *BB) const {
36614 MachineFunction *MF = BB->getParent();
36615 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36616 const MIMetadata MIMD(MI);
36617 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36618
36619 assert(MF->shouldSplitStack());
36620
36621 const bool Is64Bit = Subtarget.is64Bit();
36622 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36623
36624 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36625 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
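  // These segment-relative offsets locate the per-thread stack limit used by
  // the segmented-stack (split-stack) scheme; they are expected to match the
  // thread-control-block slot maintained by libgcc's __morestack runtime.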
36626
36627 // BB:
36628 // ... [Till the alloca]
36629 // If stacklet is not large enough, jump to mallocMBB
36630 //
36631 // bumpMBB:
36632 // Allocate by subtracting from RSP
36633 // Jump to continueMBB
36634 //
36635 // mallocMBB:
36636 // Allocate by call to runtime
36637 //
36638 // continueMBB:
36639 // ...
36640 // [rest of original BB]
36641 //
36642
36643 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36644 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36645 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36646
36647 MachineRegisterInfo &MRI = MF->getRegInfo();
36648 const TargetRegisterClass *AddrRegClass =
36649 getRegClassFor(getPointerTy(MF->getDataLayout()));
36650
36651 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36652 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36653 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36654 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36655 sizeVReg = MI.getOperand(1).getReg(),
36656 physSPReg = IsLP64 ? X86::RSP : X86::ESP;
36657
36658 MachineFunction::iterator MBBIter = ++BB->getIterator();
36659
36660 MF->insert(MBBIter, bumpMBB);
36661 MF->insert(MBBIter, mallocMBB);
36662 MF->insert(MBBIter, continueMBB);
36663
36664 continueMBB->splice(continueMBB->begin(), BB,
36665 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36666 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36667
36668 // Add code to the main basic block to check if the stack limit has been hit,
36669 // and if so, jump to mallocMBB otherwise to bumpMBB.
36670 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36671 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36672 .addReg(tmpSPVReg).addReg(sizeVReg);
36673 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36674 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36675 .addReg(SPLimitVReg);
36676 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36677
36678 // bumpMBB simply decreases the stack pointer, since we know the current
36679 // stacklet has enough space.
36680 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
36681 .addReg(SPLimitVReg);
36682 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36683 .addReg(SPLimitVReg);
36684 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36685
36686 // Calls into a routine in libgcc to allocate more space from the heap.
36687 const uint32_t *RegMask =
36688 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36689 if (IsLP64) {
36690 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
36691 .addReg(sizeVReg);
36692 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36693 .addExternalSymbol("__morestack_allocate_stack_space")
36694 .addRegMask(RegMask)
36695 .addReg(X86::RDI, RegState::Implicit)
36696 .addReg(X86::RAX, RegState::ImplicitDefine);
36697 } else if (Is64Bit) {
36698 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
36699 .addReg(sizeVReg);
36700 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36701 .addExternalSymbol("__morestack_allocate_stack_space")
36702 .addRegMask(RegMask)
36703 .addReg(X86::EDI, RegState::Implicit)
36704 .addReg(X86::EAX, RegState::ImplicitDefine);
36705 } else {
36706 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36707 .addImm(12);
36708 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36709 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
36710 .addExternalSymbol("__morestack_allocate_stack_space")
36711 .addRegMask(RegMask)
36712 .addReg(X86::EAX, RegState::ImplicitDefine);
36713 }
36714
36715 if (!Is64Bit)
36716 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36717 .addImm(16);
36718
36719 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36720 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36721 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36722
36723 // Set up the CFG correctly.
36724 BB->addSuccessor(bumpMBB);
36725 BB->addSuccessor(mallocMBB);
36726 mallocMBB->addSuccessor(continueMBB);
36727 bumpMBB->addSuccessor(continueMBB);
36728
36729 // Take care of the PHI nodes.
36730 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
36731 MI.getOperand(0).getReg())
36732 .addReg(mallocPtrVReg)
36733 .addMBB(mallocMBB)
36734 .addReg(bumpSPPtrVReg)
36735 .addMBB(bumpMBB);
36736
36737 // Delete the original pseudo instruction.
36738 MI.eraseFromParent();
36739
36740 // And we're done.
36741 return continueMBB;
36742}
36743
36744MachineBasicBlock *
36745X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36746 MachineBasicBlock *BB) const {
36747 MachineFunction *MF = BB->getParent();
36748 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36749 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36750 const MIMetadata MIMD(MI);
36751
36752 assert(!isAsynchronousEHPersonality(
36753 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
36754 "SEH does not use catchret!");
36755
36756 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36757 if (!Subtarget.is32Bit())
36758 return BB;
36759
36760 // C++ EH creates a new target block to hold the restore code, and wires up
36761 // the new block to the return destination with a normal JMP_4.
36762 MachineBasicBlock *RestoreMBB =
36763 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36764 assert(BB->succ_size() == 1);
36765 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36766 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36767 BB->addSuccessor(RestoreMBB);
36768 MI.getOperand(0).setMBB(RestoreMBB);
36769
36770 // Marking this as an EH pad but not a funclet entry block causes PEI to
36771 // restore stack pointers in the block.
36772 RestoreMBB->setIsEHPad(true);
36773
36774 auto RestoreMBBI = RestoreMBB->begin();
36775 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36776 return BB;
36777}
36778
36779MachineBasicBlock *
36780X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36781 MachineBasicBlock *BB) const {
36782 // This is pretty easy. We're taking the value that we received from
36783 // our load from the relocation, sticking it in either RDI (x86-64)
36784 // or EAX and doing an indirect call. The return value will then
36785 // be in the normal return register.
36786 MachineFunction *F = BB->getParent();
36787 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36788 const MIMetadata MIMD(MI);
36789
36790 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36791 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36792
36793 // Get a register mask for the lowered call.
36794 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36795 // proper register mask.
36796 const uint32_t *RegMask =
36797 Subtarget.is64Bit() ?
36798 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36799 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36800 if (Subtarget.is64Bit()) {
36801 MachineInstrBuilder MIB =
36802 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
36803 .addReg(X86::RIP)
36804 .addImm(0)
36805 .addReg(0)
36806 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36807 MI.getOperand(3).getTargetFlags())
36808 .addReg(0);
36809 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
36810 addDirectMem(MIB, X86::RDI);
36811 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36812 } else if (!isPositionIndependent()) {
36813 MachineInstrBuilder MIB =
36814 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36815 .addReg(0)
36816 .addImm(0)
36817 .addReg(0)
36818 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36819 MI.getOperand(3).getTargetFlags())
36820 .addReg(0);
36821 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36822 addDirectMem(MIB, X86::EAX);
36823 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36824 } else {
36825 MachineInstrBuilder MIB =
36826 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36827 .addReg(TII->getGlobalBaseReg(F))
36828 .addImm(0)
36829 .addReg(0)
36830 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36831 MI.getOperand(3).getTargetFlags())
36832 .addReg(0);
36833 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36834 addDirectMem(MIB, X86::EAX);
36835 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36836 }
36837
36838 MI.eraseFromParent(); // The pseudo instruction is gone now.
36839 return BB;
36840}
36841
36842static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36843 switch (RPOpc) {
36844 case X86::INDIRECT_THUNK_CALL32:
36845 return X86::CALLpcrel32;
36846 case X86::INDIRECT_THUNK_CALL64:
36847 return X86::CALL64pcrel32;
36848 case X86::INDIRECT_THUNK_TCRETURN32:
36849 return X86::TCRETURNdi;
36850 case X86::INDIRECT_THUNK_TCRETURN64:
36851 return X86::TCRETURNdi64;
36852 }
36853 llvm_unreachable("not indirect thunk opcode");
36854}
36855
36856static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36857 Register Reg) {
36858 if (Subtarget.useRetpolineExternalThunk()) {
36859 // When using an external thunk for retpolines, we pick names that match the
36860 // names GCC happens to use as well. This helps simplify the implementation
36861 // of the thunks for kernels where they have no easy ability to create
36862 // aliases and are doing non-trivial configuration of the thunk's body. For
36863 // example, the Linux kernel will do boot-time hot patching of the thunk
36864 // bodies and cannot easily export aliases of these to loaded modules.
36865 //
36866 // Note that at any point in the future, we may need to change the semantics
36867 // of how we implement retpolines and at that time will likely change the
36868 // name of the called thunk. Essentially, there is no hard guarantee that
36869 // LLVM will generate calls to specific thunks, we merely make a best-effort
36870 // attempt to help out kernels and other systems where duplicating the
36871 // thunks is costly.
36872 switch (Reg.id()) {
36873 case X86::EAX:
36874 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36875 return "__x86_indirect_thunk_eax";
36876 case X86::ECX:
36877 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36878 return "__x86_indirect_thunk_ecx";
36879 case X86::EDX:
36880 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36881 return "__x86_indirect_thunk_edx";
36882 case X86::EDI:
36883 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36884 return "__x86_indirect_thunk_edi";
36885 case X86::R11:
36886 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36887 return "__x86_indirect_thunk_r11";
36888 }
36889 llvm_unreachable("unexpected reg for external indirect thunk");
36890 }
36891
36892 if (Subtarget.useRetpolineIndirectCalls() ||
36893 Subtarget.useRetpolineIndirectBranches()) {
36894 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36895 switch (Reg.id()) {
36896 case X86::EAX:
36897 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36898 return "__llvm_retpoline_eax";
36899 case X86::ECX:
36900 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36901 return "__llvm_retpoline_ecx";
36902 case X86::EDX:
36903 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36904 return "__llvm_retpoline_edx";
36905 case X86::EDI:
36906 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36907 return "__llvm_retpoline_edi";
36908 case X86::R11:
36909 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36910 return "__llvm_retpoline_r11";
36911 }
36912 llvm_unreachable("unexpected reg for retpoline");
36913 }
36914
36915 if (Subtarget.useLVIControlFlowIntegrity()) {
36916 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36917 return "__llvm_lvi_thunk_r11";
36918 }
36919 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
36920}
36921
36922MachineBasicBlock *
36923X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36924 MachineBasicBlock *BB) const {
36925 // Copy the virtual register into the R11 physical register and
36926 // call the retpoline thunk.
36927 const MIMetadata MIMD(MI);
36928 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36929 Register CalleeVReg = MI.getOperand(0).getReg();
36930 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36931
36932 // Find an available scratch register to hold the callee. On 64-bit, we can
36933 // just use R11, but we scan for uses anyway to ensure we don't generate
36934 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36935 // already a register use operand to the call to hold the callee. If none
36936 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36937 // register and ESI is the base pointer to realigned stack frames with VLAs.
36938 SmallVector<Register, 3> AvailableRegs;
36939 if (Subtarget.is64Bit())
36940 AvailableRegs.push_back(X86::R11);
36941 else
36942 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36943
36944 // Zero out any registers that are already used.
36945 for (const auto &MO : MI.operands()) {
36946 if (MO.isReg() && MO.isUse())
36947 llvm::replace(AvailableRegs, MO.getReg(), Register());
36948 }
36949
36950 // Choose the first remaining non-zero available register.
36951 Register AvailableReg;
36952 for (Register MaybeReg : AvailableRegs) {
36953 if (MaybeReg) {
36954 AvailableReg = MaybeReg;
36955 break;
36956 }
36957 }
36958 if (!AvailableReg)
36959 report_fatal_error("calling convention incompatible with retpoline, no "
36960 "available registers");
36961
36962 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36963
36964 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
36965 .addReg(CalleeVReg);
36966 MI.getOperand(0).ChangeToES(Symbol);
36967 MI.setDesc(TII->get(Opc));
36968 MachineInstrBuilder(*BB->getParent(), &MI)
36969 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36970 return BB;
36971}
36972
36973/// SetJmp implies future control flow change upon calling the corresponding
36974/// LongJmp.
36975/// Instead of using the 'return' instruction, the long jump fixes the stack and
36976/// performs an indirect branch. To do so it uses the registers that were stored
36977/// in the jump buffer (when calling SetJmp).
36978/// In case the shadow stack is enabled we need to fix it as well, because some
36979/// return addresses will be skipped.
36980/// The function will save the SSP for future fixing in the function
36981/// emitLongJmpShadowStackFix.
36982/// \sa emitLongJmpShadowStackFix
36983/// \param [in] MI The temporary Machine Instruction for the builtin.
36984/// \param [in] MBB The Machine Basic Block that will be modified.
36985void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
36986 MachineBasicBlock *MBB) const {
36987 const MIMetadata MIMD(MI);
36988 MachineFunction *MF = MBB->getParent();
36989 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36990 MachineRegisterInfo &MRI = MF->getRegInfo();
36991 MachineInstrBuilder MIB;
36992
36993 // Memory Reference.
36994 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36995
36996 // Initialize a register with zero.
36997 MVT PVT = getPointerTy(MF->getDataLayout());
36998 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36999 Register ZReg = MRI.createVirtualRegister(PtrRC);
37000 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
37001 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
37002 .addDef(ZReg)
37003 .addReg(ZReg, RegState::Undef)
37004 .addReg(ZReg, RegState::Undef);
37005
37006 // Read the current SSP Register value to the zeroed register.
37007 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37008 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37009 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37010
37011 // Write the SSP register value to offset 3 in input memory buffer.
37012 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37013 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
37014 const int64_t SSPOffset = 3 * PVT.getStoreSize();
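  // The jump buffer used by these SjLj builtins is laid out in pointer-sized
  // slots: slot 0 holds the frame pointer, slot 1 the resume address, slot 2
  // the stack pointer (see the offsets in emitEHSjLjLongJmp), and slot 3
  // (written here) the shadow stack pointer.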
37015 const unsigned MemOpndSlot = 1;
37016 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37017 if (i == X86::AddrDisp)
37018 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37019 else
37020 MIB.add(MI.getOperand(MemOpndSlot + i));
37021 }
37022 MIB.addReg(SSPCopyReg);
37023 MIB.setMemRefs(MMOs);
37024}
37025
37026MachineBasicBlock *
37027X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37028 MachineBasicBlock *MBB) const {
37029 const MIMetadata MIMD(MI);
37030 MachineFunction *MF = MBB->getParent();
37031 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37032 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37033 MachineRegisterInfo &MRI = MF->getRegInfo();
37034
37035 const BasicBlock *BB = MBB->getBasicBlock();
37036 MachineFunction::iterator I = ++MBB->getIterator();
37037
37038 // Memory Reference
37039 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37040
37041 unsigned MemOpndSlot = 0;
37042
37043 unsigned CurOp = 0;
37044
37045 Register DstReg = MI.getOperand(CurOp++).getReg();
37046 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37047 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
37048 (void)TRI;
37049 Register mainDstReg = MRI.createVirtualRegister(RC);
37050 Register restoreDstReg = MRI.createVirtualRegister(RC);
37051
37052 MemOpndSlot = CurOp;
37053
37054 MVT PVT = getPointerTy(MF->getDataLayout());
37055 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37056 "Invalid Pointer Size!");
37057
37058 // For v = setjmp(buf), we generate
37059 //
37060 // thisMBB:
37061 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37062 // SjLjSetup restoreMBB
37063 //
37064 // mainMBB:
37065 // v_main = 0
37066 //
37067 // sinkMBB:
37068 // v = phi(main, restore)
37069 //
37070 // restoreMBB:
37071 // if base pointer being used, load it from frame
37072 // v_restore = 1
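  // The PHI in sinkMBB therefore yields the usual setjmp-style result: 0 when
  // control falls through mainMBB after the initial call, and 1 when control
  // re-enters through restoreMBB via a matching longjmp.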
37073
37074 MachineBasicBlock *thisMBB = MBB;
37075 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37076 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37077 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37078 MF->insert(I, mainMBB);
37079 MF->insert(I, sinkMBB);
37080 MF->push_back(restoreMBB);
37081 restoreMBB->setMachineBlockAddressTaken();
37082
37083 MachineInstrBuilder MIB;
37084
37085 // Transfer the remainder of BB and its successor edges to sinkMBB.
37086 sinkMBB->splice(sinkMBB->begin(), MBB,
37087 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37088 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37089
37090 // thisMBB:
37091 unsigned PtrStoreOpc = 0;
37092 Register LabelReg;
37093 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37094 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37095 !isPositionIndependent();
37096
37097 // Prepare IP either in reg or imm.
37098 if (!UseImmLabel) {
37099 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37100 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37101 LabelReg = MRI.createVirtualRegister(PtrRC);
37102 if (Subtarget.is64Bit()) {
37103 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
37104 .addReg(X86::RIP)
37105 .addImm(0)
37106 .addReg(0)
37107 .addMBB(restoreMBB)
37108 .addReg(0);
37109 } else {
37110 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37111 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
37112 .addReg(XII->getGlobalBaseReg(MF))
37113 .addImm(0)
37114 .addReg(0)
37115 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37116 .addReg(0);
37117 }
37118 } else
37119 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37120 // Store IP
37121 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
37122 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37123 if (i == X86::AddrDisp)
37124 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37125 else
37126 MIB.add(MI.getOperand(MemOpndSlot + i));
37127 }
37128 if (!UseImmLabel)
37129 MIB.addReg(LabelReg);
37130 else
37131 MIB.addMBB(restoreMBB);
37132 MIB.setMemRefs(MMOs);
37133
37134 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37135 emitSetJmpShadowStackFix(MI, thisMBB);
37136 }
37137
37138 // Setup
37139 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
37140 .addMBB(restoreMBB);
37141
37142 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37143 MIB.addRegMask(RegInfo->getNoPreservedMask());
37144 thisMBB->addSuccessor(mainMBB);
37145 thisMBB->addSuccessor(restoreMBB);
37146
37147 // mainMBB:
37148 // EAX = 0
37149 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
37150 mainMBB->addSuccessor(sinkMBB);
37151
37152 // sinkMBB:
37153 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
37154 .addReg(mainDstReg)
37155 .addMBB(mainMBB)
37156 .addReg(restoreDstReg)
37157 .addMBB(restoreMBB);
37158
37159 // restoreMBB:
37160 if (RegInfo->hasBasePointer(*MF)) {
37161 const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64();
37162 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37163 X86FI->setRestoreBasePointer(MF);
37164 Register FramePtr = RegInfo->getFrameRegister(*MF);
37165 Register BasePtr = RegInfo->getBaseRegister();
37166 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37167 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
37168 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37169 .setMIFlag(MachineInstr::FrameSetup);
37170 }
37171 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37172 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37173 restoreMBB->addSuccessor(sinkMBB);
37174
37175 MI.eraseFromParent();
37176 return sinkMBB;
37177}
37178
37179/// Fix the shadow stack using the previously saved SSP pointer.
37180/// \sa emitSetJmpShadowStackFix
37181/// \param [in] MI The temporary Machine Instruction for the builtin.
37182/// \param [in] MBB The Machine Basic Block that will be modified.
37183/// \return The sink MBB that will perform the future indirect branch.
37184MachineBasicBlock *
37185X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37186 MachineBasicBlock *MBB) const {
37187 const MIMetadata MIMD(MI);
37188 MachineFunction *MF = MBB->getParent();
37189 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37190 MachineRegisterInfo &MRI = MF->getRegInfo();
37191
37192 // Memory Reference
37193 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37194
37195 MVT PVT = getPointerTy(MF->getDataLayout());
37196 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37197
37198 // checkSspMBB:
37199 // xor vreg1, vreg1
37200 // rdssp vreg1
37201 // test vreg1, vreg1
37202 // je sinkMBB # Jump if Shadow Stack is not supported
37203 // fallMBB:
37204 // mov buf+24/12(%rip), vreg2
37205 // sub vreg1, vreg2
37206 // jbe sinkMBB # No need to fix the Shadow Stack
37207 // fixShadowMBB:
37208 // shr 3/2, vreg2
37209 // incssp vreg2 # fix the SSP according to the lower 8 bits
37210 // shr 8, vreg2
37211 // je sinkMBB
37212 // fixShadowLoopPrepareMBB:
37213 // shl vreg2
37214 // mov 128, vreg3
37215 // fixShadowLoopMBB:
37216 // incssp vreg3
37217 // dec vreg2
37218 // jne fixShadowLoopMBB # Iterate until you finish fixing
37219 // # the Shadow Stack
37220 // sinkMBB:
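  //
  // Worked example: INCSSP pops (low 8 bits of its operand) shadow-stack
  // entries, so a delta of 576 entries is handled as 576 & 0xff == 64 entries
  // by the first incssp, and the remaining 512 by (576 >> 8) << 1 == 4 loop
  // iterations of incssp 128.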
37221
37222 MachineFunction::iterator I = ++MBB->getIterator();
37223 const BasicBlock *BB = MBB->getBasicBlock();
37224
37225 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37226 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37227 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37228 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37229 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37230 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37231 MF->insert(I, checkSspMBB);
37232 MF->insert(I, fallMBB);
37233 MF->insert(I, fixShadowMBB);
37234 MF->insert(I, fixShadowLoopPrepareMBB);
37235 MF->insert(I, fixShadowLoopMBB);
37236 MF->insert(I, sinkMBB);
37237
37238 // Transfer the remainder of BB and its successor edges to sinkMBB.
37239 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37240 MBB->end());
37241 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37242
37243 MBB->addSuccessor(checkSspMBB);
37244
37245 // Initialize a register with zero.
37246 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37247 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
37248
37249 if (PVT == MVT::i64) {
37250 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37251 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37252 .addImm(0)
37253 .addReg(ZReg)
37254 .addImm(X86::sub_32bit);
37255 ZReg = TmpZReg;
37256 }
37257
37258 // Read the current SSP Register value to the zeroed register.
37259 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37260 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37261 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37262
37263 // Check whether the result of the SSP register is zero and jump directly
37264 // to the sink.
37265 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37266 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
37267 .addReg(SSPCopyReg)
37268 .addReg(SSPCopyReg);
37269 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
37270 .addMBB(sinkMBB)
37271 .addImm(X86::COND_E);
37272 checkSspMBB->addSuccessor(sinkMBB);
37273 checkSspMBB->addSuccessor(fallMBB);
37274
37275 // Reload the previously saved SSP register value.
37276 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37277 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37278 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37279 MachineInstrBuilder MIB =
37280 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
37281 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37282 const MachineOperand &MO = MI.getOperand(i);
37283 if (i == X86::AddrDisp)
37284 MIB.addDisp(MO, SPPOffset);
37285 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37286 // preserve kill flags.
37287 MIB.addReg(MO.getReg());
37288 else
37289 MIB.add(MO);
37290 }
37291 MIB.setMemRefs(MMOs);
37292
37293 // Subtract the current SSP from the previous SSP.
37294 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37295 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37296 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
37297 .addReg(PrevSSPReg)
37298 .addReg(SSPCopyReg);
37299
37300 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37301 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
37302 .addMBB(sinkMBB)
37303 .addImm(X86::COND_BE);
37304 fallMBB->addSuccessor(sinkMBB);
37305 fallMBB->addSuccessor(fixShadowMBB);
37306
37307 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37308 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37309 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37310 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37311 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
37312 .addReg(SspSubReg)
37313 .addImm(Offset);
37314
37315 // Increase SSP when looking only on the lower 8 bits of the delta.
37316 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37317 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37318
37319 // Reset the lower 8 bits.
37320 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37321 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
37322 .addReg(SspFirstShrReg)
37323 .addImm(8);
37324
37325 // Jump if the result of the shift is zero.
37326 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
37327 .addMBB(sinkMBB)
37328 .addImm(X86::COND_E);
37329 fixShadowMBB->addSuccessor(sinkMBB);
37330 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37331
37332 // Do a single shift left.
37333 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
37334 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37335 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
37336 .addReg(SspSecondShrReg)
37337 .addImm(1);
37338
37339 // Save the value 128 to a register (will be used next with incssp).
37340 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37341 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37342 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
37343 .addImm(128);
37344 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37345
37346 // Since incssp only looks at the lower 8 bits, we might need to do several
37347 // iterations of incssp until we finish fixing the shadow stack.
37348 Register DecReg = MRI.createVirtualRegister(PtrRC);
37349 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37350 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
37351 .addReg(SspAfterShlReg)
37352 .addMBB(fixShadowLoopPrepareMBB)
37353 .addReg(DecReg)
37354 .addMBB(fixShadowLoopMBB);
37355
37356 // Every iteration we increase the SSP by 128.
37357 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
37358
37359 // Every iteration we decrement the counter by 1.
37360 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37361 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
37362
37363 // Jump if the counter is not zero yet.
37364 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
37365 .addMBB(fixShadowLoopMBB)
37366 .addImm(X86::COND_NE);
37367 fixShadowLoopMBB->addSuccessor(sinkMBB);
37368 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37369
37370 return sinkMBB;
37371}
37372
37373MachineBasicBlock *
37374X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37375 MachineBasicBlock *MBB) const {
37376 const MIMetadata MIMD(MI);
37377 MachineFunction *MF = MBB->getParent();
37378 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37379 MachineRegisterInfo &MRI = MF->getRegInfo();
37380
37381 // Memory Reference
37382 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37383
37384 MVT PVT = getPointerTy(MF->getDataLayout());
37385 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37386 "Invalid Pointer Size!");
37387
37388 const TargetRegisterClass *RC =
37389 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37390 Register Tmp = MRI.createVirtualRegister(RC);
37391 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37392 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37393 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37394 Register SP = RegInfo->getStackRegister();
37395
37396 MachineInstrBuilder MIB;
37397
37398 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37399 const int64_t SPOffset = 2 * PVT.getStoreSize();
37400
37401 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37402 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37403
37404 MachineBasicBlock *thisMBB = MBB;
37405
37406 // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
37407 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37408 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37409 }
37410
37411 // Reload FP
37412 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
37413 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37414 const MachineOperand &MO = MI.getOperand(i);
37415 if (MO.isReg()) // Don't add the whole operand, we don't want to
37416 // preserve kill flags.
37417 MIB.addReg(MO.getReg());
37418 else
37419 MIB.add(MO);
37420 }
37421 MIB.setMemRefs(MMOs);
37423
37424 // Reload IP
37425 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
37426 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37427 const MachineOperand &MO = MI.getOperand(i);
37428 if (i == X86::AddrDisp)
37429 MIB.addDisp(MO, LabelOffset);
37430 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37431 // preserve kill flags.
37432 MIB.addReg(MO.getReg());
37433 else
37434 MIB.add(MO);
37435 }
37436 MIB.setMemRefs(MMOs);
37437
37438 // Reload SP
37439 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
37440 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37441 if (i == X86::AddrDisp)
37442 MIB.addDisp(MI.getOperand(i), SPOffset);
37443 else
37444 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37445 // the last instruction of the expansion.
37446 }
37447 MIB.setMemRefs(MMOs);
37449
37450 // Jump
37451 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
37452
37453 MI.eraseFromParent();
37454 return thisMBB;
37455}
37456
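// Store the address of the SjLj dispatch block into the function context so
// the runtime can transfer control back to it. For small, non-PIC code models
// the block address is stored directly as an immediate; otherwise it is first
// materialized into a register with LEA.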
37457void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37458 MachineBasicBlock *MBB,
37459 MachineBasicBlock *DispatchBB,
37460 int FI) const {
37461 const MIMetadata MIMD(MI);
37462 MachineFunction *MF = MBB->getParent();
37463 MachineRegisterInfo *MRI = &MF->getRegInfo();
37464 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37465
37466 MVT PVT = getPointerTy(MF->getDataLayout());
37467 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37468
37469 unsigned Op = 0;
37470 Register VR;
37471
37472 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37473 !isPositionIndependent();
37474
37475 if (UseImmLabel) {
37476 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37477 } else {
37478 const TargetRegisterClass *TRC =
37479 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37480 VR = MRI->createVirtualRegister(TRC);
37481 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37482
37483 if (Subtarget.is64Bit())
37484 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
37485 .addReg(X86::RIP)
37486 .addImm(1)
37487 .addReg(0)
37488 .addMBB(DispatchBB)
37489 .addReg(0);
37490 else
37491 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
37492 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37493 .addImm(1)
37494 .addReg(0)
37495 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37496 .addReg(0);
37497 }
37498
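// Store the dispatch address (immediate label or register) into the reserved
// slot of the function context.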
37499 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
37500 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37501 if (UseImmLabel)
37502 MIB.addMBB(DispatchBB);
37503 else
37504 MIB.addReg(VR);
37505}
37506
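// Lower Int_eh_sjlj_setup_dispatch: build the SjLj dispatch block, which
// bounds-checks the call-site index recorded in the function context and
// jumps to the matching landing pad through a jump table. Every invoke edge
// is rewritten to target the dispatch block, which becomes the sole landing
// pad of the function.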
37507MachineBasicBlock *
37508X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37509 MachineBasicBlock *BB) const {
37510 const MIMetadata MIMD(MI);
37511 MachineFunction *MF = BB->getParent();
37512 MachineRegisterInfo *MRI = &MF->getRegInfo();
37513 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37514 int FI = MF->getFrameInfo().getFunctionContextIndex();
37515
37516 // Get a mapping of the call site numbers to all of the landing pads they're
37517 // associated with.
37518 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37519 unsigned MaxCSNum = 0;
37520 for (auto &MBB : *MF) {
37521 if (!MBB.isEHPad())
37522 continue;
37523
37524 MCSymbol *Sym = nullptr;
37525 for (const auto &MI : MBB) {
37526 if (MI.isDebugInstr())
37527 continue;
37528
37529 assert(MI.isEHLabel() && "expected EH_LABEL");
37530 Sym = MI.getOperand(0).getMCSymbol();
37531 break;
37532 }
37533
37534 if (!MF->hasCallSiteLandingPad(Sym))
37535 continue;
37536
37537 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37538 CallSiteNumToLPad[CSI].push_back(&MBB);
37539 MaxCSNum = std::max(MaxCSNum, CSI);
37540 }
37541 }
37542
37543 // Get an ordered list of the machine basic blocks for the jump table.
37544 std::vector<MachineBasicBlock *> LPadList;
37545 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37546 LPadList.reserve(CallSiteNumToLPad.size());
37547
37548 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37549 for (auto &LP : CallSiteNumToLPad[CSI]) {
37550 LPadList.push_back(LP);
37551 InvokeBBs.insert_range(LP->predecessors());
37552 }
37553 }
37554
37555 assert(!LPadList.empty() &&
37556 "No landing pad destinations for the dispatch jump table!");
37557
37558 // Create the MBBs for the dispatch code.
37559
37560 // Shove the dispatch's address into the return slot in the function context.
37561 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37562 DispatchBB->setIsEHPad(true);
37563
37564 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37565 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
37566 DispatchBB->addSuccessor(TrapBB);
37567
37568 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37569 DispatchBB->addSuccessor(DispContBB);
37570
37571 // Insert MBBs.
37572 MF->push_back(DispatchBB);
37573 MF->push_back(DispContBB);
37574 MF->push_back(TrapBB);
37575
37576 // Insert code into the entry block that creates and registers the function
37577 // context.
37578 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37579
37580 // Create the jump table and associated information
37581 unsigned JTE = getJumpTableEncoding();
37582 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37583 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37584
37585 const X86RegisterInfo &RI = TII->getRegisterInfo();
37586 // Add a register mask with no preserved registers. This results in all
37587 // registers being marked as clobbered.
37588 if (RI.hasBasePointer(*MF)) {
37589 const bool FPIs64Bit = Subtarget.isTarget64BitLP64();
37590 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37591 MFI->setRestoreBasePointer(MF);
37592
37593 Register FP = RI.getFrameRegister(*MF);
37594 Register BP = RI.getBaseRegister();
37595 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37596 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
37597 MFI->getRestoreBasePointerOffset())
37598 .addRegMask(RI.getNoPreservedMask());
37599 } else {
37600 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
37601 .addRegMask(RI.getNoPreservedMask());
37602 }
37603
37604 // IReg is used as an index in a memory operand and therefore can't be SP
37605 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37606 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
37607 Subtarget.is64Bit() ? 8 : 4);
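// Bounds check: if the call-site index is not smaller than the number of
// landing pads, branch to the trap block.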
37608 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
37609 .addReg(IReg)
37610 .addImm(LPadList.size());
37611 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
37612 .addMBB(TrapBB)
37613 .addImm(X86::COND_AE);
37614
37615 if (Subtarget.is64Bit()) {
37616 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37617 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37618
37619 // leaq .LJTI0_0(%rip), BReg
37620 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
37621 .addReg(X86::RIP)
37622 .addImm(1)
37623 .addReg(0)
37624 .addJumpTableIndex(MJTI)
37625 .addReg(0);
37626 // movzx IReg64, IReg
37627 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37628 .addImm(0)
37629 .addReg(IReg)
37630 .addImm(X86::sub_32bit);
37631
37632 switch (JTE) {
37633 case MachineJumpTableInfo::EK_BlockAddress:
37634 // jmpq *(BReg,IReg64,8)
37635 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
37636 .addReg(BReg)
37637 .addImm(8)
37638 .addReg(IReg64)
37639 .addImm(0)
37640 .addReg(0);
37641 break;
37642 case MachineJumpTableInfo::EK_Custom32: {
37643 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37644 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37645 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37646
37647 // movl (BReg,IReg64,4), OReg
37648 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
37649 .addReg(BReg)
37650 .addImm(4)
37651 .addReg(IReg64)
37652 .addImm(0)
37653 .addReg(0);
37654 // movsx OReg64, OReg
37655 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
37656 .addReg(OReg);
37657 // addq BReg, OReg64, TReg
37658 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
37659 .addReg(OReg64)
37660 .addReg(BReg);
37661 // jmpq *TReg
37662 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
37663 break;
37664 }
37665 default:
37666 llvm_unreachable("Unexpected jump table encoding");
37667 }
37668 } else {
37669 // jmpl *.LJTI0_0(,IReg,4)
37670 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
37671 .addReg(0)
37672 .addImm(4)
37673 .addReg(IReg)
37674 .addJumpTableIndex(MJTI)
37675 .addReg(0);
37676 }
37677
37678 // Add the jump table entries as successors to the MBB.
37679 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37680 for (auto &LP : LPadList)
37681 if (SeenMBBs.insert(LP).second)
37682 DispContBB->addSuccessor(LP);
37683
37684 // N.B. the order the invoke BBs are processed in doesn't matter here.
37685 SmallVector<MachineBasicBlock *, 64> MBBLPads;
37686 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37687 for (MachineBasicBlock *MBB : InvokeBBs) {
37688 // Remove the landing pad successor from the invoke block and replace it
37689 // with the new dispatch block.
37690 // Keep a copy of Successors since it's modified inside the loop.
37691 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37692 MBB->succ_rend());
37693 // FIXME: Avoid quadratic complexity.
37694 for (auto *MBBS : Successors) {
37695 if (MBBS->isEHPad()) {
37696 MBB->removeSuccessor(MBBS);
37697 MBBLPads.push_back(MBBS);
37698 }
37699 }
37700
37701 MBB->addSuccessor(DispatchBB);
37702
37703 // Find the invoke call and mark all of the callee-saved registers as
37704 // 'implicit defined' so that they're spilled. This prevents code from
37705 // moving instructions to before the EH block, where they will never be
37706 // executed.
37707 for (auto &II : reverse(*MBB)) {
37708 if (!II.isCall())
37709 continue;
37710
37711 DenseSet<Register> DefRegs;
37712 for (auto &MOp : II.operands())
37713 if (MOp.isReg())
37714 DefRegs.insert(MOp.getReg());
37715
37716 MachineInstrBuilder MIB(*MF, &II);
37717 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37718 Register Reg = SavedRegs[RegIdx];
37719 if (!DefRegs.contains(Reg))
37720 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37721 }
37722
37723 break;
37724 }
37725 }
37726
37727 // Mark all former landing pads as non-landing pads. The dispatch is the only
37728 // landing pad now.
37729 for (auto &LP : MBBLPads)
37730 LP->setIsEHPad(false);
37731
37732 // The instruction is gone now.
37733 MI.eraseFromParent();
37734 return BB;
37735}
37736
37737MachineBasicBlock *
37738X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
37739 MachineBasicBlock *BB) const {
37740 // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing
37741 // calls may require proper stack alignment.
37742 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37743 const MIMetadata MIMD(MI);
37744 MachineFunction &MF = *BB->getParent();
37745
37746 // Emit CALLSEQ_START right before the instruction.
37747 MF.getFrameInfo().setAdjustsStack(true);
37748 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37749 MachineInstrBuilder CallseqStart =
37750 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37751 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37752
37753 // Emit CALLSEQ_END right after the instruction.
37754 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37755 MachineInstrBuilder CallseqEnd =
37756 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
37757 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37758
37759 return BB;
37760}
37761
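// Expand pseudo instructions that are marked usesCustomInserter: each case
// below rewrites one pseudo into real machine instructions (possibly with new
// control flow) and erases the original instruction.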
37762MachineBasicBlock *
37763X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37764 MachineBasicBlock *BB) const {
37765 MachineFunction *MF = BB->getParent();
37766 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37767 const MIMetadata MIMD(MI);
37768
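// Helpers that map the tile-number immediates carried by AMX pseudos onto
// physical TMM registers (or TMM register pairs).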
37769 auto TMMImmToTMMReg = [](unsigned Imm) {
37770 assert(Imm < 8 && "Illegal tmm index");
37771 return X86::TMM0 + Imm;
37772 };
37773 auto TMMImmToTMMPair = [](unsigned Imm) {
37774 assert(Imm < 8 && "Illegal tmm pair index.");
37775 return X86::TMM0_TMM1 + Imm / 2;
37776 };
37777 switch (MI.getOpcode()) {
37778 default:
37779 llvm_unreachable("Unexpected instr type to insert");
37780 case X86::INDIRECT_THUNK_CALL32:
37781 case X86::INDIRECT_THUNK_CALL64:
37782 case X86::INDIRECT_THUNK_TCRETURN32:
37783 case X86::INDIRECT_THUNK_TCRETURN64:
37784 return EmitLoweredIndirectThunk(MI, BB);
37785 case X86::CATCHRET:
37786 return EmitLoweredCatchRet(MI, BB);
37787 case X86::SEG_ALLOCA_32:
37788 case X86::SEG_ALLOCA_64:
37789 return EmitLoweredSegAlloca(MI, BB);
37790 case X86::PROBED_ALLOCA_32:
37791 case X86::PROBED_ALLOCA_64:
37792 return EmitLoweredProbedAlloca(MI, BB);
37793 case X86::TLSCall_32:
37794 case X86::TLSCall_64:
37795 return EmitLoweredTLSCall(MI, BB);
37796 case X86::CMOV_FR16:
37797 case X86::CMOV_FR16X:
37798 case X86::CMOV_FR32:
37799 case X86::CMOV_FR32X:
37800 case X86::CMOV_FR64:
37801 case X86::CMOV_FR64X:
37802 case X86::CMOV_GR8:
37803 case X86::CMOV_GR16:
37804 case X86::CMOV_GR32:
37805 case X86::CMOV_RFP32:
37806 case X86::CMOV_RFP64:
37807 case X86::CMOV_RFP80:
37808 case X86::CMOV_VR64:
37809 case X86::CMOV_VR128:
37810 case X86::CMOV_VR128X:
37811 case X86::CMOV_VR256:
37812 case X86::CMOV_VR256X:
37813 case X86::CMOV_VR512:
37814 case X86::CMOV_VK1:
37815 case X86::CMOV_VK2:
37816 case X86::CMOV_VK4:
37817 case X86::CMOV_VK8:
37818 case X86::CMOV_VK16:
37819 case X86::CMOV_VK32:
37820 case X86::CMOV_VK64:
37821 return EmitLoweredSelect(MI, BB);
37822
37823 case X86::FP80_ADDr:
37824 case X86::FP80_ADDm32: {
37825 // Change the floating point control register to use double extended
37826 // precision when performing the addition.
37827 int OrigCWFrameIdx =
37828 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37829 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37830 OrigCWFrameIdx);
37831
37832 // Load the old value of the control word...
37833 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37834 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37835 OrigCWFrameIdx);
37836
37837 // OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended
37838 // precision.
37839 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37840 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37841 .addReg(OldCW, RegState::Kill)
37842 .addImm(0x300);
37843
37844 // Extract to 16 bits.
37845 Register NewCW16 =
37846 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37847 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37848 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37849
37850 // Prepare memory for FLDCW.
37851 int NewCWFrameIdx =
37852 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37853 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37854 NewCWFrameIdx)
37855 .addReg(NewCW16, RegState::Kill);
37856
37857 // Reload the modified control word now...
37858 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37859 NewCWFrameIdx);
37860
37861 // Do the addition.
37862 if (MI.getOpcode() == X86::FP80_ADDr) {
37863 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
37864 .add(MI.getOperand(0))
37865 .add(MI.getOperand(1))
37866 .add(MI.getOperand(2));
37867 } else {
37868 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
37869 .add(MI.getOperand(0))
37870 .add(MI.getOperand(1))
37871 .add(MI.getOperand(2))
37872 .add(MI.getOperand(3))
37873 .add(MI.getOperand(4))
37874 .add(MI.getOperand(5))
37875 .add(MI.getOperand(6));
37876 }
37877
37878 // Reload the original control word now.
37879 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37880 OrigCWFrameIdx);
37881
37882 MI.eraseFromParent(); // The pseudo instruction is gone now.
37883 return BB;
37884 }
37885
37886 case X86::FP32_TO_INT16_IN_MEM:
37887 case X86::FP32_TO_INT32_IN_MEM:
37888 case X86::FP32_TO_INT64_IN_MEM:
37889 case X86::FP64_TO_INT16_IN_MEM:
37890 case X86::FP64_TO_INT32_IN_MEM:
37891 case X86::FP64_TO_INT64_IN_MEM:
37892 case X86::FP80_TO_INT16_IN_MEM:
37893 case X86::FP80_TO_INT32_IN_MEM:
37894 case X86::FP80_TO_INT64_IN_MEM: {
37895 // Change the floating point control register to use "round towards zero"
37896 // mode when truncating to an integer value.
37897 int OrigCWFrameIdx =
37898 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37899 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37900 OrigCWFrameIdx);
37901
37902 // Load the old value of the control word...
37903 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37904 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37905 OrigCWFrameIdx);
37906
37907 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
37908 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37909 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37910 .addReg(OldCW, RegState::Kill).addImm(0xC00);
37911
37912 // Extract to 16 bits.
37913 Register NewCW16 =
37914 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37915 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37916 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37917
37918 // Prepare memory for FLDCW.
37919 int NewCWFrameIdx =
37920 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37921 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37922 NewCWFrameIdx)
37923 .addReg(NewCW16, RegState::Kill);
37924
37925 // Reload the modified control word now...
37926 addFrameReference(BuildMI(*BB, MI, MIMD,
37927 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37928
37929 // Get the X86 opcode to use.
37930 unsigned Opc;
37931 switch (MI.getOpcode()) {
37932 // clang-format off
37933 default: llvm_unreachable("illegal opcode!");
37934 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37935 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37936 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37937 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37938 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37939 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37940 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37941 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37942 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37943 // clang-format on
37944 }
37945
37946 X86AddressMode AM = getAddressFromInstr(&MI, 0);
37947 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
37948 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37949
37950 // Reload the original control word now.
37951 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37952 OrigCWFrameIdx);
37953
37954 MI.eraseFromParent(); // The pseudo instruction is gone now.
37955 return BB;
37956 }
37957
37958 // xbegin
37959 case X86::XBEGIN:
37960 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37961
37962 case X86::VAARG_64:
37963 case X86::VAARG_X32:
37964 return EmitVAARGWithCustomInserter(MI, BB);
37965
37966 case X86::EH_SjLj_SetJmp32:
37967 case X86::EH_SjLj_SetJmp64:
37968 return emitEHSjLjSetJmp(MI, BB);
37969
37970 case X86::EH_SjLj_LongJmp32:
37971 case X86::EH_SjLj_LongJmp64:
37972 return emitEHSjLjLongJmp(MI, BB);
37973
37974 case X86::Int_eh_sjlj_setup_dispatch:
37975 return EmitSjLjDispatchBlock(MI, BB);
37976
37977 case TargetOpcode::STATEPOINT:
37978 // As an implementation detail, STATEPOINT shares the STACKMAP format at
37979 // this point in the process. We diverge later.
37980 return emitPatchPoint(MI, BB);
37981
37982 case TargetOpcode::STACKMAP:
37983 case TargetOpcode::PATCHPOINT:
37984 return emitPatchPoint(MI, BB);
37985
37986 case TargetOpcode::PATCHABLE_EVENT_CALL:
37987 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
37988 return emitPatchableEventCall(MI, BB);
37989
37990 case X86::LCMPXCHG8B: {
37991 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37992 // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
37993 // requires a memory operand. If it happens that current architecture is
37994 // i686 and for current function we need a base pointer
37995 // - which is ESI for i686 - register allocator would not be able to
37996 // allocate registers for an address in form of X(%reg, %reg, Y)
37997 // - there never would be enough unreserved registers during regalloc
37998 // (without the need for base ptr the only option would be X(%edi, %esi, Y).
37999 // We are giving a hand to register allocator by precomputing the address in
38000 // a new vreg using LEA.
38001
38002 // If it is not i686 or there is no base pointer - nothing to do here.
38003 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
38004 return BB;
38005
38006 // Even though this code does not necessarily needs the base pointer to
38007 // be ESI, we check for that. The reason: if this assert fails, there are
38008 // some changes happened in the compiler base pointer handling, which most
38009 // probably have to be addressed somehow here.
38010 assert(TRI->getBaseRegister() == X86::ESI &&
38011 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
38012 "base pointer in mind");
38013
38014 MachineRegisterInfo &MRI = MF->getRegInfo();
38015 MVT SPTy = getPointerTy(MF->getDataLayout());
38016 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38017 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38018
38019 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38020 // Regalloc does not need any help when the memory operand of CMPXCHG8B
38021 // does not use index register.
38022 if (AM.IndexReg == X86::NoRegister)
38023 return BB;
38024
38025 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38026 // four operand definitions that are E[ABCD] registers. We skip them and
38027 // then insert the LEA.
38028 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38029 while (RMBBI != BB->rend() &&
38030 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
38031 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
38032 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
38033 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
38034 ++RMBBI;
38035 }
38036 MachineBasicBlock::iterator MBBI(RMBBI.getReverse());
38037 addFullAddress(
38038 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
38039
38040 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38041
38042 return BB;
38043 }
38044 case X86::LCMPXCHG16B_NO_RBX: {
38045 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38046 Register BasePtr = TRI->getBaseRegister();
38047 if (TRI->hasBasePointer(*MF) &&
38048 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38049 if (!BB->isLiveIn(BasePtr))
38050 BB->addLiveIn(BasePtr);
38051 // Save RBX into a virtual register.
38052 Register SaveRBX =
38053 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38054 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38055 .addReg(X86::RBX);
38056 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38057 MachineInstrBuilder MIB =
38058 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38059 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38060 MIB.add(MI.getOperand(Idx));
38061 MIB.add(MI.getOperand(X86::AddrNumOperands));
38062 MIB.addReg(SaveRBX);
38063 } else {
38064 // Simple case, just copy the virtual register to RBX.
38065 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
38066 .add(MI.getOperand(X86::AddrNumOperands));
38067 MachineInstrBuilder MIB =
38068 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
38069 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38070 MIB.add(MI.getOperand(Idx));
38071 }
38072 MI.eraseFromParent();
38073 return BB;
38074 }
38075 case X86::MWAITX: {
38076 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38077 Register BasePtr = TRI->getBaseRegister();
38078 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38079 // If no need to save the base pointer, we generate MWAITXrrr,
38080 // else we generate pseudo MWAITX_SAVE_RBX.
38081 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38082 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38083 .addReg(MI.getOperand(0).getReg());
38084 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38085 .addReg(MI.getOperand(1).getReg());
38086 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
38087 .addReg(MI.getOperand(2).getReg());
38088 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
38089 MI.eraseFromParent();
38090 } else {
38091 if (!BB->isLiveIn(BasePtr)) {
38092 BB->addLiveIn(BasePtr);
38093 }
38094 // Parameters can be copied into ECX and EAX but not EBX yet.
38095 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38096 .addReg(MI.getOperand(0).getReg());
38097 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38098 .addReg(MI.getOperand(1).getReg());
38099 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38100 // Save RBX into a virtual register.
38101 Register SaveRBX =
38102 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38103 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38104 .addReg(X86::RBX);
38105 // Generate mwaitx pseudo.
38106 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38107 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
38108 .addDef(Dst) // Destination tied in with SaveRBX.
38109 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38110 .addUse(SaveRBX); // Save of base pointer.
38111 MI.eraseFromParent();
38112 }
38113 return BB;
38114 }
38115 case TargetOpcode::PREALLOCATED_SETUP: {
38116 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38117 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38118 MFI->setHasPreallocatedCall(true);
38119 int64_t PreallocatedId = MI.getOperand(0).getImm();
38120 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38121 assert(StackAdjustment != 0 && "0 stack adjustment");
38122 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38123 << StackAdjustment << "\n");
38124 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
38125 .addReg(X86::ESP)
38126 .addImm(StackAdjustment);
38127 MI.eraseFromParent();
38128 return BB;
38129 }
38130 case TargetOpcode::PREALLOCATED_ARG: {
38131 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38132 int64_t PreallocatedId = MI.getOperand(1).getImm();
38133 int64_t ArgIdx = MI.getOperand(2).getImm();
38134 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38135 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38136 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38137 << ", arg offset " << ArgOffset << "\n");
38138 // stack pointer + offset
38139 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
38140 MI.getOperand(0).getReg()),
38141 X86::ESP, false, ArgOffset);
38142 MI.eraseFromParent();
38143 return BB;
38144 }
38145 case X86::PTDPBSSD:
38146 case X86::PTDPBSUD:
38147 case X86::PTDPBUSD:
38148 case X86::PTDPBUUD:
38149 case X86::PTDPBF16PS:
38150 case X86::PTDPFP16PS:
38151 case X86::PTCMMIMFP16PS:
38152 case X86::PTCMMRLFP16PS:
38153 case X86::PTDPBF8PS:
38154 case X86::PTDPBHF8PS:
38155 case X86::PTDPHBF8PS:
38156 case X86::PTDPHF8PS:
38157 case X86::PTTDPBF16PS:
38158 case X86::PTTDPFP16PS:
38159 case X86::PTTCMMIMFP16PS:
38160 case X86::PTTCMMRLFP16PS:
38161 case X86::PTCONJTCMMIMFP16PS:
38162 case X86::PTMMULTF32PS:
38163 case X86::PTTMMULTF32PS: {
38164 unsigned Opc;
38165 switch (MI.getOpcode()) {
38166 default: llvm_unreachable("illegal opcode!");
38167 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38168 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38169 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38170 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38171 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38172 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38173 case X86::PTCMMIMFP16PS:
38174 Opc = X86::TCMMIMFP16PS;
38175 break;
38176 case X86::PTCMMRLFP16PS:
38177 Opc = X86::TCMMRLFP16PS;
38178 break;
38179 case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
38180 case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
38181 case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
38182 case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
38183 case X86::PTTDPBF16PS:
38184 Opc = X86::TTDPBF16PS;
38185 break;
38186 case X86::PTTDPFP16PS:
38187 Opc = X86::TTDPFP16PS;
38188 break;
38189 case X86::PTTCMMIMFP16PS:
38190 Opc = X86::TTCMMIMFP16PS;
38191 break;
38192 case X86::PTTCMMRLFP16PS:
38193 Opc = X86::TTCMMRLFP16PS;
38194 break;
38195 case X86::PTCONJTCMMIMFP16PS:
38196 Opc = X86::TCONJTCMMIMFP16PS;
38197 break;
38198 case X86::PTMMULTF32PS:
38199 Opc = X86::TMMULTF32PS;
38200 break;
38201 case X86::PTTMMULTF32PS:
38202 Opc = X86::TTMMULTF32PS;
38203 break;
38204 }
38205
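// Operand 0 selects the destination/accumulator tile; operands 1 and 2 select
// the source tiles.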
38206 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38207 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38208 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38209 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38210 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38211
38212 MI.eraseFromParent(); // The pseudo is gone now.
38213 return BB;
38214 }
38215 case X86::PTILEZERO: {
38216 unsigned Imm = MI.getOperand(0).getImm();
38217 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38218 MI.eraseFromParent(); // The pseudo is gone now.
38219 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38220 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
38221 return BB;
38222 }
38223 case X86::PTILEZEROV: {
38224 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38225 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
38226 return BB;
38227 }
38228 case X86::PTILELOADDRS:
38229 case X86::PTILELOADDRST1:
38230 case X86::PTILELOADD:
38231 case X86::PTILELOADDT1:
38232 case X86::PTILESTORED: {
38233 unsigned Opc;
38234 switch (MI.getOpcode()) {
38235 default: llvm_unreachable("illegal opcode!");
38236#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38237 case X86::PTILELOADD:
38238 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
38239 break;
38240 case X86::PTILELOADDT1:
38241 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
38242 break;
38243 case X86::PTILESTORED:
38244 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
38245 break;
38246 case X86::PTILELOADDRS:
38247 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS);
38248 break;
38249 case X86::PTILELOADDRST1:
38250 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1);
38251 break;
38252 }
38253#undef GET_EGPR_IF_ENABLED
38254
38255 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38256 unsigned CurOp = 0;
38257 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
38258 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38259 RegState::Define);
38260
38261 MIB.add(MI.getOperand(CurOp++)); // base
38262 MIB.add(MI.getOperand(CurOp++)); // scale
38263 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38264 MIB.add(MI.getOperand(CurOp++)); // displacement
38265 MIB.add(MI.getOperand(CurOp++)); // segment
38266
38267 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
38268 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38269 RegState::Undef);
38270
38271 MI.eraseFromParent(); // The pseudo is gone now.
38272 return BB;
38273 }
38274 case X86::PT2RPNTLVWZ0:
38275 case X86::PT2RPNTLVWZ0T1:
38276 case X86::PT2RPNTLVWZ1:
38277 case X86::PT2RPNTLVWZ1T1:
38278 case X86::PT2RPNTLVWZ0RS:
38279 case X86::PT2RPNTLVWZ0RST1:
38280 case X86::PT2RPNTLVWZ1RS:
38281 case X86::PT2RPNTLVWZ1RST1: {
38282 const DebugLoc &DL = MI.getDebugLoc();
38283 unsigned Opc;
38284#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38285 switch (MI.getOpcode()) {
38286 default:
38287 llvm_unreachable("Unexpected instruction!");
38288 case X86::PT2RPNTLVWZ0:
38289 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
38290 break;
38291 case X86::PT2RPNTLVWZ0T1:
38292 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
38293 break;
38294 case X86::PT2RPNTLVWZ1:
38295 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
38296 break;
38297 case X86::PT2RPNTLVWZ1T1:
38298 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
38299 break;
38300 case X86::PT2RPNTLVWZ0RS:
38301 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
38302 break;
38303 case X86::PT2RPNTLVWZ0RST1:
38304 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
38305 break;
38306 case X86::PT2RPNTLVWZ1RS:
38307 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
38308 break;
38309 case X86::PT2RPNTLVWZ1RST1:
38310 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
38311 break;
38312 }
38313#undef GET_EGPR_IF_ENABLED
38314 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38315 MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
38316
38317 MIB.add(MI.getOperand(1)); // base
38318 MIB.add(MI.getOperand(2)); // scale
38319 MIB.add(MI.getOperand(3)); // index
38320 MIB.add(MI.getOperand(4)); // displacement
38321 MIB.add(MI.getOperand(5)); // segment
38322 MI.eraseFromParent(); // The pseudo is gone now.
38323 return BB;
38324 }
38325 case X86::PTTRANSPOSED:
38326 case X86::PTCONJTFP16: {
38327 const DebugLoc &DL = MI.getDebugLoc();
38328 unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
38329 : X86::TCONJTFP16;
38330
38331 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38332 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38333 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38334
38335 MI.eraseFromParent(); // The pseudo is gone now.
38336 return BB;
38337 }
38338 case X86::PTCVTROWPS2BF16Hrri:
38339 case X86::PTCVTROWPS2BF16Lrri:
38340 case X86::PTCVTROWPS2PHHrri:
38341 case X86::PTCVTROWPS2PHLrri:
38342 case X86::PTCVTROWD2PSrri:
38343 case X86::PTILEMOVROWrri: {
38344 const DebugLoc &DL = MI.getDebugLoc();
38345 unsigned Opc;
38346 switch (MI.getOpcode()) {
38347 default:
38348 llvm_unreachable("Unexpected instruction!");
38349 case X86::PTCVTROWD2PSrri:
38350 Opc = X86::TCVTROWD2PSrri;
38351 break;
38352 case X86::PTCVTROWPS2BF16Hrri:
38353 Opc = X86::TCVTROWPS2BF16Hrri;
38354 break;
38355 case X86::PTCVTROWPS2PHHrri:
38356 Opc = X86::TCVTROWPS2PHHrri;
38357 break;
38358 case X86::PTCVTROWPS2BF16Lrri:
38359 Opc = X86::TCVTROWPS2BF16Lrri;
38360 break;
38361 case X86::PTCVTROWPS2PHLrri:
38362 Opc = X86::TCVTROWPS2PHLrri;
38363 break;
38364 case X86::PTILEMOVROWrri:
38365 Opc = X86::TILEMOVROWrri;
38366 break;
38367 }
38368 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38369 MIB.add(MI.getOperand(0));
38370 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38371 MIB.addImm(MI.getOperand(2).getImm());
38372
38373 MI.eraseFromParent(); // The pseudo is gone now.
38374 return BB;
38375 }
38376 case X86::PTCVTROWPS2BF16Hrre:
38377 case X86::PTCVTROWPS2BF16Lrre:
38378 case X86::PTCVTROWPS2PHHrre:
38379 case X86::PTCVTROWPS2PHLrre:
38380 case X86::PTCVTROWD2PSrre:
38381 case X86::PTILEMOVROWrre: {
38382 const DebugLoc &DL = MI.getDebugLoc();
38383 unsigned Opc;
38384 switch (MI.getOpcode()) {
38385 default:
38386 llvm_unreachable("Unexpected instruction!");
38387 case X86::PTCVTROWD2PSrre:
38388 Opc = X86::TCVTROWD2PSrre;
38389 break;
38390 case X86::PTCVTROWPS2BF16Hrre:
38391 Opc = X86::TCVTROWPS2BF16Hrre;
38392 break;
38393 case X86::PTCVTROWPS2BF16Lrre:
38394 Opc = X86::TCVTROWPS2BF16Lrre;
38395 break;
38396 case X86::PTCVTROWPS2PHHrre:
38397 Opc = X86::TCVTROWPS2PHHrre;
38398 break;
38399 case X86::PTCVTROWPS2PHLrre:
38400 Opc = X86::TCVTROWPS2PHLrre;
38401 break;
38402 case X86::PTILEMOVROWrre:
38403 Opc = X86::TILEMOVROWrre;
38404 break;
38405 }
38406 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38407 MIB.add(MI.getOperand(0));
38408 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38409 MIB.add(MI.getOperand(2));
38410
38411 MI.eraseFromParent(); // The pseudo is gone now.
38412 return BB;
38413 }
38414 }
38415}
38416
38417//===----------------------------------------------------------------------===//
38418// X86 Optimization Hooks
38419//===----------------------------------------------------------------------===//
38420
38421bool
38422X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38423 const APInt &DemandedBits,
38424 const APInt &DemandedElts,
38425 TargetLoweringOpt &TLO) const {
38426 EVT VT = Op.getValueType();
38427 unsigned Opcode = Op.getOpcode();
38428 unsigned EltSize = VT.getScalarSizeInBits();
38429
38430 if (VT.isVector()) {
38431 // If the constant is only all signbits in the active bits, then we should
38432 // extend it to the entire constant to allow it act as a boolean constant
38433 // vector.
38434 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38435 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38436 return false;
38437 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38438 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38439 continue;
38440 const APInt &Val = V.getConstantOperandAPInt(i);
38441 if (Val.getBitWidth() > Val.getNumSignBits() &&
38442 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38443 return true;
38444 }
38445 return false;
38446 };
38447 // For vectors - if we have a constant, then try to sign extend.
38448 // TODO: Handle AND cases.
38449 unsigned ActiveBits = DemandedBits.getActiveBits();
38450 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38451 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
38452 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38453 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38454 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38455 VT.getVectorNumElements());
38456 SDValue NewC =
38457 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38458 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38459 SDValue NewOp =
38460 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38461 return TLO.CombineTo(Op, NewOp);
38462 }
38463 return false;
38464 }
38465
38466 // Only optimize Ands to prevent shrinking a constant that could be
38467 // matched by movzx.
38468 if (Opcode != ISD::AND)
38469 return false;
38470
38471 // Make sure the RHS really is a constant.
38472 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38473 if (!C)
38474 return false;
38475
38476 const APInt &Mask = C->getAPIntValue();
38477
38478 // Clear all non-demanded bits initially.
38479 APInt ShrunkMask = Mask & DemandedBits;
38480
38481 // Find the width of the shrunk mask.
38482 unsigned Width = ShrunkMask.getActiveBits();
38483
38484 // If the mask is all 0s there's nothing to do here.
38485 if (Width == 0)
38486 return false;
38487
38488 // Find the next power of 2 width, rounding up to a byte.
38489 Width = llvm::bit_ceil(std::max(Width, 8U));
38490 // Truncate the width to size to handle illegal types.
38491 Width = std::min(Width, EltSize);
38492
38493 // Calculate a possible zero extend mask for this constant.
38494 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
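// For example, with Mask = 0xFFF and DemandedBits = 0xFF: ShrunkMask = 0xFF,
// Width rounds up to 8, so ZeroExtendMask = 0xFF. It differs from Mask but is
// covered by Mask plus the non-demanded bits, so the AND mask is rewritten to
// 0xFF and can be matched as a movzx.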
38495
38496 // If we aren't changing the mask, just return true to keep it and prevent
38497 // the caller from optimizing.
38498 if (ZeroExtendMask == Mask)
38499 return true;
38500
38501 // Make sure the new mask can be represented by a combination of mask bits
38502 // and non-demanded bits.
38503 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38504 return false;
38505
38506 // Replace the constant with the zero extend mask.
38507 SDLoc DL(Op);
38508 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38509 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38510 return TLO.CombineTo(Op, NewOp);
38511}
38512
38513static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
38514 KnownBits &Known,
38515 const APInt &DemandedElts,
38516 const SelectionDAG &DAG, unsigned Depth) {
38517 KnownBits Known2;
38518 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38519 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38520 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
38521 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
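// Per-element absolute difference of the unsigned byte inputs, widened to the
// i16 partial-sum width.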
38522 Known = KnownBits::abdu(Known, Known2).zext(16);
38523 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
38524 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38525 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38526 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38527 Known = Known.zext(64);
38528}
38529
38530static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,
38531 KnownBits &Known,
38532 const APInt &DemandedElts,
38533 const SelectionDAG &DAG,
38534 unsigned Depth) {
38535 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38536
38537 // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
38538 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38539 APInt DemandedLoElts =
38540 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38541 APInt DemandedHiElts =
38542 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38543 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38544 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38545 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38546 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38547 KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
38548 KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
38549 Known = KnownBits::add(Lo, Hi, /*NSW=*/false, /*NUW=*/false);
38550}
38551
38552static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS,
38553 KnownBits &Known,
38554 const APInt &DemandedElts,
38555 const SelectionDAG &DAG,
38556 unsigned Depth) {
38557 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38558
38559 // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
38560 // pairs.
38561 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38562 APInt DemandedLoElts =
38563 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38564 APInt DemandedHiElts =
38565 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38566 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38567 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38568 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38569 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38570 KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
38571 KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
38572 Known = KnownBits::sadd_sat(Lo, Hi);
38573}
38574
38575static KnownBits computeKnownBitsForHorizontalOperation(
38576 const SDValue Op, const APInt &DemandedElts, unsigned Depth,
38577 const SelectionDAG &DAG,
38578 const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
38579 KnownBitsFunc) {
38580 APInt DemandedEltsLHS, DemandedEltsRHS;
38581 getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(),
38582 DemandedElts, DemandedEltsLHS,
38583 DemandedEltsRHS);
38584
38585 const auto ComputeForSingleOpFunc =
38586 [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
38587 return KnownBitsFunc(
38588 DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
38589 DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
38590 };
38591
38592 if (DemandedEltsRHS.isZero())
38593 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS);
38594 if (DemandedEltsLHS.isZero())
38595 return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS);
38596
38597 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS)
38598 .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS));
38599}
38600
38601void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38602 KnownBits &Known,
38603 const APInt &DemandedElts,
38604 const SelectionDAG &DAG,
38605 unsigned Depth) const {
38606 unsigned BitWidth = Known.getBitWidth();
38607 unsigned NumElts = DemandedElts.getBitWidth();
38608 unsigned Opc = Op.getOpcode();
38609 EVT VT = Op.getValueType();
38610 assert((Opc >= ISD::BUILTIN_OP_END ||
38611 Opc == ISD::INTRINSIC_WO_CHAIN ||
38612 Opc == ISD::INTRINSIC_W_CHAIN ||
38613 Opc == ISD::INTRINSIC_VOID) &&
38614 "Should use MaskedValueIsZero if you don't know whether Op"
38615 " is a target node!");
38616
38617 Known.resetAll();
38618 switch (Opc) {
38619 default: break;
38620 case X86ISD::MUL_IMM: {
38621 KnownBits Known2;
38622 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38623 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38624 Known = KnownBits::mul(Known, Known2);
38625 break;
38626 }
38627 case X86ISD::BSF: {
38628 Known.Zero.setBitsFrom(Log2_32(BitWidth));
38629
38630 KnownBits Known2;
38631 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38632 if (Known2.isNonZero()) {
38633 // If we have a known 1, its position is our upper bound.
38634 unsigned PossibleTZ = Known2.countMaxTrailingZeros();
38635 unsigned LowBits = llvm::bit_width(PossibleTZ);
38636 Known.Zero.setBitsFrom(LowBits);
38637 } else if (!Op.getOperand(0).isUndef()) {
38638 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38639 Known = Known.intersectWith(Known2);
38640 }
38641 break;
38642 }
38643 case X86ISD::BSR: {
38644 // TODO: Bound with input known bits?
38645 Known.Zero.setBitsFrom(Log2_32(BitWidth));
38646
38647 if (!Op.getOperand(0).isUndef() &&
38648 !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38649 KnownBits Known2;
38650 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38651 Known = Known.intersectWith(Known2);
38652 }
38653 break;
38654 }
38655 case X86ISD::SETCC:
38656 Known.Zero.setBitsFrom(1);
38657 break;
38658 case X86ISD::MOVMSK: {
38659 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38660 Known.Zero.setBitsFrom(NumLoBits);
38661 break;
38662 }
38663 case X86ISD::PEXTRB:
38664 case X86ISD::PEXTRW: {
38665 SDValue Src = Op.getOperand(0);
38666 EVT SrcVT = Src.getValueType();
38667 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38668 Op.getConstantOperandVal(1));
38669 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38670 Known = Known.anyextOrTrunc(BitWidth);
38671 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38672 break;
38673 }
38674 case X86ISD::VSRAI:
38675 case X86ISD::VSHLI:
38676 case X86ISD::VSRLI: {
38677 unsigned ShAmt = Op.getConstantOperandVal(1);
38678 if (ShAmt >= VT.getScalarSizeInBits()) {
38679 // Out of range logical bit shifts are guaranteed to be zero.
38680 // Out of range arithmetic bit shifts splat the sign bit.
38681 if (Opc != X86ISD::VSRAI) {
38682 Known.setAllZero();
38683 break;
38684 }
38685
38686 ShAmt = VT.getScalarSizeInBits() - 1;
38687 }
38688
38689 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38690 if (Opc == X86ISD::VSHLI) {
38691 Known <<= ShAmt;
38692 // Low bits are known zero.
38693 Known.Zero.setLowBits(ShAmt);
38694 } else if (Opc == X86ISD::VSRLI) {
38695 Known >>= ShAmt;
38696 // High bits are known zero.
38697 Known.Zero.setHighBits(ShAmt);
38698 } else {
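// Arithmetic shift: the sign bit is replicated into the vacated high bits.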
38699 Known.Zero.ashrInPlace(ShAmt);
38700 Known.One.ashrInPlace(ShAmt);
38701 }
38702 break;
38703 }
38704 case X86ISD::PACKUS: {
38705 // PACKUS is just a truncation if the upper half is zero.
38706 APInt DemandedLHS, DemandedRHS;
38707 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38708
38709 Known.One = APInt::getAllOnes(BitWidth * 2);
38710 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38711
38712 KnownBits Known2;
38713 if (!!DemandedLHS) {
38714 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38715 Known = Known.intersectWith(Known2);
38716 }
38717 if (!!DemandedRHS) {
38718 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38719 Known = Known.intersectWith(Known2);
38720 }
38721
38722 if (Known.countMinLeadingZeros() < BitWidth)
38723 Known.resetAll();
38724 Known = Known.trunc(BitWidth);
38725 break;
38726 }
38727 case X86ISD::PSHUFB: {
38728 SDValue Src = Op.getOperand(0);
38729 SDValue Idx = Op.getOperand(1);
38730
38731 // If the index vector is never negative (MSB is zero), then all elements
38732 // come from the source vector. This is useful for cases where
38733 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
38734 // below will handle the more common constant shuffle mask case.
38735 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
38736 if (KnownIdx.isNonNegative())
38737 Known = DAG.computeKnownBits(Src, Depth + 1);
38738 break;
38739 }
38740 case X86ISD::VBROADCAST: {
38741 SDValue Src = Op.getOperand(0);
38742 if (!Src.getSimpleValueType().isVector()) {
38743 Known = DAG.computeKnownBits(Src, Depth + 1);
38744 return;
38745 }
38746 break;
38747 }
38748 case X86ISD::AND: {
38749 if (Op.getResNo() == 0) {
38750 KnownBits Known2;
38751 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38752 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38753 Known &= Known2;
38754 }
38755 break;
38756 }
38757 case X86ISD::ANDNP: {
38758 KnownBits Known2;
38759 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38760 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38761
38762 // ANDNP = (~X & Y);
38763 Known.One &= Known2.Zero;
38764 Known.Zero |= Known2.One;
38765 break;
38766 }
38767 case X86ISD::FOR: {
38768 KnownBits Known2;
38769 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38770 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38771
38772 Known |= Known2;
38773 break;
38774 }
38775 case X86ISD::PSADBW: {
38776 SDValue LHS = Op.getOperand(0);
38777 SDValue RHS = Op.getOperand(1);
38778 assert(VT.getScalarType() == MVT::i64 &&
38779 LHS.getValueType() == RHS.getValueType() &&
38780 LHS.getValueType().getScalarType() == MVT::i8 &&
38781 "Unexpected PSADBW types");
38782 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38783 break;
38784 }
38785 case X86ISD::PCMPGT:
38786 case X86ISD::PCMPEQ: {
38787 KnownBits KnownLhs =
38788 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38789 KnownBits KnownRhs =
38790 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38791 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38792 ? KnownBits::eq(KnownLhs, KnownRhs)
38793 : KnownBits::sgt(KnownLhs, KnownRhs);
38794 if (Res) {
38795 if (*Res)
38796 Known.setAllOnes();
38797 else
38798 Known.setAllZero();
38799 }
38800 break;
38801 }
38802 case X86ISD::VPMADDWD: {
38803 SDValue LHS = Op.getOperand(0);
38804 SDValue RHS = Op.getOperand(1);
38805 assert(VT.getVectorElementType() == MVT::i32 &&
38806 LHS.getValueType() == RHS.getValueType() &&
38807 LHS.getValueType().getVectorElementType() == MVT::i16 &&
38808 "Unexpected PMADDWD types");
38809 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38810 break;
38811 }
38812 case X86ISD::VPMADDUBSW: {
38813 SDValue LHS = Op.getOperand(0);
38814 SDValue RHS = Op.getOperand(1);
38815 assert(VT.getVectorElementType() == MVT::i16 &&
38816 LHS.getValueType() == RHS.getValueType() &&
38817 LHS.getValueType().getVectorElementType() == MVT::i8 &&
38818 "Unexpected PMADDUBSW types");
38819 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38820 break;
38821 }
38822 case X86ISD::PMULUDQ: {
38823 KnownBits Known2;
38824 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38825 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38826
38827 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38828 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38829 Known = KnownBits::mul(Known, Known2);
38830 break;
38831 }
38832 case X86ISD::CMOV: {
38833 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38834 // If we don't know any bits, early out.
38835 if (Known.isUnknown())
38836 break;
38837 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38838
38839 // Only known if known in both the LHS and RHS.
38840 Known = Known.intersectWith(Known2);
38841 break;
38842 }
38843 case X86ISD::BEXTR:
38844 case X86ISD::BEXTRI: {
38845 SDValue Op0 = Op.getOperand(0);
38846 SDValue Op1 = Op.getOperand(1);
38847
38848 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38849 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38850 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38851
38852 // If the length is 0, the result is 0.
38853 if (Length == 0) {
38854 Known.setAllZero();
38855 break;
38856 }
38857
38858 if ((Shift + Length) <= BitWidth) {
38859 Known = DAG.computeKnownBits(Op0, Depth + 1);
38860 Known = Known.extractBits(Length, Shift);
38861 Known = Known.zextOrTrunc(BitWidth);
38862 }
38863 }
38864 break;
38865 }
38866 case X86ISD::PDEP: {
38867 KnownBits Known2;
38868 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38869 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38870 // Zeros are retained from the mask operand. But not ones.
38871 Known.One.clearAllBits();
38872 // The result will have at least as many trailing zeros as the non-mask
38873 // operand since bits can only map to the same or higher bit position.
38874 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38875 break;
38876 }
38877 case X86ISD::PEXT: {
38878 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38879 // The result has as many leading zeros as the number of zeroes in the mask.
38880 unsigned Count = Known.Zero.popcount();
38881 Known.Zero.setHighBits(Count);
38882 Known.One.clearAllBits();
38883 break;
38884 }
38885 case X86ISD::VTRUNC:
38886 case X86ISD::VTRUNCS:
38887 case X86ISD::VTRUNCUS:
38888 case X86ISD::CVTSI2P:
38889 case X86ISD::CVTUI2P:
38890 case X86ISD::CVTP2SI:
38891 case X86ISD::CVTP2UI:
38892 case X86ISD::MCVTP2SI:
38893 case X86ISD::MCVTP2UI:
38894 case X86ISD::CVTTP2SI:
38895 case X86ISD::CVTTP2UI:
38896 case X86ISD::MCVTTP2SI:
38897 case X86ISD::MCVTTP2UI:
38898 case X86ISD::MCVTSI2P:
38899 case X86ISD::MCVTUI2P:
38900 case X86ISD::VFPROUND:
38901 case X86ISD::VMFPROUND:
38902 case X86ISD::CVTPS2PH:
38903 case X86ISD::MCVTPS2PH:
38904 case X86ISD::MCVTTP2SIS:
38905 case X86ISD::MCVTTP2UIS: {
38906 // Truncations/Conversions - upper elements are known zero.
38907 EVT SrcVT = Op.getOperand(0).getValueType();
38908 if (SrcVT.isVector()) {
38909 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38910 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38911 Known.setAllZero();
38912 }
38913 break;
38914 }
38915 case X86ISD::STRICT_CVTTP2SI:
38916 case X86ISD::STRICT_CVTTP2UI:
38917 case X86ISD::STRICT_CVTSI2P:
38918 case X86ISD::STRICT_CVTUI2P:
38919 case X86ISD::STRICT_VFPROUND:
38920 case X86ISD::STRICT_CVTPS2PH: {
38921 // Strict Conversions - upper elements are known zero.
38922 EVT SrcVT = Op.getOperand(1).getValueType();
38923 if (SrcVT.isVector()) {
38924 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38925 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38926 Known.setAllZero();
38927 }
38928 break;
38929 }
38930 case X86ISD::MOVQ2DQ: {
38931 // Move from MMX to XMM. Upper half of XMM should be 0.
38932 if (DemandedElts.countr_zero() >= (NumElts / 2))
38933 Known.setAllZero();
38934 break;
38935 }
38936 case X86ISD::VBROADCAST_LOAD: {
38937 APInt UndefElts;
38938 SmallVector<APInt, 16> EltBits;
38939 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38940 /*AllowWholeUndefs*/ false,
38941 /*AllowPartialUndefs*/ false)) {
38942 Known.Zero.setAllBits();
38943 Known.One.setAllBits();
38944 for (unsigned I = 0; I != NumElts; ++I) {
38945 if (!DemandedElts[I])
38946 continue;
38947 if (UndefElts[I]) {
38948 Known.resetAll();
38949 break;
38950 }
38951 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38952 Known = Known.intersectWith(Known2);
38953 }
38954 return;
38955 }
38956 break;
38957 }
38958 case X86ISD::HADD:
38959 case X86ISD::HSUB: {
38960 Known = computeKnownBitsForHorizontalOperation(
38961 Op, DemandedElts, Depth, DAG,
38962 [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
38963 return KnownBits::computeForAddSub(
38964 /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
38965 KnownLHS, KnownRHS);
38966 });
38967 break;
38968 }
38969 case ISD::INTRINSIC_WO_CHAIN: {
38970 switch (Op->getConstantOperandVal(0)) {
38971 case Intrinsic::x86_sse2_pmadd_wd:
38972 case Intrinsic::x86_avx2_pmadd_wd:
38973 case Intrinsic::x86_avx512_pmaddw_d_512: {
38974 SDValue LHS = Op.getOperand(1);
38975 SDValue RHS = Op.getOperand(2);
38976 assert(VT.getScalarType() == MVT::i32 &&
38977 LHS.getValueType() == RHS.getValueType() &&
38978 LHS.getValueType().getScalarType() == MVT::i16 &&
38979 "Unexpected PMADDWD types");
38980 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38981 break;
38982 }
38983 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
38984 case Intrinsic::x86_avx2_pmadd_ub_sw:
38985 case Intrinsic::x86_avx512_pmaddubs_w_512: {
38986 SDValue LHS = Op.getOperand(1);
38987 SDValue RHS = Op.getOperand(2);
38988 assert(VT.getScalarType() == MVT::i16 &&
38989 LHS.getValueType() == RHS.getValueType() &&
38990 LHS.getValueType().getScalarType() == MVT::i8 &&
38991 "Unexpected PMADDUBSW types");
38992 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38993 break;
38994 }
38995 case Intrinsic::x86_sse2_psad_bw:
38996 case Intrinsic::x86_avx2_psad_bw:
38997 case Intrinsic::x86_avx512_psad_bw_512: {
38998 SDValue LHS = Op.getOperand(1);
38999 SDValue RHS = Op.getOperand(2);
39000 assert(VT.getScalarType() == MVT::i64 &&
39001 LHS.getValueType() == RHS.getValueType() &&
39002 LHS.getValueType().getScalarType() == MVT::i8 &&
39003 "Unexpected PSADBW types");
39004 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
39005 break;
39006 }
39007 }
39008 break;
39009 }
39010 case X86ISD::VPMADD52L:
39011 case X86ISD::VPMADD52H: {
39012 assert(Op.getValueType().isVector() &&
39013 Op.getValueType().getScalarType() == MVT::i64 &&
39014 "Unexpected VPMADD52 type");
39015 KnownBits K0 =
39016 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
39017 KnownBits K1 =
39018 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
39019 KnownBits KAcc =
39020 DAG.computeKnownBits(Op.getOperand(2), DemandedElts, Depth + 1);
39021 K0 = K0.trunc(52);
39022 K1 = K1.trunc(52);
39023 KnownBits KnownMul = (Op.getOpcode() == X86ISD::VPMADD52L)
39024 ? KnownBits::mul(K0, K1)
39025 : KnownBits::mulhu(K0, K1);
39026 KnownMul = KnownMul.zext(64);
39027 Known = KnownBits::add(KAcc, KnownMul);
39028 return;
39029 }
39030 }
39031
39032 // Handle target shuffles.
39033 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39034 if (isTargetShuffle(Opc)) {
39035 SmallVector<int, 64> Mask;
39036 SmallVector<SDValue, 2> Ops;
39037 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39038 unsigned NumOps = Ops.size();
39039 unsigned NumElts = VT.getVectorNumElements();
39040 if (Mask.size() == NumElts) {
39041 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39042 Known.Zero.setAllBits(); Known.One.setAllBits();
39043 for (unsigned i = 0; i != NumElts; ++i) {
39044 if (!DemandedElts[i])
39045 continue;
39046 int M = Mask[i];
39047 if (M == SM_SentinelUndef) {
39048 // For UNDEF elements, we don't know anything about the common state
39049 // of the shuffle result.
39050 Known.resetAll();
39051 break;
39052 }
39053 if (M == SM_SentinelZero) {
39054 Known.One.clearAllBits();
39055 continue;
39056 }
39057 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39058 "Shuffle index out of range");
39059
39060 unsigned OpIdx = (unsigned)M / NumElts;
39061 unsigned EltIdx = (unsigned)M % NumElts;
39062 if (Ops[OpIdx].getValueType() != VT) {
39063 // TODO - handle target shuffle ops with different value types.
39064 Known.resetAll();
39065 break;
39066 }
39067 DemandedOps[OpIdx].setBit(EltIdx);
39068 }
39069 // Known bits are the values that are shared by every demanded element.
39070 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
39071 if (!DemandedOps[i])
39072 continue;
39073 KnownBits Known2 =
39074 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
39075 Known = Known.intersectWith(Known2);
39076 }
39077 }
39078 }
39079 }
39080}
39081
39082 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
39083 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
39084 unsigned Depth) const {
39085 EVT VT = Op.getValueType();
39086 unsigned VTBits = VT.getScalarSizeInBits();
39087 unsigned Opcode = Op.getOpcode();
39088 switch (Opcode) {
39089 case X86ISD::SETCC_CARRY:
39090 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
39091 return VTBits;
39092
39093 case X86ISD::VTRUNC: {
39094 SDValue Src = Op.getOperand(0);
39095 MVT SrcVT = Src.getSimpleValueType();
39096 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
39097 assert(VTBits < NumSrcBits && "Illegal truncation input type");
39098 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
39099 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
39100 if (Tmp > (NumSrcBits - VTBits))
39101 return Tmp - (NumSrcBits - VTBits);
39102 return 1;
39103 }
39104
39105 case X86ISD::PACKSS: {
39106 // PACKSS is just a truncation if the sign bits extend to the packed size.
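// For example (illustrative): PACKSSDW of two v4i32 inputs that each have
// at least 17 sign bits cannot saturate, so it acts as a truncation to
// v8i16 and each result element keeps (SrcSignBits - 16) sign bits.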
39107 APInt DemandedLHS, DemandedRHS;
39108 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
39109 DemandedRHS);
39110
39111 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
39112 // patterns often used to compact vXi64 allsignbit patterns.
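// Rough sketch of that pattern (illustrative): if X and Y are vXi64 values
// that are all-signbits (e.g. PCMPGTQ results), then PACKSSDW(X,X) yields
// i32 elements that are all 0 or -1, so the outer PACKSSDW still sees 32
// sign bits per 32-bit element.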
39113 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
39114 SDValue BC = peekThroughBitcasts(V);
39115 if (BC.getOpcode() == X86ISD::PACKSS &&
39116 BC.getScalarValueSizeInBits() == 16 &&
39117 V.getScalarValueSizeInBits() == 32) {
39118 SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
39119 SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
39120 if (BC0.getScalarValueSizeInBits() == 64 &&
39121 BC1.getScalarValueSizeInBits() == 64 &&
39122 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
39123 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
39124 return 32;
39125 }
39126 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
39127 };
39128
39129 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
39130 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
39131 if (!!DemandedLHS)
39132 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
39133 if (!!DemandedRHS)
39134 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
39135 unsigned Tmp = std::min(Tmp0, Tmp1);
39136 if (Tmp > (SrcBits - VTBits))
39137 return Tmp - (SrcBits - VTBits);
39138 return 1;
39139 }
39140
39141 case X86ISD::VBROADCAST: {
39142 SDValue Src = Op.getOperand(0);
39143 if (!Src.getSimpleValueType().isVector())
39144 return DAG.ComputeNumSignBits(Src, Depth + 1);
39145 break;
39146 }
39147
39148 case X86ISD::VSHLI: {
39149 SDValue Src = Op.getOperand(0);
39150 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
39151 if (ShiftVal.uge(VTBits))
39152 return VTBits; // Shifted all bits out --> zero.
39153 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39154 if (ShiftVal.uge(Tmp))
39155 return 1; // Shifted all sign bits out --> unknown.
39156 return Tmp - ShiftVal.getZExtValue();
39157 }
39158
39159 case X86ISD::VSRAI: {
39160 SDValue Src = Op.getOperand(0);
39161 APInt ShiftVal = Op.getConstantOperandAPInt(1);
39162 if (ShiftVal.uge(VTBits - 1))
39163 return VTBits; // Sign splat.
39164 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39165 ShiftVal += Tmp;
39166 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
39167 }
39168
39169 case X86ISD::FSETCC:
39170 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
39171 if (VT == MVT::f32 || VT == MVT::f64 ||
39172 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
39173 return VTBits;
39174 break;
39175
39176 case X86ISD::PCMPGT:
39177 case X86ISD::PCMPEQ:
39178 case X86ISD::CMPP:
39179 case X86ISD::VPCOM:
39180 case X86ISD::VPCOMU:
39181 // Vector compares return zero/all-bits result values.
39182 return VTBits;
39183
39184 case X86ISD::ANDNP: {
39185 unsigned Tmp0 =
39186 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
39187 if (Tmp0 == 1) return 1; // Early out.
39188 unsigned Tmp1 =
39189 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
39190 return std::min(Tmp0, Tmp1);
39191 }
39192
39193 case X86ISD::CMOV: {
39194 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
39195 if (Tmp0 == 1) return 1; // Early out.
39196 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
39197 return std::min(Tmp0, Tmp1);
39198 }
39199 }
39200
39201 // Handle target shuffles.
39202 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39203 if (isTargetShuffle(Opcode)) {
39204 SmallVector<int, 64> Mask;
39205 SmallVector<SDValue, 2> Ops;
39206 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39207 unsigned NumOps = Ops.size();
39208 unsigned NumElts = VT.getVectorNumElements();
39209 if (Mask.size() == NumElts) {
39210 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39211 for (unsigned i = 0; i != NumElts; ++i) {
39212 if (!DemandedElts[i])
39213 continue;
39214 int M = Mask[i];
39215 if (M == SM_SentinelUndef) {
39216 // For UNDEF elements, we don't know anything about the common state
39217 // of the shuffle result.
39218 return 1;
39219 } else if (M == SM_SentinelZero) {
39220 // Zero = all sign bits.
39221 continue;
39222 }
39223 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39224 "Shuffle index out of range");
39225
39226 unsigned OpIdx = (unsigned)M / NumElts;
39227 unsigned EltIdx = (unsigned)M % NumElts;
39228 if (Ops[OpIdx].getValueType() != VT) {
39229 // TODO - handle target shuffle ops with different value types.
39230 return 1;
39231 }
39232 DemandedOps[OpIdx].setBit(EltIdx);
39233 }
39234 unsigned Tmp0 = VTBits;
39235 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
39236 if (!DemandedOps[i])
39237 continue;
39238 unsigned Tmp1 =
39239 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
39240 Tmp0 = std::min(Tmp0, Tmp1);
39241 }
39242 return Tmp0;
39243 }
39244 }
39245 }
39246
39247 // Fallback case.
39248 return 1;
39249}
39250
39251 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
39252 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
39253 return N->getOperand(0);
39254 return N;
39255}
39256
39257// Helper to look for a normal load that can be narrowed into a vzload with the
39258// specified VT and memory VT. Returns SDValue() on failure.
39259 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
39260 SelectionDAG &DAG) {
39261 // Can't if the load is volatile or atomic.
39262 if (!LN->isSimple())
39263 return SDValue();
39264
39265 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39266 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39267 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
39268 LN->getPointerInfo(), LN->getBaseAlign(),
39269 LN->getMemOperand()->getFlags());
39270}
39271
39272// Attempt to match a combined shuffle mask against supported unary shuffle
39273// instructions.
39274// TODO: Investigate sharing more of this with shuffle lowering.
39275static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39276 bool AllowFloatDomain, bool AllowIntDomain,
39277 SDValue V1, const SelectionDAG &DAG,
39278 const X86Subtarget &Subtarget, unsigned &Shuffle,
39279 MVT &SrcVT, MVT &DstVT) {
39280 unsigned NumMaskElts = Mask.size();
39281 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
39282
39283 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
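// For example (illustrative): a v4i32 mask {0,Z,U,U} (Z = zero, U = undef)
// keeps element 0 and zeroes the rest, which is exactly the behaviour of
// VZEXT_MOVL (a MOVSS/MOVD style move with zero extension).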
39284 if (Mask[0] == 0 &&
39285 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
39286 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
39287 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39288 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
39289 Shuffle = X86ISD::VZEXT_MOVL;
39290 if (MaskEltSize == 16)
39291 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39292 else
39293 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39294 return true;
39295 }
39296 }
39297
39298 // Match against an ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
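// For example (illustrative): a v8i16 mask {0,Z,1,Z,2,Z,3,Z} (Z = zero) is
// a Scale == 2 zero-extension of the low four elements and can be matched
// as ZERO_EXTEND_VECTOR_INREG (PMOVZXWD).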
39299 if (AllowIntDomain &&
39300 ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
39301 (MaskVT.is256BitVector() && Subtarget.hasInt256()) ||
39302 (MaskVT.is512BitVector() && Subtarget.useAVX512Regs()))) {
39303 unsigned MaxScale = 64 / MaskEltSize;
39304 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
39305 DAG.ComputeNumSignBits(V1) == MaskEltSize;
39306 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
39307 // Skip 512-bit VPMOV?XBW on non-AVX512BW targets.
39308 if (Scale == 2 && MaskVT == MVT::v64i8 && !Subtarget.useBWIRegs())
39309 continue;
39310 bool MatchAny = true;
39311 bool MatchZero = true;
39312 bool MatchSign = UseSign;
39313 unsigned NumDstElts = NumMaskElts / Scale;
39314 for (unsigned i = 0;
39315 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
39316 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
39317 MatchAny = MatchSign = MatchZero = false;
39318 break;
39319 }
39320 unsigned Pos = (i * Scale) + 1;
39321 unsigned Len = Scale - 1;
39322 MatchAny &= isUndefInRange(Mask, Pos, Len);
39323 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
39324 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
39325 }
39326 if (MatchAny || MatchSign || MatchZero) {
39327 assert((MatchSign || MatchZero) &&
39328 "Failed to match sext/zext but matched aext?");
39329 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
39330 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
39331 : MVT::getIntegerVT(MaskEltSize);
39332 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
39333
39334 Shuffle = unsigned(
39335 MatchAny ? ISD::ANY_EXTEND
39336 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
39337 if (SrcVT.getVectorNumElements() != NumDstElts)
39338 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
39339
39340 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
39341 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
39342 return true;
39343 }
39344 }
39345 }
39346
39347 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
39348 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
39349 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
39350 isUndefOrEqual(Mask[0], 0) &&
39351 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
39352 Shuffle = X86ISD::VZEXT_MOVL;
39353 if (MaskEltSize == 16)
39354 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39355 else
39356 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39357 return true;
39358 }
39359
39360 // Check if we have SSE3, which lets us use MOVDDUP etc. These
39361 // instructions are no slower than UNPCKLPD but have the option to
39362 // fold their input operand, even from an unaligned memory load.
39363 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39364 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39365 Shuffle = X86ISD::MOVDDUP;
39366 SrcVT = DstVT = MVT::v2f64;
39367 return true;
39368 }
39369 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39370 Shuffle = X86ISD::MOVSLDUP;
39371 SrcVT = DstVT = MVT::v4f32;
39372 return true;
39373 }
39374 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39375 Shuffle = X86ISD::MOVSHDUP;
39376 SrcVT = DstVT = MVT::v4f32;
39377 return true;
39378 }
39379 }
39380
39381 if (MaskVT.is256BitVector() && AllowFloatDomain) {
39382 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39383 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39384 Shuffle = X86ISD::MOVDDUP;
39385 SrcVT = DstVT = MVT::v4f64;
39386 return true;
39387 }
39388 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39389 V1)) {
39390 Shuffle = X86ISD::MOVSLDUP;
39391 SrcVT = DstVT = MVT::v8f32;
39392 return true;
39393 }
39394 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39395 V1)) {
39396 Shuffle = X86ISD::MOVSHDUP;
39397 SrcVT = DstVT = MVT::v8f32;
39398 return true;
39399 }
39400 }
39401
39402 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39403 assert(Subtarget.hasAVX512() &&
39404 "AVX512 required for 512-bit vector shuffles");
39405 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39406 V1)) {
39407 Shuffle = X86ISD::MOVDDUP;
39408 SrcVT = DstVT = MVT::v8f64;
39409 return true;
39410 }
39411 if (isTargetShuffleEquivalent(
39412 MaskVT, Mask,
39413 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39414 Shuffle = X86ISD::MOVSLDUP;
39415 SrcVT = DstVT = MVT::v16f32;
39416 return true;
39417 }
39418 if (isTargetShuffleEquivalent(
39419 MaskVT, Mask,
39420 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39421 Shuffle = X86ISD::MOVSHDUP;
39422 SrcVT = DstVT = MVT::v16f32;
39423 return true;
39424 }
39425 }
39426
39427 return false;
39428}
39429
39430// Attempt to match a combined shuffle mask against supported unary immediate
39431// permute instructions.
39432// TODO: Investigate sharing more of this with shuffle lowering.
39433 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39434 const APInt &Zeroable,
39435 bool AllowFloatDomain, bool AllowIntDomain,
39436 const SelectionDAG &DAG,
39437 const X86Subtarget &Subtarget,
39438 unsigned &Shuffle, MVT &ShuffleVT,
39439 unsigned &PermuteImm) {
39440 unsigned NumMaskElts = Mask.size();
39441 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39442 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39443 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39444 bool ContainsZeros = isAnyZero(Mask);
39445
39446 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39447 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39448 // Check for lane crossing permutes.
39449 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39450 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39451 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39452 Shuffle = X86ISD::VPERMI;
39453 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39454 PermuteImm = getV4X86ShuffleImm(Mask);
39455 return true;
39456 }
39457 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39458 SmallVector<int, 4> RepeatedMask;
39459 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39460 Shuffle = X86ISD::VPERMI;
39461 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39462 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39463 return true;
39464 }
39465 }
39466 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39467 // VPERMILPD can permute with a non-repeating shuffle.
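// For example (illustrative): a v4f64 mask {1,0,3,2} swaps the elements
// within each 128-bit lane, giving a VPERMILPD immediate of 0b0101.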
39468 Shuffle = X86ISD::VPERMILPI;
39469 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39470 PermuteImm = 0;
39471 for (int i = 0, e = Mask.size(); i != e; ++i) {
39472 int M = Mask[i];
39473 if (M == SM_SentinelUndef)
39474 continue;
39475 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39476 PermuteImm |= (M & 1) << i;
39477 }
39478 return true;
39479 }
39480 }
39481
39482 // We are checking for a shuffle match or a shift match. Loop twice so we
39483 // can choose which one to try to match first, depending on target preference.
39484 for (unsigned Order = 0; Order < 2; ++Order) {
39485 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39486 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39487 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39488 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39489 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39490 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39491 SmallVector<int, 4> RepeatedMask;
39492 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39493 // Narrow the repeated mask to create 32-bit element permutes.
39494 SmallVector<int, 4> WordMask = RepeatedMask;
39495 if (MaskScalarSizeInBits == 64)
39496 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39497
39498 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39499 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39500 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39501 PermuteImm = getV4X86ShuffleImm(WordMask);
39502 return true;
39503 }
39504 }
39505
39506 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
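// For example (illustrative): the repeated v8i16 mask {2,1,0,3,4,5,6,7}
// leaves the high half alone and only permutes the low half, so it maps
// to PSHUFLW with immediate getV4X86ShuffleImm({2,1,0,3}) == 0xC6.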
39507 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39508 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39509 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39510 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39511 SmallVector<int, 4> RepeatedMask;
39512 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39513 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39514 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39515
39516 // PSHUFLW: permute lower 4 elements only.
39517 if (isUndefOrInRange(LoMask, 0, 4) &&
39518 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39519 Shuffle = X86ISD::PSHUFLW;
39520 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39521 PermuteImm = getV4X86ShuffleImm(LoMask);
39522 return true;
39523 }
39524
39525 // PSHUFHW: permute upper 4 elements only.
39526 if (isUndefOrInRange(HiMask, 4, 8) &&
39527 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39528 // Offset the HiMask so that we can create the shuffle immediate.
39529 int OffsetHiMask[4];
39530 for (int i = 0; i != 4; ++i)
39531 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39532
39533 Shuffle = X86ISD::PSHUFHW;
39534 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39535 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39536 return true;
39537 }
39538 }
39539 }
39540 } else {
39541 // Attempt to match against bit rotates.
39542 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39543 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39544 Subtarget.hasAVX512())) {
39545 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39546 Subtarget, Mask);
39547 if (0 < RotateAmt) {
39548 Shuffle = X86ISD::VROTLI;
39549 PermuteImm = (unsigned)RotateAmt;
39550 return true;
39551 }
39552 }
39553 }
39554 // Attempt to match against byte/bit shifts.
39555 if (AllowIntDomain &&
39556 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39557 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39558 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39559 int ShiftAmt =
39560 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39561 Zeroable, Subtarget);
39562 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39563 32 <= ShuffleVT.getScalarSizeInBits())) {
39564 // Byte shifts can be slower so only match them on second attempt.
39565 if (Order == 0 &&
39566 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39567 continue;
39568
39569 PermuteImm = (unsigned)ShiftAmt;
39570 return true;
39571 }
39572
39573 }
39574 }
39575
39576 return false;
39577}
39578
39579// Attempt to match a combined unary shuffle mask against supported binary
39580// shuffle instructions.
39581// TODO: Investigate sharing more of this with shuffle lowering.
39582static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39583 bool AllowFloatDomain, bool AllowIntDomain,
39584 SDValue &V1, SDValue &V2, const SDLoc &DL,
39585 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39586 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39587 bool IsUnary) {
39588 unsigned NumMaskElts = Mask.size();
39589 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39590 unsigned SizeInBits = MaskVT.getSizeInBits();
39591
39592 if (MaskVT.is128BitVector()) {
39593 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39594 AllowFloatDomain) {
39595 V2 = V1;
39596 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39597 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39598 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39599 return true;
39600 }
39601 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39602 AllowFloatDomain) {
39603 V2 = V1;
39604 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39605 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39606 return true;
39607 }
39608 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39609 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39610 std::swap(V1, V2);
39611 Shuffle = X86ISD::MOVSD;
39612 SrcVT = DstVT = MVT::v2f64;
39613 return true;
39614 }
39615 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39616 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39617 Shuffle = X86ISD::MOVSS;
39618 SrcVT = DstVT = MVT::v4f32;
39619 return true;
39620 }
39621 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39622 DAG) &&
39623 Subtarget.hasFP16()) {
39624 Shuffle = X86ISD::MOVSH;
39625 SrcVT = DstVT = MVT::v8f16;
39626 return true;
39627 }
39628 }
39629
39630 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
39631 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39632 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39633 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39634 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39635 Subtarget)) {
39636 DstVT = MaskVT;
39637 return true;
39638 }
39639 }
39640 // TODO: Can we handle this inside matchShuffleWithPACK?
39641 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
39642 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
39643 V1.getScalarValueSizeInBits() == 64 &&
39644 V2.getScalarValueSizeInBits() == 64) {
39645 // Use (SSE41) PACKUSDW if the leading zero bits extend down into the lowest 16 bits.
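// Worked example (illustrative): if every i64 element of V1/V2 has at
// least 48 leading zero bits then each value fits in 16 bits, so PACKUSDW
// of the v4i32 views packs without saturation and the v8i16 result,
// reinterpreted as v4i32, equals the desired {0, 2, 4, 6} shuffle.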
39646 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
39647 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
39648 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
39649 SrcVT = MVT::v4i32;
39650 DstVT = MVT::v8i16;
39651 Shuffle = X86ISD::PACKUS;
39652 return true;
39653 }
39654 // Use PACKUSWB if the leading zero bits extend down into the lowest 8 bits.
39655 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
39656 SrcVT = MVT::v8i16;
39657 DstVT = MVT::v16i8;
39658 Shuffle = X86ISD::PACKUS;
39659 return true;
39660 }
39661 // Use PACKSSDW if the sign bits extend down into the lowest 16 bits.
39662 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
39663 SrcVT = MVT::v4i32;
39664 DstVT = MVT::v8i16;
39665 Shuffle = X86ISD::PACKSS;
39666 return true;
39667 }
39668 }
39669
39670 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39671 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39672 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39673 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39674 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39675 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
39676 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
39677 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39678 Subtarget)) {
39679 SrcVT = DstVT = MaskVT;
39680 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39681 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39682 return true;
39683 }
39684 }
39685
39686 // Attempt to match against an OR if we're performing a blend shuffle and the
39687 // non-blended source element is zero in each case.
39688 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
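// For example (illustrative): with NumMaskElts == 4, the mask {0,5,2,7}
// takes elements 0,2 from V1 and 1,3 from V2; if V1 is known zero in
// elements 1,3 and V2 is known zero in elements 0,2, the blend can be
// lowered as OR(V1, V2).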
39689 if (SizeInBits == V1.getValueSizeInBits() &&
39690 SizeInBits == V2.getValueSizeInBits() &&
39691 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39692 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39693 bool IsBlend = true;
39694 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39695 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39696 unsigned Scale1 = NumV1Elts / NumMaskElts;
39697 unsigned Scale2 = NumV2Elts / NumMaskElts;
39698 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39699 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39700 for (unsigned i = 0; i != NumMaskElts; ++i) {
39701 int M = Mask[i];
39702 if (M == SM_SentinelUndef)
39703 continue;
39704 if (M == SM_SentinelZero) {
39705 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39706 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39707 continue;
39708 }
39709 if (M == (int)i) {
39710 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39711 continue;
39712 }
39713 if (M == (int)(i + NumMaskElts)) {
39714 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39715 continue;
39716 }
39717 IsBlend = false;
39718 break;
39719 }
39720 if (IsBlend) {
39721 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39722 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39723 Shuffle = ISD::OR;
39724 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39725 return true;
39726 }
39727 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39728 // FIXME: handle mismatched sizes?
39729 // TODO: investigate if `ISD::OR` handling in
39730 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39731 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39732 unsigned NumElts = V.getValueType().getVectorNumElements();
39733 KnownBits Known(NumElts);
39734 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39735 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39736 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39737 if (PeepholeKnown.isZero())
39738 Known.Zero.setBit(EltIdx);
39739 if (PeepholeKnown.isAllOnes())
39740 Known.One.setBit(EltIdx);
39741 }
39742 return Known;
39743 };
39744
39745 KnownBits V1Known = computeKnownBitsElementWise(V1);
39746 KnownBits V2Known = computeKnownBitsElementWise(V2);
39747
39748 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39749 int M = Mask[i];
39750 if (M == SM_SentinelUndef)
39751 continue;
39752 if (M == SM_SentinelZero) {
39753 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39754 continue;
39755 }
39756 if (M == (int)i) {
39757 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39758 continue;
39759 }
39760 if (M == (int)(i + NumMaskElts)) {
39761 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39762 continue;
39763 }
39764 llvm_unreachable("will not get here.");
39765 }
39766 if (IsBlend) {
39767 Shuffle = ISD::OR;
39768 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39769 return true;
39770 }
39771 }
39772 }
39773 }
39774
39775 return false;
39776}
39777
39778 static bool matchBinaryPermuteShuffle(
39779 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39780 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39781 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39782 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39783 unsigned NumMaskElts = Mask.size();
39784 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39785
39786 // Attempt to match against VALIGND/VALIGNQ rotate.
39787 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39788 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39789 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39790 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39791 MVT AlignVT = MVT::getVectorVT(MVT::getIntegerVT(EltSizeInBits),
39792 MaskVT.getSizeInBits() / EltSizeInBits);
39793 if (!isAnyZero(Mask)) {
39794 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39795 if (0 < Rotation) {
39796 Shuffle = X86ISD::VALIGN;
39797 ShuffleVT = AlignVT;
39798 PermuteImm = Rotation;
39799 return true;
39800 }
39801 }
39802 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
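// For example (illustrative): a v8i32 mask {Z,Z,0,1,2,3,4,5} (ZeroLo == 2)
// can be handled as VALIGND of V1 with a zero vector, using a rotation of
// NumMaskElts - ZeroLo == 6 to shift two zero elements into the low lanes.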
39803 unsigned ZeroLo = Zeroable.countr_one();
39804 unsigned ZeroHi = Zeroable.countl_one();
39805 assert((ZeroLo + ZeroHi) < NumMaskElts && "Zeroable shuffle detected");
39806 if (ZeroLo) {
39807 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39808 std::iota(ShiftMask.begin() + ZeroLo, ShiftMask.end(), 0);
39809 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39810 V2 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39811 Shuffle = X86ISD::VALIGN;
39812 ShuffleVT = AlignVT;
39813 PermuteImm = NumMaskElts - ZeroLo;
39814 return true;
39815 }
39816 }
39817 if (ZeroHi) {
39818 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39819 std::iota(ShiftMask.begin(), ShiftMask.begin() + NumMaskElts - ZeroHi,
39820 ZeroHi);
39821 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39822 V2 = V1;
39823 V1 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39824 Shuffle = X86ISD::VALIGN;
39825 ShuffleVT = AlignVT;
39826 PermuteImm = ZeroHi;
39827 return true;
39828 }
39829 }
39830 }
39831
39832 // Attempt to match against PALIGNR byte rotate.
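// For example (illustrative): the v16i8 mask {4,5,...,15,16,17,18,19}
// takes bytes 4-15 of V1 followed by bytes 0-3 of V2, i.e. a byte
// rotation of 4 across the V1/V2 concatenation, matching PALIGNR.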
39833 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39834 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39835 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39836 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39837 if (0 < ByteRotation) {
39838 Shuffle = X86ISD::PALIGNR;
39839 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39840 PermuteImm = ByteRotation;
39841 return true;
39842 }
39843 }
39844
39845 // Attempt to combine to X86ISD::BLENDI.
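// For example (illustrative): the v4i32 mask {4,1,6,3} takes lanes 0 and 2
// from V2 and lanes 1 and 3 from V1, so it maps to BLENDI with an
// immediate of 0b0101 (a set bit selects V2).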
39846 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39847 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39848 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39849 uint64_t BlendMask = 0;
39850 bool ForceV1Zero = false, ForceV2Zero = false;
39851 SmallVector<int, 8> TargetMask(Mask);
39852 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39853 ForceV2Zero, BlendMask)) {
39854 if (MaskVT == MVT::v16i16) {
39855 // We can only use v16i16 PBLENDW if the lanes are repeated.
39856 SmallVector<int, 8> RepeatedMask;
39857 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39858 RepeatedMask)) {
39859 assert(RepeatedMask.size() == 8 &&
39860 "Repeated mask size doesn't match!");
39861 PermuteImm = 0;
39862 for (int i = 0; i < 8; ++i)
39863 if (RepeatedMask[i] >= 8)
39864 PermuteImm |= 1 << i;
39865 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39866 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39867 Shuffle = X86ISD::BLENDI;
39868 ShuffleVT = MaskVT;
39869 return true;
39870 }
39871 } else {
39872 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39873 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39874 PermuteImm = (unsigned)BlendMask;
39875 Shuffle = X86ISD::BLENDI;
39876 ShuffleVT = MaskVT;
39877 return true;
39878 }
39879 }
39880 }
39881
39882 // Attempt to combine to INSERTPS, but only if it has elements that need to
39883 // be set to zero.
39884 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39885 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39886 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39887 Shuffle = X86ISD::INSERTPS;
39888 ShuffleVT = MVT::v4f32;
39889 return true;
39890 }
39891
39892 // Attempt to combine to SHUFPD.
39893 if (AllowFloatDomain && EltSizeInBits == 64 &&
39894 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39895 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39896 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39897 bool ForceV1Zero = false, ForceV2Zero = false;
39898 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39899 PermuteImm, Mask, Zeroable)) {
39900 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39901 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39902 Shuffle = X86ISD::SHUFP;
39903 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39904 return true;
39905 }
39906 }
39907
39908 // Attempt to combine to SHUFPS.
39909 if (AllowFloatDomain && EltSizeInBits == 32 &&
39910 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39911 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39912 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39913 SmallVector<int, 4> RepeatedMask;
39914 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39915 // Match each half of the repeated mask to determine if it's just
39916 // referencing one of the vectors, is zeroable, or is entirely undef.
39917 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39918 int M0 = RepeatedMask[Offset];
39919 int M1 = RepeatedMask[Offset + 1];
39920
39921 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39922 return DAG.getUNDEF(MaskVT);
39923 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39924 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39925 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39926 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39927 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39928 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39929 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39930 return V1;
39931 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39932 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39933 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39934 return V2;
39935 }
39936
39937 return SDValue();
39938 };
39939
39940 int ShufMask[4] = {-1, -1, -1, -1};
39941 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39942 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39943
39944 if (Lo && Hi) {
39945 V1 = Lo;
39946 V2 = Hi;
39947 Shuffle = X86ISD::SHUFP;
39948 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39949 PermuteImm = getV4X86ShuffleImm(ShufMask);
39950 return true;
39951 }
39952 }
39953 }
39954
39955 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39956 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39957 MaskVT.is128BitVector() &&
39958 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39959 Shuffle = X86ISD::INSERTPS;
39960 ShuffleVT = MVT::v4f32;
39961 return true;
39962 }
39963
39964 return false;
39965}
39966
39967 static SDValue combineX86ShuffleChainWithExtract(
39968 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
39969 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39970 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39971 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39972 const X86Subtarget &Subtarget);
39973
39974/// Combine an arbitrary chain of shuffles into a single instruction if
39975/// possible.
39976///
39977/// This is the leaf of the recursive combine below. When we have found some
39978/// chain of single-use x86 shuffle instructions and accumulated the combined
39979/// shuffle mask represented by them, this will try to pattern match that mask
39980/// into either a single instruction if there is a special purpose instruction
39981/// for this operation, or into a PSHUFB instruction which is a fully general
39982/// instruction but should only be used to replace chains over a certain depth.
39983 static SDValue combineX86ShuffleChain(
39984 ArrayRef<SDValue> Inputs, unsigned RootOpc, MVT RootVT,
39985 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39986 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39987 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39988 const X86Subtarget &Subtarget) {
39989 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39990 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39991 "Unexpected number of shuffle inputs!");
39992 unsigned RootSizeInBits = RootVT.getSizeInBits();
39993 unsigned NumRootElts = RootVT.getVectorNumElements();
39994
39995 // Canonicalize shuffle input op to the requested type.
39996 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39997 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39998 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39999 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
40000 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
40001 return DAG.getBitcast(VT, Op);
40002 };
40003
40004 // Find the inputs that enter the chain. Note that multiple uses are OK
40005 // here, we're not going to remove the operands we find.
40006 bool UnaryShuffle = (Inputs.size() == 1);
40007 SDValue V1 = peekThroughBitcasts(Inputs[0]);
40008 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
40009 : peekThroughBitcasts(Inputs[1]));
40010
40011 MVT VT1 = V1.getSimpleValueType();
40012 MVT VT2 = V2.getSimpleValueType();
40013 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
40014 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
40015
40016 SDValue Res;
40017
40018 unsigned NumBaseMaskElts = BaseMask.size();
40019 if (NumBaseMaskElts == 1) {
40020 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
40021 return CanonicalizeShuffleInput(RootVT, V1);
40022 }
40023
40024 bool OptForSize = DAG.shouldOptForSize();
40025 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
40026 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
40027 (RootVT.isFloatingPoint() && Depth >= 1) ||
40028 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
40029
40030 // If we are shuffling a splat (and not introducing zeros) then we can just
40031 // use it directly. This also works for smaller elements, as they already
40032 // repeat across each mask element.
40033 if (UnaryShuffle && !isAnyZero(BaseMask) &&
40034 V1.getValueSizeInBits() >= RootSizeInBits &&
40035 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
40036 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
40037 return CanonicalizeShuffleInput(RootVT, V1);
40038 }
40039
40040 SmallVector<int, 64> Mask(BaseMask);
40041
40042 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
40043 // etc. can be simplified.
40044 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
40045 SmallVector<int> ScaledMask, IdentityMask;
40046 unsigned NumElts = VT1.getVectorNumElements();
40047 if (Mask.size() <= NumElts &&
40048 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
40049 for (unsigned i = 0; i != NumElts; ++i)
40050 IdentityMask.push_back(i);
40051 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
40052 V2))
40053 return CanonicalizeShuffleInput(RootVT, V1);
40054 }
40055 }
40056
40057 // Handle 128/256-bit lane shuffles of 512-bit vectors.
40058 if (RootVT.is512BitVector() &&
40059 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
40060 // If the upper subvectors are zeroable, then an extract+insert is more
40061 // efficient than using X86ISD::SHUF128. The insertion is free, even if it
40062 // has to zero the upper subvectors.
40063 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
40064 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40065 return SDValue(); // Nothing to do!
40066 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
40067 "Unexpected lane shuffle");
40068 Res = CanonicalizeShuffleInput(RootVT, V1);
40069 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
40070 bool UseZero = isAnyZero(Mask);
40071 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
40072 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
40073 }
40074
40075 // Narrow shuffle mask to v4x128.
40076 SmallVector<int, 4> ScaledMask;
40077 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
40078 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
40079
40080 // Try to lower to vshuf64x2/vshuf32x4.
40081 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
40082 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
40083 SelectionDAG &DAG) {
40084 int PermMask[4] = {-1, -1, -1, -1};
40085 // Ensure elements came from the same Op.
40086 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
40087 for (int i = 0; i < 4; ++i) {
40088 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
40089 if (ScaledMask[i] < 0)
40090 continue;
40091
40092 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
40093 unsigned OpIndex = i / 2;
40094 if (Ops[OpIndex].isUndef())
40095 Ops[OpIndex] = Op;
40096 else if (Ops[OpIndex] != Op)
40097 return SDValue();
40098
40099 PermMask[i] = ScaledMask[i] % 4;
40100 }
40101
40102 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
40103 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
40104 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
40105 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
40106 };
40107
40108 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
40109 // doesn't work because our mask is for 128 bits and we don't have an MVT
40110 // to match that.
40111 bool PreferPERMQ = UnaryShuffle && !isFreeToSplitVector(V1, DAG) &&
40112 isUndefOrInRange(ScaledMask[0], 0, 2) &&
40113 isUndefOrInRange(ScaledMask[1], 0, 2) &&
40114 isUndefOrInRange(ScaledMask[2], 2, 4) &&
40115 isUndefOrInRange(ScaledMask[3], 2, 4) &&
40116 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
40117 ScaledMask[0] == (ScaledMask[2] % 2)) &&
40118 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
40119 ScaledMask[1] == (ScaledMask[3] % 2));
40120
40121 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
40122 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40123 return SDValue(); // Nothing to do!
40124 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
40125 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
40126 return DAG.getBitcast(RootVT, V);
40127 }
40128 }
40129
40130 // Handle 128-bit lane shuffles of 256-bit vectors.
40131 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
40132 // If the upper half is zeroable, then an extract+insert is more efficient
40133 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
40134 // zero the upper half.
40135 if (isUndefOrZero(Mask[1])) {
40136 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40137 return SDValue(); // Nothing to do!
40138 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
40139 Res = CanonicalizeShuffleInput(RootVT, V1);
40140 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
40141 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
40142 256);
40143 }
40144
40145 // If we're inserting the low subvector, an insert-subvector 'concat'
40146 // pattern is quicker than VPERM2X128.
40147 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
40148 !Subtarget.hasAVX2()) {
40149 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40150 return SDValue(); // Nothing to do!
40151 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
40152 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
40153 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
40154 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
40155 }
40156
40157 // Don't lower to VPERM2X128 here if we have AVX2+, prefer to use
40158 // VPERMQ/VPERMPD for unary shuffles unless we need to use the zeroing
40159 // feature.
40160 // Prefer blends for sequential shuffles unless we are optimizing for size.
40161 if (UnaryShuffle &&
40162 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
40163 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
40164 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40165 return SDValue(); // Nothing to do!
40166 unsigned PermMask = 0;
40167 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
40168 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
40169 return DAG.getNode(
40170 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
40171 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
40172 }
40173
40174 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40175 return SDValue(); // Nothing to do!
40176
40177 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
40178 if (!UnaryShuffle && !IsMaskedShuffle) {
40179 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
40180 "Unexpected shuffle sentinel value");
40181 // Prefer blends to X86ISD::VPERM2X128.
40182 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
40183 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40184 return SDValue(); // Nothing to do!
40185 unsigned PermMask = 0;
40186 PermMask |= ((Mask[0] & 3) << 0);
40187 PermMask |= ((Mask[1] & 3) << 4);
40188 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
40189 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
40190 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
40191 CanonicalizeShuffleInput(RootVT, LHS),
40192 CanonicalizeShuffleInput(RootVT, RHS),
40193 DAG.getTargetConstant(PermMask, DL, MVT::i8));
40194 }
40195 }
40196 }
40197
40198 // For masks that have been widened to 128-bit elements or more,
40199 // narrow back down to 64-bit elements.
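// For example (illustrative): a widened 2 x 256-bit mask {1, 0} becomes
// the 64-bit element mask {4, 5, 6, 7, 0, 1, 2, 3} after narrowing.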
40200 if (BaseMaskEltSizeInBits > 64) {
40201 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
40202 int MaskScale = BaseMaskEltSizeInBits / 64;
40203 SmallVector<int, 64> ScaledMask;
40204 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40205 Mask = std::move(ScaledMask);
40206 }
40207
40208 // For masked shuffles, we're trying to match the root width for better
40209 // writemask folding; attempt to scale the mask accordingly.
40210 // TODO - variable shuffles might need this to be widened again.
40211 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
40212 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
40213 int MaskScale = NumRootElts / Mask.size();
40214 SmallVector<int, 64> ScaledMask;
40215 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40216 Mask = std::move(ScaledMask);
40217 }
40218
40219 unsigned NumMaskElts = Mask.size();
40220 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
40221 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40222
40223 // Determine the effective mask value type.
40224 FloatDomain &= (32 <= MaskEltSizeInBits);
40225 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
40226 : MVT::getIntegerVT(MaskEltSizeInBits);
40227 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
40228
40229 // Only allow legal mask types.
40230 if (!TLI.isTypeLegal(MaskVT))
40231 return SDValue();
40232
40233 // Attempt to match the mask against known shuffle patterns.
40234 MVT ShuffleSrcVT, ShuffleVT;
40235 unsigned Shuffle, PermuteImm;
40236
40237 // Which shuffle domains are permitted?
40238 // Permit domain crossing at higher combine depths.
40239 // TODO: Should we indicate which domain is preferred if both are allowed?
40240 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
40241 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
40242 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
40243
40244 // Determine zeroable mask elements.
40245 APInt KnownUndef, KnownZero;
40246 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
40247 APInt Zeroable = KnownUndef | KnownZero;
40248
40249 if (UnaryShuffle) {
40250 // Attempt to match against broadcast-from-vector.
40251 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
40252 if ((Subtarget.hasAVX2() ||
40253 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
40254 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
40255 if (isUndefOrEqual(Mask, 0)) {
40256 if (V1.getValueType() == MaskVT &&
40257 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40258 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
40259 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40260 return SDValue(); // Nothing to do!
40261 Res = V1.getOperand(0);
40262 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40263 return DAG.getBitcast(RootVT, Res);
40264 }
40265 if (Subtarget.hasAVX2()) {
40266 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40267 return SDValue(); // Nothing to do!
40268 Res = CanonicalizeShuffleInput(MaskVT, V1);
40269 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40270 return DAG.getBitcast(RootVT, Res);
40271 }
40272 }
40273 }
40274
40275 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
40276 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
40277 (!IsMaskedShuffle ||
40278 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40279 if (Depth == 0 && RootOpc == Shuffle)
40280 return SDValue(); // Nothing to do!
40281 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40282 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
40283 return DAG.getBitcast(RootVT, Res);
40284 }
40285
40286 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40287 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
40288 PermuteImm) &&
40289 (!IsMaskedShuffle ||
40290 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40291 if (Depth == 0 && RootOpc == Shuffle)
40292 return SDValue(); // Nothing to do!
40293 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
40294 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
40295 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40296 return DAG.getBitcast(RootVT, Res);
40297 }
40298 }
40299
40300 // Attempt to combine to INSERTPS, but only if the inserted element has come
40301 // from a scalar.
40302 // TODO: Handle other insertions here as well?
40303 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
40304 Subtarget.hasSSE41() &&
40305 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
40306 if (MaskEltSizeInBits == 32) {
40307 SDValue SrcV1 = V1, SrcV2 = V2;
40308 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
40309 DAG) &&
40310 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40311 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40312 return SDValue(); // Nothing to do!
40313 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40314 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
40315 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
40316 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40317 return DAG.getBitcast(RootVT, Res);
40318 }
40319 }
40320 if (MaskEltSizeInBits == 64 &&
40321 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
40322 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40323 V2.getScalarValueSizeInBits() <= 32) {
40324 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40325 return SDValue(); // Nothing to do!
40326 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
40327 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40328 CanonicalizeShuffleInput(MVT::v4f32, V1),
40329 CanonicalizeShuffleInput(MVT::v4f32, V2),
40330 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40331 return DAG.getBitcast(RootVT, Res);
40332 }
40333 }
40334
40335 SDValue NewV1 = V1; // Save operands in case early exit happens.
40336 SDValue NewV2 = V2;
40337 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
40338 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
40339 ShuffleVT, UnaryShuffle) &&
40340 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40341 if (Depth == 0 && RootOpc == Shuffle)
40342 return SDValue(); // Nothing to do!
40343 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
40344 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
40345 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
40346 return DAG.getBitcast(RootVT, Res);
40347 }
40348
40349 NewV1 = V1; // Save operands in case early exit happens.
40350 NewV2 = V2;
40351 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40352 AllowIntDomain, NewV1, NewV2, DL, DAG,
40353 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
40354 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40355 if (Depth == 0 && RootOpc == Shuffle)
40356 return SDValue(); // Nothing to do!
40357 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
40358 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
40359 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
40360 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40361 return DAG.getBitcast(RootVT, Res);
40362 }
40363
40364 // Typically from here on, we need an integer version of MaskVT.
40365 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
40366 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
40367
40368 // Annoyingly, SSE4A instructions don't map into the above match helpers.
40369 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
40370 uint64_t BitLen, BitIdx;
40371 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
40372 Zeroable)) {
40373 if (Depth == 0 && RootOpc == X86ISD::EXTRQI)
40374 return SDValue(); // Nothing to do!
40375 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40376 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
40377 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40378 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40379 return DAG.getBitcast(RootVT, Res);
40380 }
40381
40382 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
40383 if (Depth == 0 && RootOpc == X86ISD::INSERTQI)
40384 return SDValue(); // Nothing to do!
40385 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40386 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
40387 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
40388 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40389 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40390 return DAG.getBitcast(RootVT, Res);
40391 }
40392 }
40393
40394 // Match shuffle against TRUNCATE patterns.
40395 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
40396 // Match against a VTRUNC instruction, accounting for src/dst sizes.
40397 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
40398 Subtarget)) {
40399 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
40400 ShuffleSrcVT.getVectorNumElements();
40401 unsigned Opc =
40402 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40403 if (Depth == 0 && RootOpc == Opc)
40404 return SDValue(); // Nothing to do!
40405 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40406 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40407 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40408 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40409 return DAG.getBitcast(RootVT, Res);
40410 }
40411
40412 // Do we need a more general binary truncation pattern?
40413 if (RootSizeInBits < 512 &&
40414 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40415 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40416 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40417 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40418 // Bail if this was already a truncation or PACK node.
40419 // We sometimes fail to match PACK if we demand known undef elements.
40420 if (Depth == 0 &&
40421 (RootOpc == ISD::TRUNCATE || RootOpc == X86ISD::PACKSS ||
40422 RootOpc == X86ISD::PACKUS))
40423 return SDValue(); // Nothing to do!
40424 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40425 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40426 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40427 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40428 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40429 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40430 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40431 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40432 return DAG.getBitcast(RootVT, Res);
40433 }
40434 }
40435
40436 // Don't try to re-form single instruction chains under any circumstances now
40437 // that we've done encoding canonicalization for them.
40438 if (Depth < 1)
40439 return SDValue();
40440
40441 int NumVariableMasks = llvm::count_if(SrcNodes, [](const SDNode *N) {
40442 return isTargetShuffleVariableMask(N->getOpcode());
40443 });
40444 bool HasSlowVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
40445 return (N->getOpcode() == X86ISD::VPERMV3 ||
40446 N->getOpcode() == X86ISD::VPERMV);
40447 });
40448
40449 // Depth threshold above which we can efficiently use variable mask shuffles.
40450 int VariableCrossLaneShuffleDepth =
40451 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40452 int VariablePerLaneShuffleDepth =
40453 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40454 AllowVariableCrossLaneMask &=
40455 (Depth >= VariableCrossLaneShuffleDepth) || NumVariableMasks;
40456 AllowVariablePerLaneMask &=
40457 (Depth >= VariablePerLaneShuffleDepth) || NumVariableMasks;
40458 // VPERM2W/VPERM2B are 3 uops on Skylake and Icelake so we require a
40459 // higher depth before combining them.
40460 int BWIVPERMV3ShuffleDepth =
40461 VariableCrossLaneShuffleDepth + 2 - NumVariableMasks;
40462 bool AllowBWIVPERMV3 =
40463 (Depth >= BWIVPERMV3ShuffleDepth || HasSlowVariableMask);
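// Illustrative example (assumed target, not from the source): without fast
// variable cross-lane shuffles and with NumVariableMasks == 0, variable
// cross-lane masks are only permitted at Depth >= 2, and the BWI VPERMV3
// forms only at Depth >= 4 (2 + 2 - 0), unless a slow variable mask is
// already present in the chain.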
40464
40465 // If root was a VPERMV/VPERMV3 node, always allow a variable shuffle.
40466 if ((UnaryShuffle && RootOpc == X86ISD::VPERMV) || RootOpc == X86ISD::VPERMV3)
40467 AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
40468
40469 bool MaskContainsZeros = isAnyZero(Mask);
40470
40471 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40472 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40473 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40474 if (Subtarget.hasAVX2() &&
40475 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40476 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40477 Res = CanonicalizeShuffleInput(MaskVT, V1);
40478 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40479 return DAG.getBitcast(RootVT, Res);
40480 }
40481 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40482 if ((Subtarget.hasAVX512() &&
40483 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40484 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40485 (Subtarget.hasBWI() &&
40486 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40487 (Subtarget.hasVBMI() &&
40488 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40489 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40490 V2 = DAG.getUNDEF(MaskVT);
40491 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40492 return DAG.getBitcast(RootVT, Res);
40493 }
40494 }
40495
40496 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40497 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40498 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40499 ((Subtarget.hasAVX512() &&
40500 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40501 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40502 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40503 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40504 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40505 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40506 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40507 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40508 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40509 for (unsigned i = 0; i != NumMaskElts; ++i)
40510 if (Mask[i] == SM_SentinelZero)
40511 Mask[i] = NumMaskElts + i;
40512 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40513 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40514 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40515 return DAG.getBitcast(RootVT, Res);
40516 }
40517
40518 // If that failed and either input is extracted then try to combine as a
40519 // shuffle with the larger type.
40520 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40521 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40522 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40523 IsMaskedShuffle, DAG, DL, Subtarget))
40524 return WideShuffle;
40525
40526 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40527 // (non-VLX will pad to 512-bit shuffles).
40528 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40529 ((Subtarget.hasAVX512() &&
40530 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40531 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40532 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40533 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40534 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40535 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40536 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40537 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40538 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40539 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40540 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40541 return DAG.getBitcast(RootVT, Res);
40542 }
40543 return SDValue();
40544 }
40545
40546 // See if we can combine a single input shuffle with zeros to a bit-mask,
40547 // which is much simpler than any shuffle.
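// Hedged example of the fold below (values assumed, not from the source):
// a v4i32 mask {0, SM_SentinelZero, 2, SM_SentinelUndef} is sequential with
// zeros/undefs, so it becomes an AND with the constant {-1, 0, -1, undef}.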
40548 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40549 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40550 TLI.isTypeLegal(MaskVT)) {
40551 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40552 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40553 APInt UndefElts(NumMaskElts, 0);
40554 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40555 for (unsigned i = 0; i != NumMaskElts; ++i) {
40556 int M = Mask[i];
40557 if (M == SM_SentinelUndef) {
40558 UndefElts.setBit(i);
40559 continue;
40560 }
40561 if (M == SM_SentinelZero)
40562 continue;
40563 EltBits[i] = AllOnes;
40564 }
40565 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40566 Res = CanonicalizeShuffleInput(MaskVT, V1);
40567 unsigned AndOpcode =
40568 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40569 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40570 return DAG.getBitcast(RootVT, Res);
40571 }
40572
40573 // If we have a single input shuffle with different shuffle patterns in the
40574 // 128-bit lanes, use the variable mask to VPERMILPS.
40575 // TODO: Combine other mask types at higher depths.
40576 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40577 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40578 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40579 SmallVector<SDValue, 16> VPermIdx;
40580 for (int M : Mask) {
40581 SDValue Idx =
40582 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40583 VPermIdx.push_back(Idx);
40584 }
40585 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40586 Res = CanonicalizeShuffleInput(MaskVT, V1);
40587 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40588 return DAG.getBitcast(RootVT, Res);
40589 }
40590
40591 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40592 // to VPERMIL2PD/VPERMIL2PS.
40593 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40594 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40595 MaskVT == MVT::v8f32)) {
40596 // VPERMIL2 Operation.
40597 // Bits[3] - Match Bit.
40598 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40599 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
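// Illustrative example (values assumed, not from the source): for a v4f32
// mask {4, 0, SM_SentinelZero, 5}, the per-element selectors built below are
// {4, 0, 8, 5} (selector 8 requests a zero element) and M2ZImm is set to 2.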
40600 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40601 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40602 SmallVector<int, 8> VPerm2Idx;
40603 unsigned M2ZImm = 0;
40604 for (int M : Mask) {
40605 if (M == SM_SentinelUndef) {
40606 VPerm2Idx.push_back(-1);
40607 continue;
40608 }
40609 if (M == SM_SentinelZero) {
40610 M2ZImm = 2;
40611 VPerm2Idx.push_back(8);
40612 continue;
40613 }
40614 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40615 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40616 VPerm2Idx.push_back(Index);
40617 }
40618 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40619 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40620 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40621 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40622 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40623 return DAG.getBitcast(RootVT, Res);
40624 }
40625
40626 // If we have 3 or more shuffle instructions or a chain involving a variable
40627 // mask, we can replace them with a single PSHUFB instruction profitably.
40628 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40629 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40630 // more aggressive.
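// Illustrative example of the byte mask built below (values assumed): a
// 128-bit root with v4i32 mask {1, SM_SentinelZero, 3, SM_SentinelUndef}
// expands at Ratio = 4 to bytes {4,5,6,7, 0x80 x4, 12,13,14,15, undef x4}.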
40631 if (UnaryShuffle && AllowVariablePerLaneMask &&
40632 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40633 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40634 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40635 SmallVector<SDValue, 16> PSHUFBMask;
40636 int NumBytes = RootVT.getSizeInBits() / 8;
40637 int Ratio = NumBytes / NumMaskElts;
40638 for (int i = 0; i < NumBytes; ++i) {
40639 int M = Mask[i / Ratio];
40640 if (M == SM_SentinelUndef) {
40641 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40642 continue;
40643 }
40644 if (M == SM_SentinelZero) {
40645 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40646 continue;
40647 }
40648 M = Ratio * M + i % Ratio;
40649 assert((M / 16) == (i / 16) && "Lane crossing detected");
40650 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40651 }
40652 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40653 Res = CanonicalizeShuffleInput(ByteVT, V1);
40654 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40655 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40656 return DAG.getBitcast(RootVT, Res);
40657 }
40658
40659 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40660 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40661 // slower than PSHUFB on targets that support both.
40662 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40663 Subtarget.hasXOP()) {
40664 // VPPERM Mask Operation
40665 // Bits[4:0] - Byte Index (0 - 31)
40666 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
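// Hedged example (not from the source): mask byte 0x13 has op field 0 and
// byte index 19, i.e. byte 3 of the second source, while 0x80 (op field 4)
// produces a zero byte.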
40667 SmallVector<SDValue, 16> VPPERMMask;
40668 int NumBytes = 16;
40669 int Ratio = NumBytes / NumMaskElts;
40670 for (int i = 0; i < NumBytes; ++i) {
40671 int M = Mask[i / Ratio];
40672 if (M == SM_SentinelUndef) {
40673 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40674 continue;
40675 }
40676 if (M == SM_SentinelZero) {
40677 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40678 continue;
40679 }
40680 M = Ratio * M + i % Ratio;
40681 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40682 }
40683 MVT ByteVT = MVT::v16i8;
40684 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40685 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40686 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40687 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40688 return DAG.getBitcast(RootVT, Res);
40689 }
40690
40691 // If that failed and either input is extracted then try to combine as a
40692 // shuffle with the larger type.
40693 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40694 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40695 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
40696 DAG, DL, Subtarget))
40697 return WideShuffle;
40698
40699 // If we have a dual input shuffle then lower to VPERMV3,
40700 // (non-VLX will pad to 512-bit shuffles)
40701 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40702 ((Subtarget.hasAVX512() &&
40703 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40704 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40705 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40706 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40707 MaskVT == MVT::v16i32)) ||
40708 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40709 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40710 MaskVT == MVT::v32i16)) ||
40711 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40712 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40713 MaskVT == MVT::v64i8)))) {
40714 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40715 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40716 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40717 return DAG.getBitcast(RootVT, Res);
40718 }
40719
40720 // Failed to find any combines.
40721 return SDValue();
40722}
40723
40724// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40725// instruction if possible.
40726//
40727// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40728// type size to attempt to combine:
40729// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40730// -->
40731// extract_subvector(shuffle(x,y,m2),0)
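// For example (illustrative): a 128-bit shuffle of subvectors extracted from
// the upper halves of 256-bit x and y can be redone as a 256-bit shuffle of
// x and y with the mask indices offset by the extract index, after which only
// the low 128 bits of the result are extracted.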
40732 static SDValue combineX86ShuffleChainWithExtract(
40733 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
40734 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
40735 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
40736 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
40737 const X86Subtarget &Subtarget) {
40738 unsigned NumMaskElts = BaseMask.size();
40739 unsigned NumInputs = Inputs.size();
40740 if (NumInputs == 0)
40741 return SDValue();
40742
40743 unsigned RootSizeInBits = RootVT.getSizeInBits();
40744 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40745 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40746
40747 // Peek through subvectors to find widest legal vector.
40748 // TODO: Handle ISD::TRUNCATE
40749 unsigned WideSizeInBits = RootSizeInBits;
40750 for (SDValue Input : Inputs) {
40751 Input = peekThroughBitcasts(Input);
40752 while (1) {
40753 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40754 Input = peekThroughBitcasts(Input.getOperand(0));
40755 continue;
40756 }
40757 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40758 Input.getOperand(0).isUndef() &&
40759 isNullConstant(Input.getOperand(2))) {
40760 Input = peekThroughBitcasts(Input.getOperand(1));
40761 continue;
40762 }
40763 break;
40764 }
40765 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40766 WideSizeInBits < Input.getValueSizeInBits())
40767 WideSizeInBits = Input.getValueSizeInBits();
40768 }
40769
40770 // Bail if we fail to find a source larger than the existing root.
40771 if (WideSizeInBits <= RootSizeInBits ||
40772 (WideSizeInBits % RootSizeInBits) != 0)
40773 return SDValue();
40774
40775 // Create new mask for larger type.
40776 SmallVector<int, 64> WideMask;
40777 growShuffleMask(BaseMask, WideMask, RootSizeInBits, WideSizeInBits);
40778
40779 // Attempt to peek through inputs and adjust mask when we extract from an
40780 // upper subvector.
40781 int AdjustedMasks = 0;
40782 SmallVector<SDValue, 4> WideInputs(Inputs);
40783 for (unsigned I = 0; I != NumInputs; ++I) {
40784 SDValue &Input = WideInputs[I];
40785 Input = peekThroughBitcasts(Input);
40786 while (1) {
40787 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40788 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40789 uint64_t Idx = Input.getConstantOperandVal(1);
40790 if (Idx != 0) {
40791 ++AdjustedMasks;
40792 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40793 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40794
40795 int lo = I * WideMask.size();
40796 int hi = (I + 1) * WideMask.size();
40797 for (int &M : WideMask)
40798 if (lo <= M && M < hi)
40799 M += Idx;
40800 }
40801 Input = peekThroughBitcasts(Input.getOperand(0));
40802 continue;
40803 }
40804 // TODO: Handle insertions into upper subvectors.
40805 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40806 Input.getOperand(0).isUndef() &&
40807 isNullConstant(Input.getOperand(2))) {
40808 Input = peekThroughBitcasts(Input.getOperand(1));
40809 continue;
40810 }
40811 break;
40812 }
40813 }
40814
40815 // Remove unused/repeated shuffle source ops.
40816 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40817 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40818
40819 // Bail if we're always extracting from the lowest subvectors,
40820 // combineX86ShuffleChain should match this for the current width, or the
40821 // shuffle still references too many inputs.
40822 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40823 return SDValue();
40824
40825 // Minor canonicalization of the accumulated shuffle mask to make it easier
40826 // to match below. All this does is detect masks with sequential pairs of
40827 // elements, and shrink them to the half-width mask. It does this in a loop
40828 // so it will reduce the size of the mask to the minimal width mask which
40829 // performs an equivalent shuffle.
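// Illustrative example (assumed mask): the pairwise mask {0, 1, 4, 5} widens
// to {0, 2}, i.e. the same shuffle expressed at twice the element width.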
40830 while (WideMask.size() > 1) {
40831 SmallVector<int, 64> WidenedMask;
40832 if (!canWidenShuffleElements(WideMask, WidenedMask))
40833 break;
40834 WideMask = std::move(WidenedMask);
40835 }
40836
40837 // Canonicalization of binary shuffle masks to improve pattern matching by
40838 // commuting the inputs.
40839 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40840 ShuffleVectorSDNode::commuteMask(WideMask);
40841 std::swap(WideInputs[0], WideInputs[1]);
40842 }
40843
40844 // Increase depth for every upper subvector we've peeked through.
40845 Depth += AdjustedMasks;
40846
40847 // Attempt to combine wider chain.
40848 // TODO: Can we use a better Root?
40849 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40850 WideInputs.back().getValueSizeInBits()
40851 ? WideInputs.front()
40852 : WideInputs.back();
40853 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40854 "WideRootSize mismatch");
40855
40856 if (SDValue WideShuffle = combineX86ShuffleChain(
40857 WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
40858 Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40859 IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
40860 WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
40861 return DAG.getBitcast(RootVT, WideShuffle);
40862 }
40863
40864 return SDValue();
40865}
40866
40867// Canonicalize the combined shuffle mask chain with horizontal ops.
40868// NOTE: This may update the Ops and Mask.
40869 static SDValue canonicalizeShuffleMaskWithHorizOp(
40870 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40871 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40872 const X86Subtarget &Subtarget) {
40873 if (Mask.empty() || Ops.empty())
40874 return SDValue();
40875
40876 SmallVector<SDValue> BC;
40877 for (SDValue Op : Ops)
40878 BC.push_back(Op.getOpcode() == ISD::BITCAST ? Op.getOperand(0) : Op);
40879
40880 // All ops must be the same horizop + type.
40881 SDValue BC0 = BC[0];
40882 EVT VT0 = BC0.getValueType();
40883 unsigned Opcode0 = BC0.getOpcode();
40884 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40885 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40886 }))
40887 return SDValue();
40888
40889 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40890 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40891 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40892 if (!isHoriz && !isPack)
40893 return SDValue();
40894
40895 // Do all ops have a single use?
40896 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40897 return Op.hasOneUse() &&
40898 peekThroughBitcasts(Op).hasOneUse();
40899 });
40900
40901 int NumElts = VT0.getVectorNumElements();
40902 int NumLanes = VT0.getSizeInBits() / 128;
40903 int NumEltsPerLane = NumElts / NumLanes;
40904 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40905 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40906 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40907
40908 if (NumEltsPerLane >= 4 &&
40909 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40910 SmallVector<int> LaneMask, ScaledMask;
40911 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40912 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40913 // See if we can remove the shuffle by resorting the HOP chain so that
40914 // the HOP args are pre-shuffled.
40915 // TODO: Generalize to any sized/depth chain.
40916 // TODO: Add support for PACKSS/PACKUS.
40917 if (isHoriz) {
40918 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40919 auto GetHOpSrc = [&](int M) {
40920 if (M == SM_SentinelUndef)
40921 return DAG.getUNDEF(VT0);
40922 if (M == SM_SentinelZero)
40923 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40924 SDValue Src0 = BC[M / 4];
40925 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40926 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40927 return Src1.getOperand(M % 2);
40928 return SDValue();
40929 };
40930 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40931 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40932 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40933 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40934 if (M0 && M1 && M2 && M3) {
40935 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40936 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40937 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40938 }
40939 }
40940 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40941 if (Ops.size() >= 2) {
40942 SDValue LHS, RHS;
40943 auto GetHOpSrc = [&](int M, int &OutM) {
40944 // TODO: Support SM_SentinelZero
40945 if (M < 0)
40946 return M == SM_SentinelUndef;
40947 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40948 if (!LHS || LHS == Src) {
40949 LHS = Src;
40950 OutM = (M % 2);
40951 return true;
40952 }
40953 if (!RHS || RHS == Src) {
40954 RHS = Src;
40955 OutM = (M % 2) + 2;
40956 return true;
40957 }
40958 return false;
40959 };
40960 int PostMask[4] = {-1, -1, -1, -1};
40961 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40962 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40963 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40964 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40965 LHS = DAG.getBitcast(SrcVT, LHS);
40966 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40967 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40968 // Use SHUFPS for the permute so this will work on SSE2 targets,
40969 // shuffle combining and domain handling will simplify this later on.
40970 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40971 Res = DAG.getBitcast(ShuffleVT, Res);
40972 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40973 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40974 }
40975 }
40976 }
40977 }
40978
40979 if (2 < Ops.size())
40980 return SDValue();
40981
40982 SDValue BC1 = BC[BC.size() - 1];
40983 if (Mask.size() == VT0.getVectorNumElements()) {
40984 // Canonicalize binary shuffles of horizontal ops that use the
40985 // same sources to a unary shuffle.
40986 // TODO: Try to perform this fold even if the shuffle remains.
40987 if (Ops.size() == 2) {
40988 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40989 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40990 };
40991 // Commute if all BC0's ops are contained in BC1.
40992 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40993 ContainsOps(BC1, BC0.getOperand(1))) {
40994 ShuffleVectorSDNode::commuteMask(Mask);
40995 std::swap(Ops[0], Ops[1]);
40996 std::swap(BC0, BC1);
40997 }
40998
40999 // If BC1 can be represented by BC0, then convert to unary shuffle.
41000 if (ContainsOps(BC0, BC1.getOperand(0)) &&
41001 ContainsOps(BC0, BC1.getOperand(1))) {
41002 for (int &M : Mask) {
41003 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
41004 continue;
41005 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
41006 M -= NumElts + (SubLane * NumHalfEltsPerLane);
41007 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
41008 M += NumHalfEltsPerLane;
41009 }
41010 }
41011 }
41012
41013 // Canonicalize unary horizontal ops to only refer to lower halves.
41014 for (int i = 0; i != NumElts; ++i) {
41015 int &M = Mask[i];
41016 if (isUndefOrZero(M))
41017 continue;
41018 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
41019 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
41020 M -= NumHalfEltsPerLane;
41021 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
41022 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
41023 M -= NumHalfEltsPerLane;
41024 }
41025 }
41026
41027 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
41028 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
41029 // represents the LHS/RHS inputs for the lower/upper halves.
41030 SmallVector<int, 16> TargetMask128, WideMask128;
41031 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
41032 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
41033 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
41034 bool SingleOp = (Ops.size() == 1);
41035 if (isPack || OneUseOps ||
41036 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
41037 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
41038 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
41039 Lo = Lo.getOperand(WideMask128[0] & 1);
41040 Hi = Hi.getOperand(WideMask128[1] & 1);
41041 if (SingleOp) {
41042 SDValue Undef = DAG.getUNDEF(SrcVT);
41043 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
41044 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
41045 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
41046 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
41047 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
41048 }
41049 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
41050 }
41051 }
41052
41053 // If we are post-shuffling a 256-bit hop and not requiring the upper
41054 // elements, then try to narrow to a 128-bit hop directly.
41055 SmallVector<int, 16> WideMask64;
41056 if (Ops.size() == 1 && NumLanes == 2 &&
41057 scaleShuffleElements(Mask, 4, WideMask64) &&
41058 isUndefInRange(WideMask64, 2, 2)) {
41059 int M0 = WideMask64[0];
41060 int M1 = WideMask64[1];
41061 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
41062 MVT HalfVT = SrcVT.getHalfNumVectorElementsVT();
41063 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41064 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41065 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
41066 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
41067 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
41068 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
41069 }
41070 }
41071
41072 return SDValue();
41073}
41074
41075// Attempt to constant fold all of the constant source ops.
41076// Returns true if the entire shuffle is folded to a constant.
41077// TODO: Extend this to merge multiple constant Ops and update the mask.
41078 static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef<SDValue> Ops,
41079 ArrayRef<int> Mask,
41080 ArrayRef<const SDNode *> SrcNodes,
41081 SelectionDAG &DAG, const SDLoc &DL,
41082 const X86Subtarget &Subtarget) {
41083 unsigned SizeInBits = VT.getSizeInBits();
41084 unsigned NumMaskElts = Mask.size();
41085 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
41086 unsigned NumOps = Ops.size();
41087
41088 // Extract constant bits from each source op.
41089 SmallVector<APInt, 16> UndefEltsOps(NumOps);
41090 SmallVector<SmallVector<APInt>, 16> RawBitsOps(NumOps);
41091 for (unsigned I = 0; I != NumOps; ++I)
41092 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
41093 RawBitsOps[I],
41094 /*AllowWholeUndefs*/ true,
41095 /*AllowPartialUndefs*/ true))
41096 return SDValue();
41097
41098 // If we're optimizing for size, only fold if at least one of the constants is
41099 // only used once or the combined shuffle has included a variable mask
41100 // shuffle; this is to avoid constant pool bloat.
41101 bool IsOptimizingSize = DAG.shouldOptForSize();
41102 bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
41103 return isTargetShuffleVariableMask(N->getOpcode());
41104 });
41105 if (IsOptimizingSize && !HasVariableMask &&
41106 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
41107 return SDValue();
41108
41109 // Shuffle the constant bits according to the mask.
41110 APInt UndefElts(NumMaskElts, 0);
41111 APInt ZeroElts(NumMaskElts, 0);
41112 APInt ConstantElts(NumMaskElts, 0);
41113 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
41114 APInt::getZero(MaskSizeInBits));
41115 for (unsigned i = 0; i != NumMaskElts; ++i) {
41116 int M = Mask[i];
41117 if (M == SM_SentinelUndef) {
41118 UndefElts.setBit(i);
41119 continue;
41120 } else if (M == SM_SentinelZero) {
41121 ZeroElts.setBit(i);
41122 continue;
41123 }
41124 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
41125
41126 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
41127 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
41128
41129 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
41130 if (SrcUndefElts[SrcMaskIdx]) {
41131 UndefElts.setBit(i);
41132 continue;
41133 }
41134
41135 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
41136 APInt &Bits = SrcEltBits[SrcMaskIdx];
41137 if (!Bits) {
41138 ZeroElts.setBit(i);
41139 continue;
41140 }
41141
41142 ConstantElts.setBit(i);
41143 ConstantBitData[i] = Bits;
41144 }
41145 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
41146
41147 // Attempt to create a zero vector.
41148 if ((UndefElts | ZeroElts).isAllOnes())
41149 return getZeroVector(VT, Subtarget, DAG, DL);
41150
41151 // Create the constant data.
41152 MVT MaskSVT;
41153 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
41154 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
41155 else
41156 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
41157
41158 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
41159 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
41160 return SDValue();
41161
41162 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
41163 return DAG.getBitcast(VT, CstOp);
41164}
41165
41166namespace llvm {
41167 namespace X86 {
41168 enum {
41169 MaxShuffleCombineDepth = 8
41170 };
41171 } // namespace X86
41172} // namespace llvm
41173
41174/// Fully generic combining of x86 shuffle instructions.
41175///
41176/// This should be the last combine run over the x86 shuffle instructions. Once
41177/// they have been fully optimized, this will recursively consider all chains
41178/// of single-use shuffle instructions, build a generic model of the cumulative
41179/// shuffle operation, and check for simpler instructions which implement this
41180/// operation. We use this primarily for two purposes:
41181///
41182/// 1) Collapse generic shuffles to specialized single instructions when
41183/// equivalent. In most cases, this is just an encoding size win, but
41184/// sometimes we will collapse multiple generic shuffles into a single
41185/// special-purpose shuffle.
41186/// 2) Look for sequences of shuffle instructions with 3 or more total
41187/// instructions, and replace them with the slightly more expensive SSSE3
41188/// PSHUFB instruction if available. We do this as the last combining step
41189/// to ensure we avoid using PSHUFB if we can implement the shuffle with
41190/// a suitable short sequence of other instructions. The PSHUFB will either
41191/// use a register or have to read from memory and so is slightly (but only
41192/// slightly) more expensive than the other shuffle instructions.
41193///
41194/// Because this is inherently a quadratic operation (for each shuffle in
41195/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
41196/// This should never be an issue in practice as the shuffle lowering doesn't
41197/// produce sequences of more than 8 instructions.
41198///
41199/// FIXME: We will currently miss some cases where the redundant shuffling
41200/// would simplify under the threshold for PSHUFB formation because of
41201/// combine-ordering. To fix this, we should do the redundant instruction
41202/// combining in this recursive walk.
41203 static SDValue combineX86ShufflesRecursively(
41204 ArrayRef<SDValue> SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT,
41205 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
41206 unsigned MaxDepth, bool AllowVariableCrossLaneMask,
41207 bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
41208 const SDLoc &DL, const X86Subtarget &Subtarget) {
41209 assert(!RootMask.empty() &&
41210 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
41211 "Illegal shuffle root mask");
41212 assert(RootVT.isVector() && "Shuffles operate on vector types!");
41213 unsigned RootSizeInBits = RootVT.getSizeInBits();
41214 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41215
41216 // Bound the depth of our recursive combine because this is ultimately
41217 // quadratic in nature.
41218 if (Depth >= MaxDepth)
41219 return SDValue();
41220
41221 // Directly rip through bitcasts to find the underlying operand.
41222 SDValue Op = SrcOps[SrcOpIndex];
41223 Op = peekThroughBitcasts(Op);
41224
41225 EVT VT = Op.getValueType();
41226 if (!VT.isVector() || !VT.isSimple())
41227 return SDValue(); // Bail if we hit a non-simple non-vector.
41228
41229 // FIXME: Just bail on f16 for now.
41230 if (VT.getVectorElementType() == MVT::f16)
41231 return SDValue();
41232
41233 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
41234 "Can only combine shuffles upto size of the root op.");
41235
41236 // Create a demanded elts mask from the referenced elements of Op.
41237 APInt OpDemandedElts = APInt::getZero(RootMask.size());
41238 for (int M : RootMask) {
41239 int BaseIdx = RootMask.size() * SrcOpIndex;
41240 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
41241 OpDemandedElts.setBit(M - BaseIdx);
41242 }
41243 if (RootSizeInBits != VT.getSizeInBits()) {
41244 // Op is smaller than Root - extract the demanded elts for the subvector.
41245 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
41246 unsigned NumOpMaskElts = RootMask.size() / Scale;
41247 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
41248 assert(OpDemandedElts
41249 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
41250 .isZero() &&
41251 "Out of range elements referenced in root mask");
41252 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
41253 }
41254 OpDemandedElts =
41255 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
41256
41257 // Extract target shuffle mask and resolve sentinels and inputs.
41258 SmallVector<int, 64> OpMask;
41259 SmallVector<SDValue, 2> OpInputs;
41260 APInt OpUndef, OpZero;
41261 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
41262 OpZero, DAG, Depth, false)) {
41263 // Shuffle inputs must not be larger than the shuffle result.
41264 // TODO: Relax this for single input faux shuffles (e.g. trunc).
41265 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
41266 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
41267 }))
41268 return SDValue();
41269 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41270 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41271 !isNullConstant(Op.getOperand(1))) {
41272 SDValue SrcVec = Op.getOperand(0);
41273 int ExtractIdx = Op.getConstantOperandVal(1);
41274 unsigned NumElts = VT.getVectorNumElements();
41275 OpInputs.assign({SrcVec});
41276 OpMask.assign(NumElts, SM_SentinelUndef);
41277 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
41278 OpZero = OpUndef = APInt::getZero(NumElts);
41279 } else {
41280 return SDValue();
41281 }
41282
41283 // If the shuffle result was smaller than the root, we need to adjust the
41284 // mask indices and pad the mask with undefs.
41285 if (RootSizeInBits > VT.getSizeInBits()) {
41286 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
41287 unsigned OpMaskSize = OpMask.size();
41288 if (OpInputs.size() > 1) {
41289 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
41290 for (int &M : OpMask) {
41291 if (M < 0)
41292 continue;
41293 int EltIdx = M % OpMaskSize;
41294 int OpIdx = M / OpMaskSize;
41295 M = (PaddedMaskSize * OpIdx) + EltIdx;
41296 }
41297 }
41298 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
41299 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
41300 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
41301 }
41302
41302
41303 SmallVector<SDValue, 16> Ops;
41304 SmallVector<int, 64> Mask;
41305
41306 // We don't need to merge masks if the root is empty.
41307 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
41308 if (EmptyRoot) {
41309 // Only resolve zeros if it will remove an input, otherwise we might end
41310 // up in an infinite loop.
41311 bool ResolveKnownZeros = true;
41312 if (!OpZero.isZero()) {
41313 APInt UsedInputs = APInt::getZero(OpInputs.size());
41314 for (int i = 0, e = OpMask.size(); i != e; ++i) {
41315 int M = OpMask[i];
41316 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
41317 continue;
41318 UsedInputs.setBit(M / OpMask.size());
41319 if (UsedInputs.isAllOnes()) {
41320 ResolveKnownZeros = false;
41321 break;
41322 }
41323 }
41324 }
41325 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
41326 ResolveKnownZeros);
41327
41328 Mask = OpMask;
41329 Ops.append(OpInputs.begin(), OpInputs.end());
41330 } else {
41331 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
41332
41333 // Add the inputs to the Ops list, avoiding duplicates.
41334 Ops.append(SrcOps.begin(), SrcOps.end());
41335
41336 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
41337 // Attempt to find an existing match.
41338 SDValue InputBC = peekThroughBitcasts(Input);
41339 for (int i = 0, e = Ops.size(); i < e; ++i)
41340 if (InputBC == peekThroughBitcasts(Ops[i]))
41341 return i;
41342 // Match failed - should we replace an existing Op?
41343 if (InsertionPoint >= 0) {
41344 Ops[InsertionPoint] = Input;
41345 return InsertionPoint;
41346 }
41347 // Add to the end of the Ops list.
41348 Ops.push_back(Input);
41349 return Ops.size() - 1;
41350 };
41351
41352 SmallVector<int, 2> OpInputIdx;
41353 for (SDValue OpInput : OpInputs)
41354 OpInputIdx.push_back(
41355 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
41356
41357 assert(((RootMask.size() > OpMask.size() &&
41358 RootMask.size() % OpMask.size() == 0) ||
41359 (OpMask.size() > RootMask.size() &&
41360 OpMask.size() % RootMask.size() == 0) ||
41361 OpMask.size() == RootMask.size()) &&
41362 "The smaller number of elements must divide the larger.");
41363
41364 // This function can be performance-critical, so we rely on the power-of-2
41365 // knowledge that we have about the mask sizes to replace div/rem ops with
41366 // bit-masks and shifts.
41367 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
41368 "Non-power-of-2 shuffle mask sizes");
41369 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
41370 "Non-power-of-2 shuffle mask sizes");
41371 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
41372 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
41373
41374 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
41375 unsigned RootRatio =
41376 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
41377 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
41378 assert((RootRatio == 1 || OpRatio == 1) &&
41379 "Must not have a ratio for both incoming and op masks!");
41380
41381 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
41382 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
41383 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
41384 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
41385 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
41386
41387 Mask.resize(MaskWidth, SM_SentinelUndef);
41388
41389 // Merge this shuffle operation's mask into our accumulated mask. Note that
41390 // this shuffle's mask will be the first applied to the input, followed by
41391 // the root mask to get us all the way to the root value arrangement. The
41392 // reason for this order is that we are recursing up the operation chain.
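// Illustrative example (sizes assumed, not from the source): with a 4-element
// RootMask and an 8-element OpMask, MaskWidth is 8, RootRatio is 2 and
// OpRatio is 1, so output element i reads RootMask[i >> 1], scales it by 2
// and adds (i & 1) before the result is looked up in OpMask.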
41393 for (unsigned i = 0; i < MaskWidth; ++i) {
41394 unsigned RootIdx = i >> RootRatioLog2;
41395 if (RootMask[RootIdx] < 0) {
41396 // This is a zero or undef lane, we're done.
41397 Mask[i] = RootMask[RootIdx];
41398 continue;
41399 }
41400
41401 unsigned RootMaskedIdx =
41402 RootRatio == 1
41403 ? RootMask[RootIdx]
41404 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
41405
41406 // Just insert the scaled root mask value if it references an input other
41407 // than the SrcOp we're currently inserting.
41408 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
41409 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
41410 Mask[i] = RootMaskedIdx;
41411 continue;
41412 }
41413
41414 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
41415 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
41416 if (OpMask[OpIdx] < 0) {
41417 // The incoming lanes are zero or undef, it doesn't matter which ones we
41418 // are using.
41419 Mask[i] = OpMask[OpIdx];
41420 continue;
41421 }
41422
41423 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
41424 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
41425 : (OpMask[OpIdx] << OpRatioLog2) +
41426 (RootMaskedIdx & (OpRatio - 1));
41427
41428 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
41429 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
41430 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
41431 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
41432
41433 Mask[i] = OpMaskedIdx;
41434 }
41435 }
41436
41437 // Peek through any free bitcasts to insert_subvector vector widenings or
41438 // extract_subvector nodes back to root size.
41439 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
41440 for (auto [I, Op] : enumerate(Ops)) {
41441 SDValue BC = Op;
41442 while (1) {
41443 if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse()) {
41444 BC = BC.getOperand(0);
41445 continue;
41446 }
41447 if (BC.getOpcode() == ISD::INSERT_SUBVECTOR &&
41448 BC.getOperand(0).isUndef() && isNullConstant(BC.getOperand(2))) {
41449 // Set out of bounds mask indices to undef.
41450 Op = BC = BC.getOperand(1);
41451 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
41452 int Lo = I * Mask.size();
41453 int Hi = (I + 1) * Mask.size();
41454 int NewHi = Lo + (Mask.size() / Scale);
41455 for (int &M : Mask) {
41456 if (Lo <= M && NewHi <= M && M < Hi)
41457 M = SM_SentinelUndef;
41458 }
41459 continue;
41460 }
41461 if (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41462 (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
41463 isNullConstant(BC.getOperand(1))) {
41464 Op = BC = BC.getOperand(0);
41465 continue;
41466 }
41467 break;
41468 }
41469 }
41470
41471 // Remove unused/repeated shuffle source ops.
41472 resolveTargetShuffleInputsAndMask(Ops, Mask);
41473
41474 // Handle the all undef/zero/ones cases early.
41475 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41476 return DAG.getUNDEF(RootVT);
41477 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41478 return getZeroVector(RootVT, Subtarget, DAG, DL);
41479 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41480 !llvm::is_contained(Mask, SM_SentinelZero))
41481 return getOnesVector(RootVT, DAG, DL);
41482
41483 assert(!Ops.empty() && "Shuffle with no inputs detected");
41484
41485 // Update the list of shuffle nodes that have been combined so far.
41486 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes);
41487 CombinedNodes.push_back(Op.getNode());
41488
41489 // See if we can recurse into each shuffle source op (if it's a target
41490 // shuffle). The source op should only be generally combined if it either has
41491 // a single use (i.e. current Op) or all its users have already been combined;
41492 // if not, we can still combine but should prevent generation of variable
41493 // shuffles to avoid constant pool bloat.
41494 // Don't recurse if we already have more source ops than we can combine in
41495 // the remaining recursion depth.
41496 if (Ops.size() < (MaxDepth - Depth)) {
41497 for (int i = 0, e = Ops.size(); i < e; ++i) {
41498 // For empty roots, we need to resolve zeroable elements before combining
41499 // them with other shuffles.
41500 SmallVector<int, 64> ResolvedMask = Mask;
41501 if (EmptyRoot)
41502 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41503 bool AllowCrossLaneVar = false;
41504 bool AllowPerLaneVar = false;
41505 if (Ops[i].getNode()->hasOneUse() ||
41506 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41507 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41508 AllowPerLaneVar = AllowVariablePerLaneMask;
41509 }
41510 if (SDValue Res = combineX86ShufflesRecursively(
41511 Ops, i, RootOpc, RootVT, ResolvedMask, CombinedNodes, Depth + 1,
41512 MaxDepth, AllowCrossLaneVar, AllowPerLaneVar, IsMaskedShuffle,
41513 DAG, DL, Subtarget))
41514 return Res;
41515 }
41516 }
41517
41518 // Attempt to constant fold all of the constant source ops.
41519 if (SDValue Cst = combineX86ShufflesConstants(
41520 RootVT, Ops, Mask, CombinedNodes, DAG, DL, Subtarget))
41521 return Cst;
41522
41523 // If constant fold failed and we only have constants - then we have
41524 // multiple uses by a single non-variable shuffle - just bail.
41525 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41526 APInt UndefElts;
41527 SmallVector<APInt> RawBits;
41528 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41529 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41530 RawBits,
41531 /*AllowWholeUndefs*/ true,
41532 /*AllowPartialUndefs*/ true);
41533 })) {
41534 return SDValue();
41535 }
41536
41537 // Canonicalize the combined shuffle mask chain with horizontal ops.
41538 // NOTE: This will update the Ops and Mask.
41539 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41540 Ops, Mask, RootSizeInBits, DL, DAG, Subtarget))
41541 return DAG.getBitcast(RootVT, HOp);
41542
41543 // Try to refine our inputs given our knowledge of target shuffle mask.
41544 for (auto I : enumerate(Ops)) {
41545 int OpIdx = I.index();
41546 SDValue &Op = I.value();
41547
41548 // What range of shuffle mask element values results in picking from Op?
41549 int Lo = OpIdx * Mask.size();
41550 int Hi = Lo + Mask.size();
41551
41552 // Which elements of Op do we demand, given the mask's granularity?
41553 APInt OpDemandedElts(Mask.size(), 0);
41554 for (int MaskElt : Mask) {
41555 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41556 int OpEltIdx = MaskElt - Lo;
41557 OpDemandedElts.setBit(OpEltIdx);
41558 }
41559 }
41560
41561 // Is the shuffle result smaller than the root?
41562 if (Op.getValueSizeInBits() < RootSizeInBits) {
41563 // We padded the mask with undefs. But we now need to undo that.
41564 unsigned NumExpectedVectorElts = Mask.size();
41565 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41566 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41567 assert(!OpDemandedElts.extractBits(
41568 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41569 "Demanding the virtual undef widening padding?");
41570 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41571 }
41572
41573 // The Op itself may be of different VT, so we need to scale the mask.
41574 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41575 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41576
41577 // Can this operand be simplified any further, given its demanded elements?
41578 if (SDValue NewOp = TLI.SimplifyMultipleUseDemandedVectorElts(
41579 Op, OpScaledDemandedElts, DAG))
41580 Op = NewOp;
41581 }
41582 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41583
41584 // Widen any subvector shuffle inputs we've collected.
41585 // TODO: Remove this to avoid generating temporary nodes, we should only
41586 // widen once combineX86ShuffleChain has found a match.
41587 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41588 return Op.getValueSizeInBits() < RootSizeInBits;
41589 })) {
41590 for (SDValue &Op : Ops)
41591 if (Op.getValueSizeInBits() < RootSizeInBits)
41592 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41593 RootSizeInBits);
41594 // Reresolve - we might have repeated subvector sources.
41595 resolveTargetShuffleInputsAndMask(Ops, Mask);
41596 }
41597
41598 // Handle the all undef/zero/ones cases.
41599 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41600 return DAG.getUNDEF(RootVT);
41601 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41602 return getZeroVector(RootVT, Subtarget, DAG, DL);
41603 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41604 !llvm::is_contained(Mask, SM_SentinelZero))
41605 return getOnesVector(RootVT, DAG, DL);
41606
41607 assert(!Ops.empty() && "Shuffle with no inputs detected");
41608
41609 // We can only combine unary and binary shuffle mask cases.
41610 if (Ops.size() <= 2) {
41611 // Minor canonicalization of the accumulated shuffle mask to make it easier
41612 // to match below. All this does is detect masks with sequential pairs of
41613 // elements, and shrink them to the half-width mask. It does this in a loop
41614 // so it will reduce the size of the mask to the minimal width mask which
41615 // performs an equivalent shuffle.
41616 while (Mask.size() > 1) {
41617 SmallVector<int, 64> WidenedMask;
41618 if (!canWidenShuffleElements(Mask, WidenedMask))
41619 break;
41620 Mask = std::move(WidenedMask);
41621 }
41622
41623 // Canonicalization of binary shuffle masks to improve pattern matching by
41624 // commuting the inputs.
41625 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41626 ShuffleVectorSDNode::commuteMask(Mask);
41627 std::swap(Ops[0], Ops[1]);
41628 }
41629
41630 // Try to combine into a single shuffle instruction.
41631 if (SDValue Shuffle = combineX86ShuffleChain(
41632 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41633 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
41634 IsMaskedShuffle, DAG, DL, Subtarget))
41635 return Shuffle;
41636
41637 // If all the operands come from the same larger vector, fallthrough and try
41638 // to use combineX86ShuffleChainWithExtract.
41639 SDValue LHS = peekThroughBitcasts(Ops.front());
41640 SDValue RHS = peekThroughBitcasts(Ops.back());
41641 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41642 (RootSizeInBits / Mask.size()) != 64 ||
41643 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41644 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41645 LHS.getOperand(0) != RHS.getOperand(0))
41646 return SDValue();
41647 }
41648
41649 // If that failed and any input is extracted then try to combine as a
41650 // shuffle with the larger type.
41651 return combineX86ShuffleChainWithExtract(
41652 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41653 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
41654 DAG, DL, Subtarget);
41655}
41656
41657/// Helper entry wrapper to combineX86ShufflesRecursively.
41658 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41659 const X86Subtarget &Subtarget) {
41660 return combineX86ShufflesRecursively(
41661 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), {0}, {}, /*Depth=*/0,
41662 X86::MaxShuffleCombineDepth, /*AllowVariableCrossLaneMask=*/true,
41663 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget), DAG,
41664 SDLoc(Op), Subtarget);
41665}
41666
41667/// Get the PSHUF-style mask from PSHUF node.
41668///
41669 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41670/// PSHUF-style masks that can be reused with such instructions.
41671 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41672 MVT VT = N.getSimpleValueType();
41673 SmallVector<int, 4> Mask;
41674 SmallVector<SDValue, 2> Ops;
41675 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
41676 (void)HaveMask;
41677 assert(HaveMask);
41678
41679 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41680 // matter. Check that the upper masks are repeats and remove them.
41681 if (VT.getSizeInBits() > 128) {
41682 int LaneElts = 128 / VT.getScalarSizeInBits();
41683#ifndef NDEBUG
41684 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41685 for (int j = 0; j < LaneElts; ++j)
41686 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41687 "Mask doesn't repeat in high 128-bit lanes!");
41688#endif
41689 Mask.resize(LaneElts);
41690 }
41691
41692 switch (N.getOpcode()) {
41693 case X86ISD::PSHUFD:
41694 return Mask;
41695 case X86ISD::PSHUFLW:
41696 Mask.resize(4);
41697 return Mask;
41698 case X86ISD::PSHUFHW:
41699 Mask.erase(Mask.begin(), Mask.begin() + 4);
41700 for (int &M : Mask)
41701 M -= 4;
41702 return Mask;
41703 default:
41704 llvm_unreachable("No valid shuffle instruction found!");
41705 }
41706}
41707
41708/// Get the expanded blend mask from a BLENDI node.
41709/// For v16i16 nodes, this will splat the repeated i8 mask.
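/// e.g. (illustrative, assumed value) a v16i16 BLENDI immediate of 0xB1
/// expands to the 16-bit mask 0xB1B1.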
41710 static APInt getBLENDIBlendMask(SDValue V) {
41711 assert(V.getOpcode() == X86ISD::BLENDI && "Unknown blend shuffle");
41712 unsigned NumElts = V.getSimpleValueType().getVectorNumElements();
41713 APInt Mask = V.getConstantOperandAPInt(2);
41714 if (Mask.getBitWidth() > NumElts)
41715 Mask = Mask.trunc(NumElts);
41716 if (NumElts == 16) {
41717 assert(Mask.getBitWidth() == 8 && "Unexpected v16i16 blend mask width");
41718 Mask = APInt::getSplat(16, Mask);
41719 }
41720 assert(Mask.getBitWidth() == NumElts && "Unexpected blend mask width");
41721 return Mask;
41722}
41723
41724/// Search for a combinable shuffle across a chain ending in pshufd.
41725///
41726/// We walk up the chain and look for a combinable shuffle, skipping over
41727/// shuffles that we could hoist this shuffle's transformation past without
41728/// altering anything.
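/// For example (illustrative): a PSHUFD with mask {0,1,3,2} can be hoisted
/// past a PSHUFLW, since the low dwords are left in place and the high dwords
/// stay within the high half.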
41730 static SDValue combineRedundantDwordShuffle(SDValue N, MutableArrayRef<int> Mask,
41731 const SDLoc &DL,
41732 SelectionDAG &DAG) {
41733 assert(N.getOpcode() == X86ISD::PSHUFD &&
41734 "Called with something other than an x86 128-bit half shuffle!");
41735
41736 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41737 // of the shuffles in the chain so that we can form a fresh chain to replace
41738 // this one.
41739 SmallVector<SDValue, 8> Chain;
41740 SDValue V = N.getOperand(0);
41741 for (; V.hasOneUse(); V = V.getOperand(0)) {
41742 switch (V.getOpcode()) {
41743 default:
41744 return SDValue(); // Nothing combined!
41745
41746 case ISD::BITCAST:
41747 // Skip bitcasts as we always know the type for the target specific
41748 // instructions.
41749 continue;
41750
41751 case X86ISD::PSHUFD:
41752 // Found another dword shuffle.
41753 break;
41754
41755 case X86ISD::PSHUFLW:
41756 // Check that the low words (being shuffled) are the identity in the
41757 // dword shuffle, and the high words are self-contained.
41758 if (Mask[0] != 0 || Mask[1] != 1 ||
41759 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41760 return SDValue();
41761
41762 Chain.push_back(V);
41763 continue;
41764
41765 case X86ISD::PSHUFHW:
41766 // Check that the high words (being shuffled) are the identity in the
41767 // dword shuffle, and the low words are self-contained.
41768 if (Mask[2] != 2 || Mask[3] != 3 ||
41769 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41770 return SDValue();
41771
41772 Chain.push_back(V);
41773 continue;
41774
41775 case X86ISD::UNPCKL:
41776 case X86ISD::UNPCKH:
41777 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41778 // shuffle into a preceding word shuffle.
41779 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41780 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41781 return SDValue();
41782
41783 // Search for a half-shuffle which we can combine with.
41784 unsigned CombineOp =
41785 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41786 if (V.getOperand(0) != V.getOperand(1) ||
41787 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41788 return SDValue();
41789 Chain.push_back(V);
41790 V = V.getOperand(0);
41791 do {
41792 switch (V.getOpcode()) {
41793 default:
41794 return SDValue(); // Nothing to combine.
41795
41796 case X86ISD::PSHUFLW:
41797 case X86ISD::PSHUFHW:
41798 if (V.getOpcode() == CombineOp)
41799 break;
41800
41801 Chain.push_back(V);
41802
41803 [[fallthrough]];
41804 case ISD::BITCAST:
41805 V = V.getOperand(0);
41806 continue;
41807 }
41808 break;
41809 } while (V.hasOneUse());
41810 break;
41811 }
41812 // Break out of the loop if we break out of the switch.
41813 break;
41814 }
41815
41816 if (!V.hasOneUse())
41817 // We fell out of the loop without finding a viable combining instruction.
41818 return SDValue();
41819
41820 // Merge this node's mask and our incoming mask.
41821 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41822 for (int &M : Mask)
41823 M = VMask[M];
41824 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41825 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41826
41827 // Rebuild the chain around this new shuffle.
41828 while (!Chain.empty()) {
41829 SDValue W = Chain.pop_back_val();
41830
41831 if (V.getValueType() != W.getOperand(0).getValueType())
41832 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41833
41834 switch (W.getOpcode()) {
41835 default:
41836 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
41837
41838 case X86ISD::UNPCKL:
41839 case X86ISD::UNPCKH:
41840 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41841 break;
41842
41843 case X86ISD::PSHUFD:
41844 case X86ISD::PSHUFLW:
41845 case X86ISD::PSHUFHW:
41846 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41847 break;
41848 }
41849 }
41850 if (V.getValueType() != N.getValueType())
41851 V = DAG.getBitcast(N.getValueType(), V);
41852
41853 // Return the new chain to replace N.
41854 return V;
41855}
41856
41857// Attempt to commute shufps LHS loads:
41858// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
41859static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41860 SelectionDAG &DAG) {
41861 // TODO: Add vXf64 support.
41862 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41863 return SDValue();
41864
41865 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41866 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41867 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41868 return SDValue();
41869 SDValue N0 = V.getOperand(0);
41870 SDValue N1 = V.getOperand(1);
41871 unsigned Imm = V.getConstantOperandVal(2);
41872 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41873 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41874 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41875 return SDValue();
41876 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41877 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41878 DAG.getTargetConstant(Imm, DL, MVT::i8));
41879 };
41880
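// Note: SHUFP selects its low two result elements from the first operand and
// its high two from the second, so the commuted node computes the original
// result with the element pairs swapped (new[j] == old[j ^ 2]). The callers
// below compensate by XORing the affected 2-bit index fields of the outer
// shuffle immediate: 0xAA flips all four fields, while 0x0A/0xA0 flip only
// the fields that read from the commuted operand.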
41881 switch (N.getOpcode()) {
41882 case X86ISD::VPERMILPI:
41883 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41884 unsigned Imm = N.getConstantOperandVal(1);
41885 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41886 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41887 }
41888 break;
41889 case X86ISD::SHUFP: {
41890 SDValue N0 = N.getOperand(0);
41891 SDValue N1 = N.getOperand(1);
41892 unsigned Imm = N.getConstantOperandVal(2);
41893 if (N0 == N1) {
41894 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41895 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41896 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41897 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41898 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41899 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41900 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41901 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41902 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41903 }
41904 break;
41905 }
41906 }
41907
41908 return SDValue();
41909}
41910
41911// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
41912// iff we don't demand the same element index for both X and Y.
41913static SDValue
41914 combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
41915 const APInt &DemandedElts, SelectionDAG &DAG,
41916 const X86Subtarget &Subtarget, const SDLoc &DL) {
41917 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
41918 if (!N0.hasOneUse() || !N1.hasOneUse())
41919 return SDValue();
41920
41921 unsigned NumElts = VT.getVectorNumElements();
41922
41923 SDValue BC0 = peekThroughOneUseBitcasts(N0);
41924 SDValue BC1 = peekThroughOneUseBitcasts(N1);
41925 // See if both operands are shuffles, and that we can scale the shuffle masks
41926 // to the same width as the blend mask.
41927 // TODO: Support SM_SentinelZero?
41928 SmallVector<SDValue, 2> Ops0, Ops1;
41929 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
41930 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
41931 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
41932 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
41933 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
41934 return SDValue();
41935
41936 // Determine the demanded elts from both permutes.
41937 APInt Demanded0, DemandedLHS0, DemandedRHS0;
41938 APInt Demanded1, DemandedLHS1, DemandedRHS1;
41939 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
41940 Demanded1,
41941 /*AllowUndefElts=*/true) ||
41942 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
41943 DemandedRHS0, /*AllowUndefElts=*/true) ||
41944 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
41945 DemandedRHS1, /*AllowUndefElts=*/true))
41946 return SDValue();
41947
41948 // Confirm that we only use a single operand from both permutes and that we
41949 // don't demand the same index from both.
41950 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
41951 DemandedLHS0.intersects(DemandedLHS1))
41952 return SDValue();
41953
41954 // Use the permute demanded elts masks as the new blend mask.
41955 // Create the new permute mask as a blend of the 2 original permute masks.
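// For example, blending elements {0,2} of permute(X,{3,_,1,_}) with elements
// {1,3} of permute(Y,{_,2,_,0}) under blend mask {0,5,2,7} becomes
// NewBlendMask {4,1,6,3} and NewPermuteMask {3,2,1,0}, i.e.
// permute(blend(X,Y),{3,2,1,0}).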
41956 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
41957 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
41958 for (unsigned I = 0; I != NumElts; ++I) {
41959 if (Demanded0[I]) {
41960 int M = ScaledMask0[I];
41961 if (0 <= M) {
41962 assert(isUndefOrEqual(NewBlendMask[M], M) &&
41963 "BlendMask demands LHS AND RHS");
41964 NewBlendMask[M] = M;
41965 NewPermuteMask[I] = M;
41966 }
41967 } else if (Demanded1[I]) {
41968 int M = ScaledMask1[I];
41969 if (0 <= M) {
41970 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
41971 "BlendMask demands LHS AND RHS");
41972 NewBlendMask[M] = M + NumElts;
41973 NewPermuteMask[I] = M;
41974 }
41975 }
41976 }
41977 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
41978 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
41979
41980 // v16i16 shuffles can explode in complexity very easily; only accept them if
41981 // the blend mask is the same in the 128-bit subvectors (or can widen to
41982 // v8i32) and the permute can be widened as well.
41983 if (VT == MVT::v16i16) {
41984 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
41985 !canWidenShuffleElements(NewBlendMask))
41986 return SDValue();
41987 if (!canWidenShuffleElements(NewPermuteMask))
41988 return SDValue();
41989 }
41990
41991 // Don't introduce lane-crossing permutes without AVX2, unless it can be
41992 // widened to a lane permute (vperm2f128).
41993 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
41994 isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(),
41995 NewPermuteMask) &&
41996 !canScaleShuffleElements(NewPermuteMask, 2))
41997 return SDValue();
41998
41999 SDValue NewBlend =
42000 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
42001 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
42002 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
42003 NewPermuteMask);
42004}
42005
42006// TODO - move this to TLI like isBinOp?
42007static bool isUnaryOp(unsigned Opcode) {
42008 switch (Opcode) {
42009 case ISD::CTLZ:
42010 case ISD::CTTZ:
42011 case ISD::CTPOP:
42012 return true;
42013 }
42014 return false;
42015}
42016
42017// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
42018// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
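// For example, PSHUFD(XOR(X, AllOnes)) becomes XOR(PSHUFD(X), AllOnes), since
// the all-ones constant shuffles freely; similarly, a binop operand that is
// itself a one-use shuffle lets the two shuffles combine with each other.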
42019static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
42020 const SDLoc &DL) {
42021 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42022 EVT ShuffleVT = N.getValueType();
42023 unsigned Opc = N.getOpcode();
42024
42025 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true) {
42026 // AllZeros/AllOnes constants are freely shuffled and will peek through
42027 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
42028 // merge with target shuffles if it has one use so shuffle combining is
42029 // likely to kick in. Shuffles of splats are expected to be removed.
42030 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
42031 ISD::isBuildVectorAllZeros(Op.getNode()) ||
42035 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
42036 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
42037 (Op.getOpcode() == ISD::CONCAT_VECTORS && Op->hasOneUse()) ||
42038 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
42039 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
42040 };
42041 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
42042 // Ensure we only shuffle whole vector src elements, unless it's a logical
42043 // binop where we can more aggressively move shuffles from dst to src.
42044 return isLogicOp(BinOp) ||
42045 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
42046 };
42047
42048 switch (Opc) {
42049 // Unary and Unary+Permute Shuffles.
42050 case X86ISD::PSHUFB: {
42051 // Don't merge PSHUFB if it contains zero'd elements.
42052 SmallVector<int> Mask;
42053 SmallVector<SDValue> Ops;
42054 if (!getTargetShuffleMask(N, false, Ops, Mask))
42055 break;
42056 [[fallthrough]];
42057 }
42058 case X86ISD::VBROADCAST:
42059 case X86ISD::MOVDDUP:
42060 case X86ISD::PSHUFD:
42061 case X86ISD::PSHUFHW:
42062 case X86ISD::PSHUFLW:
42063 case X86ISD::VPERMV:
42064 case X86ISD::VPERMI:
42065 case X86ISD::VPERMILPI: {
42066 unsigned SrcIdx = Opc == X86ISD::VPERMV ? 1 : 0;
42067 if (N.getOperand(SrcIdx).getValueType() == ShuffleVT &&
42068 N->isOnlyUserOf(N.getOperand(SrcIdx).getNode())) {
42069 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(SrcIdx));
42070 unsigned SrcOpcode = N0.getOpcode();
42071 EVT OpVT = N0.getValueType();
42072 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
42073 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42074 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42075 bool FoldShuf = Opc != X86ISD::VPERMI && Opc != X86ISD::VPERMV;
42076 if (IsMergeableWithShuffle(Op00, FoldShuf) ||
42077 IsMergeableWithShuffle(Op01, FoldShuf)) {
42078 SDValue LHS, RHS;
42079 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42080 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42081 if (Opc == X86ISD::VPERMV) {
42082 LHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op00);
42083 RHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op01);
42084 } else if (N.getNumOperands() == 2) {
42085 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
42086 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
42087 } else {
42088 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
42089 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
42090 }
42091 return DAG.getBitcast(ShuffleVT,
42092 DAG.getNode(SrcOpcode, DL, OpVT,
42093 DAG.getBitcast(OpVT, LHS),
42094 DAG.getBitcast(OpVT, RHS)));
42095 }
42096 }
42097 if (SrcOpcode == ISD::SINT_TO_FP && IsSafeToMoveShuffle(N0, SrcOpcode) &&
42098 OpVT.getScalarSizeInBits() ==
42099 N0.getOperand(0).getScalarValueSizeInBits()) {
42100 SDValue Res = DAG.getBitcast(ShuffleVT, N0.getOperand(0));
42101 if (Opc == X86ISD::VPERMV)
42102 Res = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Res);
42103 else if (N.getNumOperands() == 2)
42104 Res = DAG.getNode(Opc, DL, ShuffleVT, Res, N.getOperand(1));
42105 else
42106 Res = DAG.getNode(Opc, DL, ShuffleVT, Res);
42107 Res = DAG.getBitcast(N0.getOperand(0).getValueType(), Res);
42108 return DAG.getBitcast(ShuffleVT, DAG.getNode(SrcOpcode, DL, OpVT, Res));
42109 }
42110 }
42111 break;
42112 }
42113 // Binary and Binary+Permute Shuffles.
42114 case X86ISD::INSERTPS: {
42115 // Don't merge INSERTPS if it contains zero'd elements.
42116 unsigned InsertPSMask = N.getConstantOperandVal(2);
42117 unsigned ZeroMask = InsertPSMask & 0xF;
42118 if (ZeroMask != 0)
42119 break;
42120 [[fallthrough]];
42121 }
42122 case X86ISD::MOVSD:
42123 case X86ISD::MOVSS:
42124 case X86ISD::BLENDI:
42125 case X86ISD::SHUFP:
42126 case X86ISD::UNPCKH:
42127 case X86ISD::UNPCKL: {
42128 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
42129 N->isOnlyUserOf(N.getOperand(1).getNode())) {
42130 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
42131 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
42132 unsigned SrcOpcode = N0.getOpcode();
42133 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42134 N0.getValueType() == N1.getValueType() &&
42135 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42136 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42137 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42138 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42139 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42140 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
42141 // Ensure the total number of shuffles doesn't increase by folding this
42142 // shuffle through to the source ops.
42143 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
42144 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
42145 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
42146 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
42147 SDValue LHS, RHS;
42148 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42149 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42150 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42151 Op11 = DAG.getBitcast(ShuffleVT, Op11);
42152 if (N.getNumOperands() == 3) {
42153 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42154 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
42155 } else {
42156 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42157 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
42158 }
42159 EVT OpVT = N0.getValueType();
42160 return DAG.getBitcast(ShuffleVT,
42161 DAG.getNode(SrcOpcode, DL, OpVT,
42162 DAG.getBitcast(OpVT, LHS),
42163 DAG.getBitcast(OpVT, RHS)));
42164 }
42165 }
42166 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42167 N0.getValueType() == N1.getValueType() &&
42168 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42169 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42170 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42171 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42172 SDValue Res;
42173 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42174 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42175 if (N.getNumOperands() == 3) {
42176 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42177 } else {
42178 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42179 }
42180 EVT OpVT = N0.getValueType();
42181 return DAG.getBitcast(
42182 ShuffleVT,
42183 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
42184 }
42185 // TODO: We can generalize this for other shuffles/conversions.
42186 if (Opc == X86ISD::UNPCKL && SrcOpcode == X86ISD::CVTPH2PS &&
42187 N1.getOpcode() == SrcOpcode &&
42188 N0.getValueType() == N1.getValueType() &&
42189 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
42190 ShuffleVT.getScalarSizeInBits() == N0.getScalarValueSizeInBits() &&
42191 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42192 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42193 EVT OpSrcVT = N0.getOperand(0).getValueType();
42194 EVT OpDstVT = N0.getValueType();
42195 SDValue Res =
42196 DAG.getNode(Opc, DL, OpSrcVT, N0.getOperand(0), N1.getOperand(0));
42197 return DAG.getBitcast(ShuffleVT,
42198 DAG.getNode(SrcOpcode, DL, OpDstVT, Res));
42199 }
42200 }
42201 break;
42202 }
42203 }
42204 return SDValue();
42205}
42206
42207/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
42208static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
42209 SelectionDAG &DAG,
42210 const SDLoc &DL) {
42211 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
42212
42213 MVT VT = V.getSimpleValueType();
42214 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
42215 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
42216 unsigned SrcOpc0 = Src0.getOpcode();
42217 unsigned SrcOpc1 = Src1.getOpcode();
42218 EVT SrcVT0 = Src0.getValueType();
42219 EVT SrcVT1 = Src1.getValueType();
42220
42221 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
42222 return SDValue();
42223
42224 switch (SrcOpc0) {
42225 case X86ISD::MOVDDUP: {
42226 SDValue LHS = Src0.getOperand(0);
42227 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42228 SDValue Res =
42229 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
42230 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
42231 return DAG.getBitcast(VT, Res);
42232 }
42233 case X86ISD::VPERMILPI:
42234 // TODO: Handle v4f64 permutes with different low/high lane masks.
42235 if (SrcVT0 == MVT::v4f64) {
42236 uint64_t Mask = Src0.getConstantOperandVal(1);
42237 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
42238 break;
42239 }
42240 [[fallthrough]];
42241 case X86ISD::VSHLI:
42242 case X86ISD::VSRLI:
42243 case X86ISD::VSRAI:
42244 case X86ISD::PSHUFD:
42245 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
42246 SDValue LHS = Src0.getOperand(0);
42247 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42248 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
42249 V.getOperand(2));
42250 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
42251 return DAG.getBitcast(VT, Res);
42252 }
42253 break;
42254 }
42255
42256 return SDValue();
42257}
42258
42259/// Try to combine x86 target specific shuffles.
42260static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
42261 SelectionDAG &DAG,
42262 TargetLowering::DAGCombinerInfo &DCI,
42263 const X86Subtarget &Subtarget) {
42264 using namespace SDPatternMatch;
42265
42266 MVT VT = N.getSimpleValueType();
42267 unsigned NumElts = VT.getVectorNumElements();
42268 SmallVector<int, 4> Mask;
42269 unsigned Opcode = N.getOpcode();
42270 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42271
42272 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
42273 return R;
42274
42275 // Handle specific target shuffles.
42276 switch (Opcode) {
42277 case X86ISD::MOVDDUP: {
42278 SDValue Src = N.getOperand(0);
42279 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
42280 if (VT == MVT::v2f64 && Src.hasOneUse() &&
42281 ISD::isNormalLoad(Src.getNode())) {
42282 LoadSDNode *LN = cast<LoadSDNode>(Src);
42283 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
42284 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
42285 DCI.CombineTo(N.getNode(), Movddup);
42286 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42288 return N; // Return N so it doesn't get rechecked!
42289 }
42290 }
42291
42292 return SDValue();
42293 }
42294 case X86ISD::VBROADCAST: {
42295 SDValue Src = N.getOperand(0);
42296 SDValue BC = peekThroughBitcasts(Src);
42297 EVT SrcVT = Src.getValueType();
42298 EVT BCVT = BC.getValueType();
42299
42300 // If broadcasting from another shuffle, attempt to simplify it.
42301 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
42302 if (isTargetShuffle(BC.getOpcode()) &&
42303 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
42304 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
42305 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
42306 SM_SentinelUndef);
42307 for (unsigned i = 0; i != Scale; ++i)
42308 DemandedMask[i] = i;
42309 if (SDValue Res = combineX86ShufflesRecursively(
42310 {BC}, 0, BC.getOpcode(), BC.getSimpleValueType(), DemandedMask,
42311 {}, /*Depth=*/0, X86::MaxShuffleCombineDepth,
42312 /*AllowVariableCrossLaneMask=*/true,
42313 /*AllowVariablePerLaneMask=*/true,
42314 /*IsMaskedShuffle=*/false, DAG, DL, Subtarget))
42315 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42316 DAG.getBitcast(SrcVT, Res));
42317 }
42318
42319 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
42320 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
42321 if (Src.getOpcode() == ISD::BITCAST &&
42322 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
42323 TLI.isTypeLegal(BCVT) &&
42324 FixedVectorType::isValidElementType(
42325 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
42326 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
42327 VT.getVectorNumElements());
42328 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42329 }
42330
42331 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
42332 // If we're re-broadcasting a smaller type then broadcast with that type and
42333 // bitcast.
42334 // TODO: Do this for any splat?
42335 if (Src.getOpcode() == ISD::BITCAST &&
42336 (BC.getOpcode() == X86ISD::VBROADCAST ||
42337 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
42338 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
42339 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
42340 MVT NewVT =
42341 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
42342 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
42343 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42344 }
42345
42346 // Reduce broadcast source vector to lowest 128-bits.
42347 if (SrcVT.getSizeInBits() > 128)
42348 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42349 extract128BitVector(Src, 0, DAG, DL));
42350
42351 // broadcast(scalar_to_vector(x)) -> broadcast(x).
42352 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42353 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
42354 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42355
42356 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
42357 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
42358 isNullConstant(Src.getOperand(1)) &&
42359 Src.getValueType() ==
42360 Src.getOperand(0).getValueType().getScalarType() &&
42361 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
42362 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42363
42364 // Share broadcast with the longest vector and extract low subvector (free).
42365 // Ensure the same SDValue from the SDNode use is being used.
42366 for (SDNode *User : Src->users())
42367 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
42368 Src == User->getOperand(0) &&
42369 User->getValueSizeInBits(0).getFixedValue() >
42370 VT.getFixedSizeInBits()) {
42371 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
42372 VT.getSizeInBits());
42373 }
42374
42375 // vbroadcast(scalarload X) -> vbroadcast_load X
42376 // For float loads, extract other uses of the scalar from the broadcast.
42377 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
42378 ISD::isNormalLoad(Src.getNode())) {
42379 LoadSDNode *LN = cast<LoadSDNode>(Src);
42380 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42381 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42382 SDValue BcastLd =
42383 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42384 LN->getMemoryVT(), LN->getMemOperand());
42385 // If the load value is used only by N, replace it via CombineTo N.
42386 bool NoReplaceExtract = Src.hasOneUse();
42387 DCI.CombineTo(N.getNode(), BcastLd);
42388 if (NoReplaceExtract) {
42389 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42391 } else {
42392 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
42393 DAG.getVectorIdxConstant(0, DL));
42394 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
42395 }
42396 return N; // Return N so it doesn't get rechecked!
42397 }
42398
42399 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
42400 // i16. So shrink it ourselves if we can make a broadcast_load.
42401 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
42402 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
42403 assert(Subtarget.hasAVX2() && "Expected AVX2");
42404 SDValue TruncIn = Src.getOperand(0);
42405
42406 // If this is a truncate of a non-extending load, we can just narrow it to
42407 // use a broadcast_load.
42408 if (ISD::isNormalLoad(TruncIn.getNode())) {
42409 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
42410 // Unless it's volatile or atomic.
42411 if (LN->isSimple()) {
42412 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42413 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42414 SDValue BcastLd = DAG.getMemIntrinsicNode(
42415 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42416 LN->getPointerInfo(), LN->getBaseAlign(),
42417 LN->getMemOperand()->getFlags());
42418 DCI.CombineTo(N.getNode(), BcastLd);
42419 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42420 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42421 return N; // Return N so it doesn't get rechecked!
42422 }
42423 }
42424
42425 // If this is a truncate of an i16 extload, we can directly replace it.
42426 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
42427 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
42428 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
42429 if (LN->getMemoryVT().getSizeInBits() == 16) {
42430 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42431 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42432 SDValue BcastLd =
42433 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42434 LN->getMemoryVT(), LN->getMemOperand());
42435 DCI.CombineTo(N.getNode(), BcastLd);
42436 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42437 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42438 return N; // Return N so it doesn't get rechecked!
42439 }
42440 }
42441
42442 // If this is a truncate of a load that has been shifted right, we can
42443 // offset the pointer and use a narrower load.
42444 if (TruncIn.getOpcode() == ISD::SRL &&
42445 TruncIn.getOperand(0).hasOneUse() &&
42446 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
42447 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
42448 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
42449 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
42450 // Make sure the shift amount and the load size are divisible by 16.
42451 // Don't do this if the load is volatile or atomic.
42452 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
42453 LN->isSimple()) {
42454 unsigned Offset = ShiftAmt / 8;
42455 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42456 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
42457 TypeSize::getFixed(Offset), DL);
42458 SDValue Ops[] = { LN->getChain(), Ptr };
42459 SDValue BcastLd = DAG.getMemIntrinsicNode(
42460 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42461 LN->getPointerInfo().getWithOffset(Offset), LN->getBaseAlign(),
42462 LN->getMemOperand()->getFlags());
42463 DCI.CombineTo(N.getNode(), BcastLd);
42464 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42465 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42466 return N; // Return N so it doesn't get rechecked!
42467 }
42468 }
42469 }
42470
42471 // vbroadcast(vzload X) -> vbroadcast_load X
42472 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
42473 auto *LN = cast<MemSDNode>(Src);
42474 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
42475 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42476 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42477 SDValue BcastLd =
42478 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42479 LN->getMemoryVT(), LN->getMemOperand());
42480 DCI.CombineTo(N.getNode(), BcastLd);
42481 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42483 return N; // Return N so it doesn't get rechecked!
42484 }
42485 }
42486
42487 // vbroadcast(vector load X) -> vbroadcast_load
42488 if (Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
42489 LoadSDNode *LN = cast<LoadSDNode>(Src);
42490 // Unless the load is volatile or atomic.
42491 if (LN->isSimple()) {
42492 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42493 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42494 SDValue BcastLd = DAG.getMemIntrinsicNode(
42495 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
42496 LN->getPointerInfo(), LN->getBaseAlign(),
42497 LN->getMemOperand()->getFlags());
42498 DCI.CombineTo(N.getNode(), BcastLd);
42499 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42501 return N; // Return N so it doesn't get rechecked!
42502 }
42503 }
42504
42505 return SDValue();
42506 }
42507 case X86ISD::VZEXT_MOVL: {
42508 SDValue N0 = N.getOperand(0);
42509
42510 // Fold (vzmovl (shift x, y)) -> (shift (vzmovl x), y)
42511 // Zeroing out the upper elements means we're just shifting a zero value.
42512 // TODO: Try harder to move vzmovl upward towards SCALAR_TO_VECTOR nodes.
42513 // TODO: Move this to canonicalizeShuffleWithOp once we add zero handling.
42514 if (N0.getOpcode() == X86ISD::VSHL || N0.getOpcode() == X86ISD::VSHLI ||
42515 N0.getOpcode() == X86ISD::VSRL || N0.getOpcode() == X86ISD::VSRLI ||
42516 N0.getOpcode() == X86ISD::VSRA || N0.getOpcode() == X86ISD::VSRAI) {
42517 if (N0.hasOneUse())
42518 return DAG.getNode(
42519 N0.getOpcode(), DL, VT,
42520 DAG.getNode(X86ISD::VZEXT_MOVL, DL, VT, N0.getOperand(0)),
42521 N0.getOperand(1));
42522 }
42523
42524 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
42525 // the load is volatile.
42526 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42527 auto *LN = cast<LoadSDNode>(N0);
42528 if (SDValue VZLoad =
42529 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42530 DCI.CombineTo(N.getNode(), VZLoad);
42531 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42533 return N;
42534 }
42535 }
42536
42537 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42538 // and can just use a VZEXT_LOAD.
42539 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42540 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42541 auto *LN = cast<MemSDNode>(N0);
42542 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42543 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42544 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42545 SDValue VZLoad =
42546 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
42547 LN->getMemoryVT(), LN->getMemOperand());
42548 DCI.CombineTo(N.getNode(), VZLoad);
42549 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42551 return N;
42552 }
42553 }
42554
42555 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42556 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42557 // if the upper bits of the i64 are zero.
42558 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42559 N0.getOperand(0).hasOneUse() &&
42560 N0.getOperand(0).getValueType() == MVT::i64) {
42561 SDValue In = N0.getOperand(0);
42562 APInt Mask = APInt::getHighBitsSet(64, 32);
42563 if (DAG.MaskedValueIsZero(In, Mask)) {
42564 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42565 MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
42566 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42567 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42568 return DAG.getBitcast(VT, Movl);
42569 }
42570 }
42571
42572 // Load a scalar integer constant directly to XMM instead of transferring an
42573 // immediate value from GPR.
42574 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42575 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42576 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42577 // Create a vector constant - scalar constant followed by zeros.
42578 EVT ScalarVT = N0.getOperand(0).getValueType();
42579 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42580 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42581 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42582 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42583
42584 // Load the vector constant from constant pool.
42585 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
42586 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42587 MachinePointerInfo MPI =
42588 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
42589 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42590 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42591 MachineMemOperand::MOLoad);
42592 }
42593 }
42594
42595 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42596 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42597 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42598 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42599 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42600 SDValue V = peekThroughOneUseBitcasts(N0);
42601
42602 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42603 isNullConstant(V.getOperand(2))) {
42604 SDValue In = V.getOperand(1);
42605 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
42606 In.getValueSizeInBits() /
42607 VT.getScalarSizeInBits());
42608 In = DAG.getBitcast(SubVT, In);
42609 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42610 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42611 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42612 V.getOperand(2));
42613 }
42614 }
42615
42616 return SDValue();
42617 }
42618 case X86ISD::BLENDI: {
42619 SDValue N0 = N.getOperand(0);
42620 SDValue N1 = N.getOperand(1);
42621 unsigned EltBits = VT.getScalarSizeInBits();
42622
42623 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
42624 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42625 // TODO: Handle MVT::v16i16 repeated blend mask.
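// For example, a v4i64 blend with immediate 0b0101 on bitcasts from v8i32
// sources can instead be done as a v8i32 blend with immediate 0b00110011
// (each mask bit duplicated per narrower element).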
42626 if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42627 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42628 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42629 if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
42630 unsigned NewSize = SrcVT.getVectorNumElements();
42631 APInt BlendMask = getBLENDIBlendMask(N);
42632 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
42633 return DAG.getBitcast(
42634 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42635 N1.getOperand(0),
42636 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
42637 DL, MVT::i8)));
42638 }
42639 }
42640 // Share PSHUFB masks:
42641 // blend(pshufb(x,m1),pshufb(y,m2))
42642 // --> m3 = blend(m1,m2)
42643 // blend(pshufb(x,m3),pshufb(y,m3))
42644 if (N0.hasOneUse() && N1.hasOneUse()) {
42645 SmallVector<int> Mask, ByteMask;
42646 SmallVector<SDValue> Ops;
42647 SDValue LHS = peekThroughOneUseBitcasts(N0);
42648 SDValue RHS = peekThroughOneUseBitcasts(N1);
42649 if (LHS.getOpcode() == X86ISD::PSHUFB &&
42650 RHS.getOpcode() == X86ISD::PSHUFB &&
42651 LHS.getOperand(1) != RHS.getOperand(1) &&
42652 LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
42653 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42654 assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
42655 RHS == peekThroughOneUseBitcasts(Ops[1]) &&
42656 "BLENDI decode mismatch");
42657 MVT ShufVT = LHS.getSimpleValueType();
42658 SDValue MaskLHS = LHS.getOperand(1);
42659 SDValue MaskRHS = RHS.getOperand(1);
42660 llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
42661 if (SDValue NewMask = combineX86ShufflesConstants(
42662 ShufVT, {MaskLHS, MaskRHS}, ByteMask,
42663 {LHS.getNode(), RHS.getNode()}, DAG, DL, Subtarget)) {
42664 SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42665 LHS.getOperand(0), NewMask);
42666 SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42667 RHS.getOperand(0), NewMask);
42668 return DAG.getNode(X86ISD::BLENDI, DL, VT,
42669 DAG.getBitcast(VT, NewLHS),
42670 DAG.getBitcast(VT, NewRHS), N.getOperand(2));
42671 }
42672 }
42673 }
42674 }
42675 return SDValue();
42676 }
42677 case X86ISD::SHUFP: {
42678 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42679 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42680 // TODO: Support types other than v4f32.
42681 if (VT == MVT::v4f32) {
42682 bool Updated = false;
42683 SmallVector<int> Mask;
42684 SmallVector<SDValue> Ops;
42685 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
42686 for (int i = 0; i != 2; ++i) {
42687 SmallVector<SDValue> SubOps;
42688 SmallVector<int> SubMask, SubScaledMask;
42689 SDValue Sub = peekThroughBitcasts(Ops[i]);
42690 // TODO: Scaling might be easier if we specify the demanded elts.
42691 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42692 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42693 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42694 int Ofs = i * 2;
42695 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42696 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42697 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42698 Updated = true;
42699 }
42700 }
42701 }
42702 if (Updated) {
42703 for (int &M : Mask)
42704 M %= 4;
42705 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42706 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42707 }
42708 }
42709 return SDValue();
42710 }
42711 case X86ISD::VPERMI: {
42712 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42713 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42714 SDValue N0 = N.getOperand(0);
42715 SDValue N1 = N.getOperand(1);
42716 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42717 if (N0.getOpcode() == ISD::BITCAST &&
42718 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42719 SDValue Src = N0.getOperand(0);
42720 EVT SrcVT = Src.getValueType();
42721 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42722 return DAG.getBitcast(VT, Res);
42723 }
42724 return SDValue();
42725 }
42726 case X86ISD::SHUF128: {
42727 // If we're permuting the upper 256-bits subvectors of a concatenation, then
42728 // see if we can peek through and access the subvector directly.
42729 if (VT.is512BitVector()) {
42730 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only
42731 // the upper subvector is used.
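// For example, selector values 2 or 3 in both LHS index fields (Mask & 0x0A)
// mean only the upper 256 bits of LHS are read, so a concat(X,Y) operand can
// be replaced by Y widened to 512 bits with those index msbs cleared.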
42732 SDValue LHS = peekThroughBitcasts(N->getOperand(0));
42733 SDValue RHS = peekThroughBitcasts(N->getOperand(1));
42734 uint64_t Mask = N->getConstantOperandVal(2);
42735 SmallVector<SDValue> LHSOps, RHSOps;
42736 SDValue NewLHS, NewRHS;
42737 if ((Mask & 0x0A) == 0x0A &&
42738 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
42739 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
42740 Mask &= ~0x0A;
42741 }
42742 if ((Mask & 0xA0) == 0xA0 &&
42743 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
42744 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
42745 Mask &= ~0xA0;
42746 }
42747 if (NewLHS || NewRHS)
42748 return DAG.getNode(X86ISD::SHUF128, DL, VT,
42749 DAG.getBitcast(VT, NewLHS ? NewLHS : LHS),
42750 DAG.getBitcast(VT, NewRHS ? NewRHS : RHS),
42751 DAG.getTargetConstant(Mask, DL, MVT::i8));
42752 }
42753 return SDValue();
42754 }
42755 case X86ISD::VPERM2X128: {
42756 SDValue LHS = N->getOperand(0);
42757 SDValue RHS = N->getOperand(1);
42758 unsigned Imm = N.getConstantOperandVal(2) & 255;
42759
42760 // Canonicalize unary/repeated operands to LHS.
42761 if (LHS.isUndef() && !RHS.isUndef())
42762 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, RHS, LHS,
42763 DAG.getTargetConstant(Imm ^ 0x22, DL, MVT::i8));
42764 if (LHS == RHS)
42765 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, DAG.getUNDEF(VT),
42766 DAG.getTargetConstant(Imm & ~0x22, DL, MVT::i8));
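// Bits [1:0] and [5:4] of the immediate pick a 128-bit half for each result
// lane, and bit 1 of each field selects between the two sources. Swapping the
// operands therefore flips those bits (XOR 0x22); with identical operands
// they can simply be cleared (AND ~0x22).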
42767
42768 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42769 if (LHS.getOpcode() == ISD::BITCAST &&
42770 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42771 EVT SrcVT = LHS.getOperand(0).getValueType();
42772 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42773 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42774 DAG.getBitcast(SrcVT, LHS),
42775 DAG.getBitcast(SrcVT, RHS),
42776 N->getOperand(2)));
42777 }
42778 }
42779
42780 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42781 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42782 return Res;
42783
42784 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42785 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42786 auto FindSubVector128 = [&](unsigned Idx) {
42787 if (Idx > 3)
42788 return SDValue();
42789 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42790 SmallVector<SDValue> SubOps;
42791 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42792 return SubOps[Idx & 1];
42793 unsigned NumElts = Src.getValueType().getVectorNumElements();
42794 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42795 Src.getOperand(1).getValueSizeInBits() == 128 &&
42796 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42797 return Src.getOperand(1);
42798 }
42799 return SDValue();
42800 };
42801 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42802 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42803 MVT SubVT = VT.getHalfNumVectorElementsVT();
42804 SubLo = DAG.getBitcast(SubVT, SubLo);
42805 SubHi = DAG.getBitcast(SubVT, SubHi);
42806 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42807 }
42808 }
42809
42810 // Attempt to match VBROADCAST*128 subvector broadcast load.
42811 if (RHS.isUndef()) {
42812 SmallVector<int, 4> Mask;
42813 DecodeVPERM2X128Mask(4, Imm, Mask);
42814 if (isUndefOrInRange(Mask, 0, 4)) {
42815 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, LHS);
42816 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, LHS);
42817 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() &&
42818 X86::mayFoldLoad(LHS, Subtarget, /*AssumeSingleUse=*/true)) {
42819 MVT MemVT = VT.getHalfNumVectorElementsVT();
42820 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
42821 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL, VT, MemVT,
42822 cast<LoadSDNode>(LHS), Ofs, DAG);
42823 }
42824 }
42825 }
42826
42827 return SDValue();
42828 }
42829 case X86ISD::PSHUFD:
42830 case X86ISD::PSHUFLW:
42831 case X86ISD::PSHUFHW: {
42832 SDValue N0 = N.getOperand(0);
42833 SDValue N1 = N.getOperand(1);
42834 if (N0->hasOneUse()) {
42835 SDValue V = peekThroughOneUseBitcasts(N0);
42836 switch (V.getOpcode()) {
42837 case X86ISD::VSHL:
42838 case X86ISD::VSRL:
42839 case X86ISD::VSRA:
42840 case X86ISD::VSHLI:
42841 case X86ISD::VSRLI:
42842 case X86ISD::VSRAI:
42843 case X86ISD::VROTLI:
42844 case X86ISD::VROTRI: {
42845 MVT InnerVT = V.getSimpleValueType();
42846 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42847 SDValue Res = DAG.getNode(Opcode, DL, VT,
42848 DAG.getBitcast(VT, V.getOperand(0)), N1);
42849 Res = DAG.getBitcast(InnerVT, Res);
42850 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42851 return DAG.getBitcast(VT, Res);
42852 }
42853 break;
42854 }
42855 }
42856 }
42857
42858 Mask = getPSHUFShuffleMask(N);
42859 assert(Mask.size() == 4);
42860 break;
42861 }
42862 case X86ISD::MOVSD:
42863 case X86ISD::MOVSH:
42864 case X86ISD::MOVSS: {
42865 SDValue N0 = N.getOperand(0);
42866 SDValue N1 = N.getOperand(1);
42867
42868 // Canonicalize scalar FPOps:
42869 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42870 // If commutable, allow OP(N1[0], N0[0]).
42871 unsigned Opcode1 = N1.getOpcode();
42872 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42873 Opcode1 == ISD::FDIV) {
42874 SDValue N10 = N1.getOperand(0);
42875 SDValue N11 = N1.getOperand(1);
42876 if (N10 == N0 ||
42877 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42878 if (N10 != N0)
42879 std::swap(N10, N11);
42880 MVT SVT = VT.getVectorElementType();
42881 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
42882 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42883 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42884 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42885 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42886 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42887 }
42888 }
42889
42890 return SDValue();
42891 }
42892 case X86ISD::INSERTPS: {
42893 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42894 SDValue Op0 = N.getOperand(0);
42895 SDValue Op1 = N.getOperand(1);
42896 unsigned InsertPSMask = N.getConstantOperandVal(2);
42897 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42898 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42899 unsigned ZeroMask = InsertPSMask & 0xF;
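// For example, an immediate of 0x4A (SrcIdx=1, DstIdx=0, ZeroMask=0b1010)
// produces { Op1[1], 0, Op0[2], 0 }.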
42900
42901 // If we zero out all elements from Op0 then we don't need to reference it.
42902 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42903 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42904 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42905
42906 // If we zero out the element from Op1 then we don't need to reference it.
42907 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42908 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42909 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42910
42911 // Attempt to merge insertps Op1 with an inner target shuffle node.
42912 SmallVector<int, 8> TargetMask1;
42913 SmallVector<SDValue, 2> Ops1;
42914 APInt KnownUndef1, KnownZero1;
42915 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42916 KnownZero1)) {
42917 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42918 // Zero/UNDEF insertion - zero out element and remove dependency.
42919 InsertPSMask |= (1u << DstIdx);
42920 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42921 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42922 }
42923 // Update insertps mask srcidx and reference the source input directly.
42924 int M = TargetMask1[SrcIdx];
42925 assert(0 <= M && M < 8 && "Shuffle index out of range");
42926 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42927 Op1 = Ops1[M < 4 ? 0 : 1];
42928 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42929 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42930 }
42931
42932 // Attempt to merge insertps Op0 with an inner target shuffle node.
42933 SmallVector<int, 8> TargetMask0;
42934 SmallVector<SDValue, 2> Ops0;
42935 APInt KnownUndef0, KnownZero0;
42936 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42937 KnownZero0)) {
42938 bool Updated = false;
42939 bool UseInput00 = false;
42940 bool UseInput01 = false;
42941 for (int i = 0; i != 4; ++i) {
42942 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42943 // No change if element is already zero or the inserted element.
42944 continue;
42945 }
42946
42947 if (KnownUndef0[i] || KnownZero0[i]) {
42948 // If the target mask is undef/zero then we must zero the element.
42949 InsertPSMask |= (1u << i);
42950 Updated = true;
42951 continue;
42952 }
42953
42954 // The input vector element must be inline.
42955 int M = TargetMask0[i];
42956 if (M != i && M != (i + 4))
42957 return SDValue();
42958
42959 // Determine which inputs of the target shuffle we're using.
42960 UseInput00 |= (0 <= M && M < 4);
42961 UseInput01 |= (4 <= M);
42962 }
42963
42964 // If we're not using both inputs of the target shuffle then use the
42965 // referenced input directly.
42966 if (UseInput00 && !UseInput01) {
42967 Updated = true;
42968 Op0 = Ops0[0];
42969 } else if (!UseInput00 && UseInput01) {
42970 Updated = true;
42971 Op0 = Ops0[1];
42972 }
42973
42974 if (Updated)
42975 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42976 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42977 }
42978
42979 // If we're inserting an element from a vbroadcast load, fold the
42980 // load into the X86insertps instruction. We need to convert the scalar
42981 // load to a vector and clear the source lane of the INSERTPS control.
42982 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42983 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42984 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42985 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42986 MemIntr->getBasePtr(),
42987 MemIntr->getMemOperand());
42988 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42989 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42990 Load),
42991 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42992 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42993 return Insert;
42994 }
42995 }
42996
42997 return SDValue();
42998 }
42999 case X86ISD::VPERMV: {
43000 // Combine VPERMV to VPERMV3 if the source operand can be freely split.
43001 SmallVector<int, 32> Mask;
43002 SmallVector<SDValue, 2> SrcOps, SubOps;
43003 SDValue Src = peekThroughBitcasts(N.getOperand(1));
43004 if ((Subtarget.hasVLX() || VT.is512BitVector()) &&
43005 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask) &&
43006 collectConcatOps(Src.getNode(), SubOps, DAG)) {
43007 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43008 assert(SrcOps.size() == 1 && "Unexpected shuffle ops");
43009 assert((SubOps.size() == 2 || SubOps.size() == 4) &&
43010 "Unexpected split ops");
43011 // Bail if we were permuting a widened vector.
43012 if (SubOps[1].isUndef() &&
43013 (SubOps.size() == 2 || (SubOps[2].isUndef() && SubOps[3].isUndef())))
43014 return SDValue();
43015 // Bail if any subops would have folded into the concat.
43016 if (any_of(SubOps, isShuffleFoldableLoad))
43017 return SDValue();
43018 // Concat 4x128 back to 2x256.
43019 if (SubOps.size() == 4) {
43020 SubOps[0] = concatSubVectors(SubOps[0], SubOps[1], DAG, DL);
43021 SubOps[1] = concatSubVectors(SubOps[2], SubOps[3], DAG, DL);
43022 }
43023 // Convert mask to 2 operand shuffle.
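// Indices into the upper half of the original source now live in the second
// operand, so they are offset by HalfElts (e.g. for NumElts=8, index 5
// becomes 9, i.e. element 1 of Hi).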
43024 int HalfElts = NumElts / 2;
43025 for (int &M : Mask)
43026 M += M >= HalfElts ? HalfElts : 0;
43027 SDValue Lo = widenSubVector(SubOps[0], false, Subtarget, DAG, DL,
43028 VT.getSizeInBits());
43029 SDValue Hi = widenSubVector(SubOps[1], false, Subtarget, DAG, DL,
43030 VT.getSizeInBits());
43031 return lowerShuffleWithPERMV(DL, VT, Mask, DAG.getBitcast(VT, Lo),
43032 DAG.getBitcast(VT, Hi), Subtarget, DAG);
43033 }
43034 return SDValue();
43035 }
43036 case X86ISD::VPERMV3: {
43037 MVT WideVT = VT.getDoubleNumVectorElementsVT();
43038 bool CanConcat = VT.is128BitVector() ||
43039 (VT.is256BitVector() && Subtarget.useAVX512Regs());
43040 SmallVector<SDValue, 2> SrcOps;
43041 SmallVector<int, 64> Mask;
43042 if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) {
43043 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43044 SDValue V1 = peekThroughBitcasts(N.getOperand(0));
43045 SDValue V2 = peekThroughBitcasts(N.getOperand(2));
43046 // Canonicalize to VPERMV if both sources are the same.
43047 if (V1 == V2) {
43048 for (int &M : Mask)
43049 M = (M < 0 ? M : (M & (NumElts - 1)));
43050 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(0),
43051 DAG.getUNDEF(VT), Subtarget, DAG);
43052 }
43053 // If sources are half width, then concat and use VPERMV with adjusted
43054 // mask.
43055 SDValue Ops[2];
43056 MVT HalfVT = VT.getHalfNumVectorElementsVT();
43057 if (sd_match(V1,
43059 sd_match(V2,
43061 Ops[0].getValueType() == HalfVT && Ops[1].getValueType() == HalfVT) {
43062 if (SDValue ConcatSrc =
43063 combineConcatVectorOps(DL, VT, Ops, DAG, Subtarget)) {
43064 for (int &M : Mask)
43065 M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
43066 return lowerShuffleWithPERMV(DL, VT, Mask, ConcatSrc,
43067 DAG.getUNDEF(VT), Subtarget, DAG);
43068 }
43069 }
43070 // Commute foldable source to the RHS.
43071 if (isShuffleFoldableLoad(N.getOperand(0)) &&
43072 !isShuffleFoldableLoad(N.getOperand(2))) {
43073 ShuffleVectorSDNode::commuteMask(Mask);
43074 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(2),
43075 N.getOperand(0), Subtarget, DAG);
43076 }
43077 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43078 // freely concatenated, with a commuted shuffle mask.
43079 if (CanConcat) {
43080 if (SDValue ConcatSrc = combineConcatVectorOps(
43081 DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG,
43082 Subtarget)) {
43083 ShuffleVectorSDNode::commuteMask(Mask);
43084 Mask.append(NumElts, SM_SentinelUndef);
43085 SDValue Perm =
43086 lowerShuffleWithPERMV(DL, WideVT, Mask, ConcatSrc,
43087 DAG.getUNDEF(WideVT), Subtarget, DAG);
43088 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43089 DAG.getVectorIdxConstant(0, DL));
43090 }
43091 }
43092 }
43093 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43094 // freely concatenated.
43095 if (CanConcat) {
43096 if (SDValue ConcatSrc = combineConcatVectorOps(
43097 DL, WideVT, {N.getOperand(0), N.getOperand(2)}, DAG, Subtarget)) {
43098 SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
43099 DL, WideVT.getSizeInBits());
43100 SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
43101 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43102 DAG.getVectorIdxConstant(0, DL));
43103 }
43104 }
43105 return SDValue();
43106 }
43107 default:
43108 return SDValue();
43109 }
43110
43111 // Nuke no-op shuffles that show up after combining.
43112 if (isNoopShuffleMask(Mask))
43113 return N.getOperand(0);
43114
43115 // Look for simplifications involving one or two shuffle instructions.
43116 SDValue V = N.getOperand(0);
43117 switch (N.getOpcode()) {
43118 default:
43119 break;
43120 case X86ISD::PSHUFLW:
43121 case X86ISD::PSHUFHW:
43122 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
43123
43124 // See if this reduces to a PSHUFD which is no more expensive and can
43125 // combine with more operations. Note that it has to at least flip the
43126 // dwords as otherwise it would have been removed as a no-op.
43127 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
43128 int DMask[] = {0, 1, 2, 3};
43129 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
43130 DMask[DOffset + 0] = DOffset + 1;
43131 DMask[DOffset + 1] = DOffset + 0;
43132 MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
43133 V = DAG.getBitcast(DVT, V);
43134 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
43135 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
43136 return DAG.getBitcast(VT, V);
43137 }
43138
43139 // Look for shuffle patterns which can be implemented as a single unpack.
43140 // FIXME: This doesn't handle the location of the PSHUFD generically, and
43141 // only works when we have a PSHUFD followed by two half-shuffles.
43142 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
43143 (V.getOpcode() == X86ISD::PSHUFLW ||
43144 V.getOpcode() == X86ISD::PSHUFHW) &&
43145 V.getOpcode() != N.getOpcode() &&
43146 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
43147 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
43148 if (D.getOpcode() == X86ISD::PSHUFD) {
43149 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
43150 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
43151 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43152 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43153 int WordMask[8];
43154 for (int i = 0; i < 4; ++i) {
43155 WordMask[i + NOffset] = Mask[i] + NOffset;
43156 WordMask[i + VOffset] = VMask[i] + VOffset;
43157 }
43158 // Map the word mask through the DWord mask.
43159 int MappedMask[8];
43160 for (int i = 0; i < 8; ++i)
43161 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
43162 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
43163 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
43164 // We can replace all three shuffles with an unpack.
43165 V = DAG.getBitcast(VT, D.getOperand(0));
43166 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
43167 : X86ISD::UNPCKH,
43168 DL, VT, V, V);
43169 }
43170 }
43171 }
43172
43173 break;
43174
43175 case X86ISD::PSHUFD:
43176 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
43177 return NewN;
43178
43179 break;
43180 }
43181
43182 return SDValue();
43183}
43184
43185/// Checks if the shuffle mask takes subsequent elements
43186/// alternately from two vectors.
43187/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
43188static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
43189
43190 int ParitySrc[2] = {-1, -1};
43191 unsigned Size = Mask.size();
43192 for (unsigned i = 0; i != Size; ++i) {
43193 int M = Mask[i];
43194 if (M < 0)
43195 continue;
43196
43197 // Make sure we are using the matching element from the input.
43198 if ((M % Size) != i)
43199 return false;
43200
43201 // Make sure we use the same input for all elements of the same parity.
43202 int Src = M / Size;
43203 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
43204 return false;
43205 ParitySrc[i % 2] = Src;
43206 }
43207
43208 // Make sure each input is used.
43209 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
43210 return false;
43211
43212 Op0Even = ParitySrc[0] == 0;
43213 return true;
43214}
43215
43216/// Returns true iff the shuffle node \p N can be replaced with an ADDSUB
43217/// (SUBADD) operation. If true is returned then the operands of the ADDSUB
43218/// (SUBADD) operation are written to the parameters \p Opnd0 and \p Opnd1.
43219///
43220/// We combine the shuffle to ADDSUB (SUBADD) directly on the abstract vector
43221/// shuffle nodes so it is easier to match generically. We also insert dummy
43222/// vector shuffle nodes for the operands which explicitly discard the unused
43223/// lanes, so that the fact that they are unused can flow through the rest of
43224/// the combiner.
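///
/// For example, shuffle(fsub(a,b), fadd(a,b), <0,5,2,7>) takes the even lanes
/// from the FSUB and the odd lanes from the FADD, which is exactly ADDSUB(a,b).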
43225static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
43226 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
43227 bool &IsSubAdd, bool &HasAllowContract) {
43228
43229 EVT VT = N->getValueType(0);
43230 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43231 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
43232 !VT.getSimpleVT().isFloatingPoint())
43233 return false;
43234
43235 // We only handle target-independent shuffles.
43236 // FIXME: It would be easy and harmless to use the target shuffle mask
43237 // extraction tool to support more.
43238 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43239 return false;
43240
43241 SDValue V1 = N->getOperand(0);
43242 SDValue V2 = N->getOperand(1);
43243
43244 // Make sure we have an FADD and an FSUB.
43245 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
43246 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
43247 V1.getOpcode() == V2.getOpcode())
43248 return false;
43249
43250 // If there are other uses of these operations we can't fold them.
43251 if (!V1->hasOneUse() || !V2->hasOneUse())
43252 return false;
43253
43254 // Ensure that both operations have the same operands. Note that we can
43255 // commute the FADD operands.
43256 SDValue LHS, RHS;
43257 if (V1.getOpcode() == ISD::FSUB) {
43258 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
43259 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
43260 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
43261 return false;
43262 } else {
43263 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
43264 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
43265 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
43266 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
43267 return false;
43268 }
43269
43270 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43271 bool Op0Even;
43272 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43273 return false;
43274
43275 // It's a subadd if the vector in the even parity is an FADD.
43276 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
43277 : V2->getOpcode() == ISD::FADD;
43278 HasAllowContract =
43279 V1->getFlags().hasAllowContract() && V2->getFlags().hasAllowContract();
43280
43281 Opnd0 = LHS;
43282 Opnd1 = RHS;
43283 return true;
43284}
43285
43286/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
43287static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL,
43288 const X86Subtarget &Subtarget,
43289 SelectionDAG &DAG) {
43290 // We only handle target-independent shuffles.
43291 // FIXME: It would be easy and harmless to use the target shuffle mask
43292 // extraction tool to support more.
43293 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43294 return SDValue();
43295
43296 MVT VT = N->getSimpleValueType(0);
43297 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43298 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
43299 return SDValue();
43300
43301 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
43302 SDValue Op0 = N->getOperand(0);
43303 SDValue Op1 = N->getOperand(1);
43304 SDValue FMAdd = Op0, FMSub = Op1;
43305 if (FMSub.getOpcode() != X86ISD::FMSUB)
43306 std::swap(FMAdd, FMSub);
43307
43308 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
43309 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
43310 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
43311 FMAdd.getOperand(2) != FMSub.getOperand(2))
43312 return SDValue();
43313
43314 // Check for correct shuffle mask.
43315 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43316 bool Op0Even;
43317 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43318 return SDValue();
43319
43320 // FMAddSub takes the zeroth operand from the FMSub node.
43321 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
43322 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43323 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
43324 FMAdd.getOperand(2));
43325}
43326
43327/// Try to combine a shuffle into a target-specific add-sub or
43328/// mul-add-sub node.
43330 const X86Subtarget &Subtarget,
43331 SelectionDAG &DAG) {
43332 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
43333 return V;
43334
43335 SDValue Opnd0, Opnd1;
43336 bool IsSubAdd;
43337 bool HasAllowContract;
43338 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd,
43339 HasAllowContract))
43340 return SDValue();
43341
43342 MVT VT = N->getSimpleValueType(0);
43343
43344 // Try to generate X86ISD::FMADDSUB node here.
43345 SDValue Opnd2;
43346 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2,
43347 HasAllowContract)) {
43348 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43349 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
43350 }
43351
43352 if (IsSubAdd)
43353 return SDValue();
43354
43355 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
43356 // the ADDSUB idiom has been successfully recognized. There are no known
43357 // X86 targets with 512-bit ADDSUB instructions!
43358 if (VT.is512BitVector())
43359 return SDValue();
43360
43361 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
43362 // the ADDSUB idiom has been successfully recognized. There are no known
43363 // X86 targets with FP16 ADDSUB instructions!
43364 if (VT.getVectorElementType() == MVT::f16)
43365 return SDValue();
43366
43367 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
43368}
43369
43370/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
43371/// low half of each source vector and does not set any high half elements in
43372/// the destination vector, narrow the shuffle to half its original size.
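///
/// Illustrative example (v8i32 %a, %b): a shuffle with mask
/// <0, 1, 9, 8, undef, undef, undef, undef> only reads the low v4i32 half of
/// each source and leaves the high half of the result undef, so it can be
/// rewritten as a v4i32 shuffle of the extracted low halves and re-widened
/// with undef upper elements via free subregister ops.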
43374 EVT VT = Shuf->getValueType(0);
43375 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
43376 return SDValue();
43377 if (!VT.is256BitVector() && !VT.is512BitVector())
43378 return SDValue();
43379
43380 // See if we can ignore all of the high elements of the shuffle.
43381 ArrayRef<int> Mask = Shuf->getMask();
43382 if (!isUndefUpperHalf(Mask))
43383 return SDValue();
43384
43385 // Check if the shuffle mask accesses only the low half of each input vector
43386 // (half-index output is 0 or 2).
43387 int HalfIdx1, HalfIdx2;
43388 SmallVector<int, 8> HalfMask(Mask.size() / 2);
43389 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
43390 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
43391 return SDValue();
43392
43393 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
43394 // The trick is knowing that all of the insert/extract are actually free
43395 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
43396 // of narrow inputs into a narrow output, and that is always cheaper than
43397 // the wide shuffle that we started with.
43398 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
43399 Shuf->getOperand(1), HalfMask, HalfIdx1,
43400 HalfIdx2, false, DAG, /*UseConcat*/ true);
43401}
43402
43405 const X86Subtarget &Subtarget) {
43406 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
43407 if (SDValue V = narrowShuffle(Shuf, DAG))
43408 return V;
43409
43410 // If we have legalized the vector types, look for blends of FADD and FSUB
43411 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
43412 SDLoc dl(N);
43413 EVT VT = N->getValueType(0);
43414 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43415 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
43416 if (SDValue AddSub =
43417 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
43418 return AddSub;
43419
43420 // Attempt to combine into a vector load/broadcast.
43422 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
43423 return LD;
43424
43425 if (isTargetShuffle(N->getOpcode())) {
43426 SDValue Op(N, 0);
43427 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
43428 return Shuffle;
43429
43430 // Try recursively combining arbitrary sequences of x86 shuffle
43431 // instructions into higher-order shuffles. We do this after combining
43432 // specific PSHUF instruction sequences into their minimal form so that we
43433 // can evaluate how many specialized shuffle instructions are involved in
43434 // a particular chain.
43435 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43436 return Res;
43437
43438 // Simplify source operands based on shuffle mask.
43439 // TODO - merge this into combineX86ShufflesRecursively.
43440 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
43441 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
43442 return SDValue(N, 0);
43443
43444 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
43445 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
43446 // Perform this after other shuffle combines to allow inner shuffles to be
43447 // combined away first.
43448 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
43449 return BinOp;
43450 }
43451
43452 return SDValue();
43453}
43454
43455// Simplify variable target shuffle masks based on the demanded elements.
43456// TODO: Handle DemandedBits in mask indices as well?
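// For example (illustrative), if only the low elements of a PSHUFB are
// demanded and its mask is a constant-pool load, the undemanded mask entries
// can be rewritten to undef and a fresh, simpler constant-pool entry emitted
// for the load.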
43458 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
43459 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
43460 // If we're demanding all elements, don't bother trying to simplify the mask.
43461 unsigned NumElts = DemandedElts.getBitWidth();
43462 if (DemandedElts.isAllOnes())
43463 return false;
43464
43465 SDValue Mask = Op.getOperand(MaskIndex);
43466 if (!Mask.hasOneUse())
43467 return false;
43468
43469 // Attempt to generically simplify the variable shuffle mask.
43470 APInt MaskUndef, MaskZero;
43471 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
43472 Depth + 1))
43473 return true;
43474
43475 // Attempt to extract+simplify a (constant pool load) shuffle mask.
43476 // TODO: Support other types from getTargetShuffleMaskIndices?
43478 EVT BCVT = BC.getValueType();
43479 auto *Load = dyn_cast<LoadSDNode>(BC);
43480 if (!Load || !Load->getBasePtr().hasOneUse())
43481 return false;
43482
43483 const Constant *C = getTargetConstantFromNode(Load);
43484 if (!C)
43485 return false;
43486
43487 Type *CTy = C->getType();
43488 if (!CTy->isVectorTy() ||
43489 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
43490 return false;
43491
43492 // Handle scaling for i64 elements on 32-bit targets.
43493 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
43494 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
43495 return false;
43496 unsigned Scale = NumCstElts / NumElts;
43497
43498 // Simplify mask if we have an undemanded element that is not undef.
43499 bool Simplified = false;
43500 SmallVector<Constant *, 32> ConstVecOps;
43501 for (unsigned i = 0; i != NumCstElts; ++i) {
43502 Constant *Elt = C->getAggregateElement(i);
43503 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
43504 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
43505 Simplified = true;
43506 continue;
43507 }
43508 ConstVecOps.push_back(Elt);
43509 }
43510 if (!Simplified)
43511 return false;
43512
43513 // Generate new constant pool entry + legalize immediately for the load.
43514 SDLoc DL(Op);
43515 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
43516 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
43517 SDValue NewMask = TLO.DAG.getLoad(
43518 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
43520 Load->getAlign());
43521 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
43522}
43523
43525 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
43526 TargetLoweringOpt &TLO, unsigned Depth) const {
43527 int NumElts = DemandedElts.getBitWidth();
43528 unsigned Opc = Op.getOpcode();
43529 EVT VT = Op.getValueType();
43530
43531 // Handle special case opcodes.
43532 switch (Opc) {
43533 case X86ISD::PMULDQ:
43534 case X86ISD::PMULUDQ: {
43535 APInt LHSUndef, LHSZero;
43536 APInt RHSUndef, RHSZero;
43537 SDValue LHS = Op.getOperand(0);
43538 SDValue RHS = Op.getOperand(1);
43539 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43540 Depth + 1))
43541 return true;
43542 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43543 Depth + 1))
43544 return true;
43545 // Multiply by zero.
43546 KnownZero = LHSZero | RHSZero;
43547 break;
43548 }
43549 case X86ISD::VPMADDUBSW:
43550 case X86ISD::VPMADDWD: {
43551 APInt LHSUndef, LHSZero;
43552 APInt RHSUndef, RHSZero;
43553 SDValue LHS = Op.getOperand(0);
43554 SDValue RHS = Op.getOperand(1);
43555 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
43556
43557 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
43558 Depth + 1))
43559 return true;
43560 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
43561 Depth + 1))
43562 return true;
43563
43564 // TODO: Multiply by zero.
43565
43566 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
43567 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
43568 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
43569 Depth + 1))
43570 return true;
43571 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
43572 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
43573 Depth + 1))
43574 return true;
43575 break;
43576 }
43577 case X86ISD::PSADBW: {
43578 SDValue LHS = Op.getOperand(0);
43579 SDValue RHS = Op.getOperand(1);
43580 assert(VT.getScalarType() == MVT::i64 &&
43581 LHS.getValueType() == RHS.getValueType() &&
43582 LHS.getValueType().getScalarType() == MVT::i8 &&
43583 "Unexpected PSADBW types");
43584
43585 // Aggressively peek through ops to get at the demanded elts.
43586 if (!DemandedElts.isAllOnes()) {
43587 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
43588 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
43590 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43592 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43593 if (NewLHS || NewRHS) {
43594 NewLHS = NewLHS ? NewLHS : LHS;
43595 NewRHS = NewRHS ? NewRHS : RHS;
43596 return TLO.CombineTo(
43597 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43598 }
43599 }
43600 break;
43601 }
43602 case X86ISD::VSHL:
43603 case X86ISD::VSRL:
43604 case X86ISD::VSRA: {
43605 // We only need the bottom 64-bits of the (128-bit) shift amount.
43606 SDValue Amt = Op.getOperand(1);
43607 MVT AmtVT = Amt.getSimpleValueType();
43608 assert(AmtVT.is128BitVector() && "Unexpected value type");
43609
43610 // If the shift amount is only ever reused as an sse shift amount then we
43611 // know that only the bottom 64-bits are ever used.
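// (For example, psllq/psrlq style shifts read only bits [63:0] of their xmm
// count operand, so when every user is such a shift the upper half of Amt
// can be simplified away.)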
43612 bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) {
43613 unsigned UseOpc = Use->getOpcode();
43614 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43615 UseOpc == X86ISD::VSRA) &&
43616 Use->getOperand(0) != Amt;
43617 });
43618
43619 APInt AmtUndef, AmtZero;
43620 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43621 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43622 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43623 Depth + 1, AssumeSingleUse))
43624 return true;
43625 [[fallthrough]];
43626 }
43627 case X86ISD::VSHLI:
43628 case X86ISD::VSRLI:
43629 case X86ISD::VSRAI: {
43630 SDValue Src = Op.getOperand(0);
43631 APInt SrcUndef;
43632 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43633 Depth + 1))
43634 return true;
43635
43636 // Fold shift(0,x) -> 0
43637 if (DemandedElts.isSubsetOf(KnownZero))
43638 return TLO.CombineTo(
43639 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43640
43641 // Aggressively peek through ops to get at the demanded elts.
43642 if (!DemandedElts.isAllOnes())
43644 Src, DemandedElts, TLO.DAG, Depth + 1))
43645 return TLO.CombineTo(
43646 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43647 break;
43648 }
43649 case X86ISD::VPSHA:
43650 case X86ISD::VPSHL:
43651 case X86ISD::VSHLV:
43652 case X86ISD::VSRLV:
43653 case X86ISD::VSRAV: {
43654 APInt LHSUndef, LHSZero;
43655 APInt RHSUndef, RHSZero;
43656 SDValue LHS = Op.getOperand(0);
43657 SDValue RHS = Op.getOperand(1);
43658 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43659 Depth + 1))
43660 return true;
43661
43662 // Fold shift(0,x) -> 0
43663 if (DemandedElts.isSubsetOf(LHSZero))
43664 return TLO.CombineTo(
43665 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43666
43667 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43668 Depth + 1))
43669 return true;
43670
43671 KnownZero = LHSZero;
43672 break;
43673 }
43674 case X86ISD::CMPM:
43675 case X86ISD::CMPP: {
43676 // Scalarize packed fp comparison if we only require element 0.
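// For example (illustrative): a v4f32 CMPP whose result is only consumed in
// lane 0 becomes a single scalar X86ISD::FSETCC compare that is then placed
// back into a vector with SCALAR_TO_VECTOR.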
43677 if (DemandedElts == 1) {
43678 SDLoc dl(Op);
43679 MVT VT = Op.getSimpleValueType();
43680 MVT OpSVT = Op.getOperand(0).getSimpleValueType().getScalarType();
43681 SDValue LHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(0), 0);
43682 SDValue RHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(1), 0);
43683 SDValue CC = Op.getOperand(2);
43684 if (Opc == X86ISD::CMPM) {
43685 SDValue Cmp =
43686 TLO.DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, CC);
43687 return TLO.CombineTo(
43688 Op, TLO.DAG.getInsertSubvector(dl, TLO.DAG.getUNDEF(VT), Cmp, 0));
43689 }
43690 SDValue Cmp = TLO.DAG.getNode(X86ISD::FSETCC, dl, OpSVT, LHS, RHS, CC);
43691 return TLO.CombineTo(Op,
43692 TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Cmp));
43693 }
43694 break;
43695 }
43696 case X86ISD::PCMPEQ:
43697 case X86ISD::PCMPGT: {
43698 APInt LHSUndef, LHSZero;
43699 APInt RHSUndef, RHSZero;
43700 SDValue LHS = Op.getOperand(0);
43701 SDValue RHS = Op.getOperand(1);
43702 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43703 Depth + 1))
43704 return true;
43705 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43706 Depth + 1))
43707 return true;
43708 break;
43709 }
43710 case X86ISD::KSHIFTL: {
43711 SDValue Src = Op.getOperand(0);
43712 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43713 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43714 unsigned ShiftAmt = Amt->getZExtValue();
43715
43716 if (ShiftAmt == 0)
43717 return TLO.CombineTo(Op, Src);
43718
43719 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43720 // single shift. We can do this if the bottom bits (which are shifted
43721 // out) are never demanded.
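// For example (illustrative, v16i1): kshiftl(kshiftr(X, 2), 5) becomes
// kshiftl(X, 3) when the low 5 lanes of the result are not demanded, since
// Diff = 5 - 2 = 3 and the lanes cleared by the inner kshiftr are never
// observed.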
43722 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43723 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43724 unsigned C1 = Src.getConstantOperandVal(1);
43725 unsigned NewOpc = X86ISD::KSHIFTL;
43726 int Diff = ShiftAmt - C1;
43727 if (Diff < 0) {
43728 Diff = -Diff;
43729 NewOpc = X86ISD::KSHIFTR;
43730 }
43731
43732 SDLoc dl(Op);
43733 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43734 return TLO.CombineTo(
43735 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43736 }
43737 }
43738
43739 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43740 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43741 Depth + 1))
43742 return true;
43743
43744 KnownUndef <<= ShiftAmt;
43745 KnownZero <<= ShiftAmt;
43746 KnownZero.setLowBits(ShiftAmt);
43747 break;
43748 }
43749 case X86ISD::KSHIFTR: {
43750 SDValue Src = Op.getOperand(0);
43751 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43752 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43753 unsigned ShiftAmt = Amt->getZExtValue();
43754
43755 if (ShiftAmt == 0)
43756 return TLO.CombineTo(Op, Src);
43757
43758 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43759 // single shift. We can do this if the top bits (which are shifted
43760 // out) are never demanded.
43761 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43762 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43763 unsigned C1 = Src.getConstantOperandVal(1);
43764 unsigned NewOpc = X86ISD::KSHIFTR;
43765 int Diff = ShiftAmt - C1;
43766 if (Diff < 0) {
43767 Diff = -Diff;
43768 NewOpc = X86ISD::KSHIFTL;
43769 }
43770
43771 SDLoc dl(Op);
43772 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43773 return TLO.CombineTo(
43774 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43775 }
43776 }
43777
43778 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43779 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43780 Depth + 1))
43781 return true;
43782
43783 KnownUndef.lshrInPlace(ShiftAmt);
43784 KnownZero.lshrInPlace(ShiftAmt);
43785 KnownZero.setHighBits(ShiftAmt);
43786 break;
43787 }
43788 case X86ISD::ANDNP: {
43789 // ANDNP = (~LHS & RHS);
43790 SDValue LHS = Op.getOperand(0);
43791 SDValue RHS = Op.getOperand(1);
43792
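// When one operand is constant we can narrow what is demanded from the
// other: LHS lanes only matter where the constant RHS lane is non-zero, and
// RHS lanes only matter where the constant LHS lane is not all-ones (since
// ~LHS would be zero there). GetDemandedMasks below computes those masks.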
43793 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43794 APInt UndefElts;
43795 SmallVector<APInt> EltBits;
43796 int NumElts = VT.getVectorNumElements();
43797 int EltSizeInBits = VT.getScalarSizeInBits();
43798 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43799 APInt OpElts = DemandedElts;
43800 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43801 EltBits)) {
43802 OpBits.clearAllBits();
43803 OpElts.clearAllBits();
43804 for (int I = 0; I != NumElts; ++I) {
43805 if (!DemandedElts[I])
43806 continue;
43807 if (UndefElts[I]) {
43808 // We can't assume an undef src element gives an undef dst - the
43809 // other src might be zero.
43810 OpBits.setAllBits();
43811 OpElts.setBit(I);
43812 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43813 (!Invert && !EltBits[I].isZero())) {
43814 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43815 OpElts.setBit(I);
43816 }
43817 }
43818 }
43819 return std::make_pair(OpBits, OpElts);
43820 };
43821 APInt BitsLHS, EltsLHS;
43822 APInt BitsRHS, EltsRHS;
43823 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43824 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43825
43826 APInt LHSUndef, LHSZero;
43827 APInt RHSUndef, RHSZero;
43828 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43829 Depth + 1))
43830 return true;
43831 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43832 Depth + 1))
43833 return true;
43834
43835 if (!DemandedElts.isAllOnes()) {
43836 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43837 TLO.DAG, Depth + 1);
43838 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43839 TLO.DAG, Depth + 1);
43840 if (NewLHS || NewRHS) {
43841 NewLHS = NewLHS ? NewLHS : LHS;
43842 NewRHS = NewRHS ? NewRHS : RHS;
43843 return TLO.CombineTo(
43844 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43845 }
43846 }
43847 break;
43848 }
43849 case X86ISD::CVTSI2P:
43850 case X86ISD::CVTUI2P:
43851 case X86ISD::CVTPH2PS:
43852 case X86ISD::CVTPS2PH: {
43853 SDValue Src = Op.getOperand(0);
43854 EVT SrcVT = Src.getValueType();
43855 APInt SrcUndef, SrcZero;
43856 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43857 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43858 Depth + 1))
43859 return true;
43860 break;
43861 }
43862 case X86ISD::PACKSS:
43863 case X86ISD::PACKUS: {
43864 SDValue N0 = Op.getOperand(0);
43865 SDValue N1 = Op.getOperand(1);
43866
43867 APInt DemandedLHS, DemandedRHS;
43868 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43869
43870 APInt LHSUndef, LHSZero;
43871 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43872 Depth + 1))
43873 return true;
43874 APInt RHSUndef, RHSZero;
43875 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43876 Depth + 1))
43877 return true;
43878
43879 // TODO - pass on known zero/undef.
43880
43881 // Aggressively peek through ops to get at the demanded elts.
43882 // TODO - we should do this for all target/faux shuffle ops.
43883 if (!DemandedElts.isAllOnes()) {
43884 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43885 TLO.DAG, Depth + 1);
43886 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43887 TLO.DAG, Depth + 1);
43888 if (NewN0 || NewN1) {
43889 NewN0 = NewN0 ? NewN0 : N0;
43890 NewN1 = NewN1 ? NewN1 : N1;
43891 return TLO.CombineTo(Op,
43892 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43893 }
43894 }
43895 break;
43896 }
43897 case X86ISD::HADD:
43898 case X86ISD::HSUB:
43899 case X86ISD::FHADD:
43900 case X86ISD::FHSUB: {
43901 SDValue N0 = Op.getOperand(0);
43902 SDValue N1 = Op.getOperand(1);
43903
43904 APInt DemandedLHS, DemandedRHS;
43905 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43906
43907 APInt LHSUndef, LHSZero;
43908 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43909 Depth + 1))
43910 return true;
43911 APInt RHSUndef, RHSZero;
43912 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43913 Depth + 1))
43914 return true;
43915
43916 // TODO - pass on known zero/undef.
43917
43918 // Aggressively peek through ops to get at the demanded elts.
43919 // TODO: Handle repeated operands.
43920 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43921 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43922 TLO.DAG, Depth + 1);
43923 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43924 TLO.DAG, Depth + 1);
43925 if (NewN0 || NewN1) {
43926 NewN0 = NewN0 ? NewN0 : N0;
43927 NewN1 = NewN1 ? NewN1 : N1;
43928 return TLO.CombineTo(Op,
43929 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43930 }
43931 }
43932 break;
43933 }
43934 case X86ISD::VTRUNC:
43935 case X86ISD::VTRUNCS:
43936 case X86ISD::VTRUNCUS: {
43937 SDValue Src = Op.getOperand(0);
43938 MVT SrcVT = Src.getSimpleValueType();
43939 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43940 APInt SrcUndef, SrcZero;
43941 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43942 Depth + 1))
43943 return true;
43944 KnownZero = SrcZero.zextOrTrunc(NumElts);
43945 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43946 break;
43947 }
43948 case X86ISD::BLENDI: {
43949 SmallVector<int, 16> BlendMask;
43950 DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
43952 VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
43953 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
43954 return TLO.CombineTo(Op, R);
43955 break;
43956 }
43957 case X86ISD::BLENDV: {
43958 APInt SelUndef, SelZero;
43959 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43960 SelZero, TLO, Depth + 1))
43961 return true;
43962
43963 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43964 APInt LHSUndef, LHSZero;
43965 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43966 LHSZero, TLO, Depth + 1))
43967 return true;
43968
43969 APInt RHSUndef, RHSZero;
43970 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43971 RHSZero, TLO, Depth + 1))
43972 return true;
43973
43974 KnownZero = LHSZero & RHSZero;
43975 KnownUndef = LHSUndef & RHSUndef;
43976 break;
43977 }
43978 case X86ISD::VZEXT_MOVL: {
43979 // If upper demanded elements are already zero then we have nothing to do.
43980 SDValue Src = Op.getOperand(0);
43981 APInt DemandedUpperElts = DemandedElts;
43982 DemandedUpperElts.clearLowBits(1);
43983 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43984 return TLO.CombineTo(Op, Src);
43985 break;
43986 }
43987 case X86ISD::VZEXT_LOAD: {
43988 // If the upper elements are not demanded then simplify to a
43989 // scalar_to_vector(load()).
43991 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
43992 SDLoc DL(Op);
43993 auto *Mem = cast<MemSDNode>(Op);
43994 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
43995 Mem->getMemOperand());
43996 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
43997 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
43998 }
43999 break;
44000 }
44001 case X86ISD::VBROADCAST: {
44002 SDValue Src = Op.getOperand(0);
44003 MVT SrcVT = Src.getSimpleValueType();
44004 // Don't bother broadcasting if we just need the 0'th element.
44005 if (DemandedElts == 1) {
44006 if (!SrcVT.isVector())
44007 Src = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op), VT, Src);
44008 else if (Src.getValueType() != VT)
44009 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
44010 SDLoc(Op));
44011 return TLO.CombineTo(Op, Src);
44012 }
44013 if (!SrcVT.isVector())
44014 break;
44015 APInt SrcUndef, SrcZero;
44016 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
44017 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
44018 Depth + 1))
44019 return true;
44020 // Aggressively peek through src to get at the demanded elt.
44021 // TODO - we should do this for all target/faux shuffle ops.
44023 Src, SrcElts, TLO.DAG, Depth + 1))
44024 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44025 break;
44026 }
44027 case X86ISD::VPERMV:
44028 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
44029 Depth))
44030 return true;
44031 break;
44032 case X86ISD::PSHUFB:
44033 case X86ISD::VPERMV3:
44034 case X86ISD::VPERMILPV:
44035 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
44036 Depth))
44037 return true;
44038 break;
44039 case X86ISD::VPPERM:
44040 case X86ISD::VPERMIL2:
44041 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
44042 Depth))
44043 return true;
44044 break;
44045 }
44046
44047 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
44048 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
44049 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
44050 if ((VT.is256BitVector() || VT.is512BitVector()) &&
44051 DemandedElts.lshr(NumElts / 2) == 0) {
44052 unsigned SizeInBits = VT.getSizeInBits();
44053 unsigned ExtSizeInBits = SizeInBits / 2;
44054
44055 // See if 512-bit ops only use the bottom 128-bits.
44056 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
44057 ExtSizeInBits = SizeInBits / 4;
44058
44059 switch (Opc) {
44060 // Scalar broadcast.
44061 case X86ISD::VBROADCAST: {
44062 SDLoc DL(Op);
44063 SDValue Src = Op.getOperand(0);
44064 if (Src.getValueSizeInBits() > ExtSizeInBits)
44065 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
44066 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44067 ExtSizeInBits / VT.getScalarSizeInBits());
44068 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
44069 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44070 TLO.DAG, DL, ExtSizeInBits));
44071 }
44073 SDLoc DL(Op);
44074 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44075 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44076 ExtSizeInBits / VT.getScalarSizeInBits());
44077 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
44078 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
44079 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
44080 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
44081 MemIntr->getMemOperand());
44083 Bcst.getValue(1));
44084 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44085 TLO.DAG, DL, ExtSizeInBits));
44086 }
44087 // Subvector broadcast.
44089 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44090 EVT MemVT = MemIntr->getMemoryVT();
44091 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
44092 SDLoc DL(Op);
44093 SDValue Ld =
44094 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
44095 MemIntr->getBasePtr(), MemIntr->getMemOperand());
44097 Ld.getValue(1));
44098 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
44099 TLO.DAG, DL, ExtSizeInBits));
44100 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
44101 SDLoc DL(Op);
44102 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44103 ExtSizeInBits / VT.getScalarSizeInBits());
44104 if (SDValue BcstLd =
44105 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
44106 return TLO.CombineTo(Op,
44107 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
44108 TLO.DAG, DL, ExtSizeInBits));
44109 }
44110 break;
44111 }
44112 // Byte shifts by immediate.
44113 case X86ISD::VSHLDQ:
44114 case X86ISD::VSRLDQ:
44115 // Shift by uniform.
44116 case X86ISD::VSHL:
44117 case X86ISD::VSRL:
44118 case X86ISD::VSRA:
44119 // Shift by immediate.
44120 case X86ISD::VSHLI:
44121 case X86ISD::VSRLI:
44122 case X86ISD::VSRAI: {
44123 SDLoc DL(Op);
44124 SDValue Ext0 =
44125 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
44126 SDValue ExtOp =
44127 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
44128 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44129 SDValue Insert =
44130 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44131 return TLO.CombineTo(Op, Insert);
44132 }
44133 case X86ISD::VPERMI: {
44134 // Simplify 256-bit PERMPD/PERMQ to extract_subvector.
44135 // TODO: This should be done in shuffle combining.
44136 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
44138 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
44139 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
44140 SDLoc DL(Op);
44141 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
44142 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44143 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
44144 return TLO.CombineTo(Op, Insert);
44145 }
44146 }
44147 // Simplify 512-bit PERMPD/PERMQ to 256-bit variant on lower half.
44148 if (VT == MVT::v8f64 || VT == MVT::v8i64) {
44149 SDLoc DL(Op);
44150 SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, 256);
44151 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0,
44152 Op.getOperand(1));
44153 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44154 SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, 256);
44155 return TLO.CombineTo(Op, Insert);
44156 }
44157 break;
44158 }
44159 case X86ISD::VPERMV: {
44162 // We can always split v16i32/v16f32 AVX512 to v8i32/v8f32 AVX2 variants.
44163 if ((VT.is256BitVector() || Subtarget.hasVLX() || VT == MVT::v16i32 ||
44164 VT == MVT::v16f32) &&
44165 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44166 // For lane-crossing shuffles, only split in half in case we're still
44167 // referencing higher elements.
44168 unsigned HalfElts = NumElts / 2;
44169 unsigned HalfSize = SizeInBits / 2;
44170 Mask.resize(HalfElts);
44171 if (all_of(Mask,
44172 [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
44174 SDLoc DL(Op);
44175 SDValue Ext;
44176 SDValue M =
44177 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
44178 SDValue V =
44179 extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
44180 // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
44181 if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
44182 Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
44183 else {
44185 MVT ShufVT = HalfVT.changeVectorElementType(ShufSVT);
44186 Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, ShufVT,
44187 TLO.DAG.getBitcast(ShufVT, V), M);
44188 Ext = TLO.DAG.getBitcast(HalfVT, Ext);
44189 }
44190 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44191 Subtarget, TLO.DAG, DL, SizeInBits);
44192 return TLO.CombineTo(Op, Insert);
44193 }
44194 }
44195 break;
44196 }
44197 case X86ISD::VPERMV3: {
44200 if (Subtarget.hasVLX() &&
44201 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44202 // For lane-crossing shuffles, only split in half in case we're still
44203 // referencing higher elements.
44204 unsigned HalfElts = NumElts / 2;
44205 unsigned HalfSize = SizeInBits / 2;
44206 Mask.resize(HalfElts);
44207 if (all_of(Mask, [&](int M) {
44208 return isUndefOrInRange(M, 0, HalfElts) ||
44209 isUndefOrInRange(M, NumElts, NumElts + HalfElts);
44210 })) {
44211 // Adjust mask elements for 2nd operand to point to half width.
44212 for (int &M : Mask)
44213 M = (M < NumElts) ? M : (M - HalfElts);
44215 MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
44216 SDLoc DL(Op);
44217 SDValue Ext = TLO.DAG.getNode(
44218 Opc, DL, HalfVT,
44219 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
44220 getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
44221 extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
44222 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44223 Subtarget, TLO.DAG, DL, SizeInBits);
44224 return TLO.CombineTo(Op, Insert);
44225 }
44226 }
44227 break;
44228 }
44229 case X86ISD::VPERM2X128: {
44230 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
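// Only the low result lane is demanded here, so just the low nibble of the
// immediate matters: bit 3 zeroes the lane, bit 1 selects the source operand
// and bit 0 selects which 128-bit half of that source is taken.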
44231 SDLoc DL(Op);
44232 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
44233 if (LoMask & 0x8)
44234 return TLO.CombineTo(
44235 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
44236 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
44237 unsigned SrcIdx = (LoMask & 0x2) >> 1;
44238 SDValue ExtOp =
44239 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
44240 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44241 SDValue Insert =
44242 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44243 return TLO.CombineTo(Op, Insert);
44244 }
44245 // Conversions.
44246 // TODO: Add more CVT opcodes when we have test coverage.
44247 case X86ISD::CVTTP2UI: {
44248 if (!Subtarget.hasVLX())
44249 break;
44250 [[fallthrough]];
44251 }
44252 case X86ISD::CVTTP2SI: {
44253 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 &&
44254 !Subtarget.hasVLX())
44255 break;
44256 [[fallthrough]];
44257 }
44258 case X86ISD::CVTPH2PS: {
44259 SDLoc DL(Op);
44260 unsigned Scale = SizeInBits / ExtSizeInBits;
44261 SDValue SrcOp = Op.getOperand(0);
44262 MVT SrcVT = SrcOp.getSimpleValueType();
44263 unsigned SrcExtSize =
44264 std::max<unsigned>(SrcVT.getSizeInBits() / Scale, 128);
44266 ExtSizeInBits / VT.getScalarSizeInBits());
44267 SDValue ExtOp = TLO.DAG.getNode(
44268 Opc, DL, ExtVT, extractSubVector(SrcOp, 0, TLO.DAG, DL, SrcExtSize));
44269 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44270 SDValue Insert =
44271 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44272 return TLO.CombineTo(Op, Insert);
44273 }
44274 // Zero upper elements.
44275 case X86ISD::VZEXT_MOVL:
44276 // Variable blend.
44277 case X86ISD::BLENDV:
44278 // Target unary shuffles:
44279 case X86ISD::MOVDDUP:
44280 // Target unary shuffles by immediate:
44281 case X86ISD::PSHUFD:
44282 case X86ISD::PSHUFLW:
44283 case X86ISD::PSHUFHW:
44284 case X86ISD::VPERMILPI:
44285 // (Non-Lane Crossing) Target Shuffles.
44286 case X86ISD::VPERMILPV:
44287 case X86ISD::VPERMIL2:
44288 case X86ISD::PSHUFB:
44289 case X86ISD::UNPCKL:
44290 case X86ISD::UNPCKH:
44291 case X86ISD::BLENDI:
44292 // Integer ops.
44293 case X86ISD::PACKSS:
44294 case X86ISD::PACKUS:
44295 case X86ISD::PCMPEQ:
44296 case X86ISD::PCMPGT:
44297 case X86ISD::PMULUDQ:
44298 case X86ISD::PMULDQ:
44299 case X86ISD::VSHLV:
44300 case X86ISD::VSRLV:
44301 case X86ISD::VSRAV:
44302 // Float ops.
44303 case X86ISD::FMAX:
44304 case X86ISD::FMIN:
44305 case X86ISD::FMAXC:
44306 case X86ISD::FMINC:
44307 case X86ISD::FRSQRT:
44308 case X86ISD::FRCP:
44309 // Horizontal Ops.
44310 case X86ISD::HADD:
44311 case X86ISD::HSUB:
44312 case X86ISD::FHADD:
44313 case X86ISD::FHSUB: {
44314 SDLoc DL(Op);
44316 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
44317 SDValue SrcOp = Op.getOperand(i);
44318 EVT SrcVT = SrcOp.getValueType();
44319 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
44320 "Unsupported vector size");
44321 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
44322 ExtSizeInBits)
44323 : SrcOp);
44324 }
44325 MVT ExtVT = VT.getSimpleVT();
44326 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
44327 ExtSizeInBits / ExtVT.getScalarSizeInBits());
44328 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
44329 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44330 SDValue Insert =
44331 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44332 return TLO.CombineTo(Op, Insert);
44333 }
44334 }
44335 }
44336
44337 // For splats, unless we *only* demand the 0'th element, stop attempts at
44338 // simplification here: we aren't going to improve things, and the splat is
44339 // better than any potential shuffle.
44340 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
44341 return false;
44342
44343 // Get target/faux shuffle mask.
44344 APInt OpUndef, OpZero;
44345 SmallVector<int, 64> OpMask;
44346 SmallVector<SDValue, 2> OpInputs;
44347 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
44348 OpZero, TLO.DAG, Depth, false))
44349 return false;
44350
44351 // Shuffle inputs must be the same size as the result.
44352 if (OpMask.size() != (unsigned)NumElts ||
44353 llvm::any_of(OpInputs, [VT](SDValue V) {
44354 return VT.getSizeInBits() != V.getValueSizeInBits() ||
44355 !V.getValueType().isVector();
44356 }))
44357 return false;
44358
44359 KnownZero = OpZero;
44360 KnownUndef = OpUndef;
44361
44362 // Check if shuffle mask can be simplified to undef/zero/identity.
44363 int NumSrcs = OpInputs.size();
44364 for (int i = 0; i != NumElts; ++i)
44365 if (!DemandedElts[i])
44366 OpMask[i] = SM_SentinelUndef;
44367
44368 if (isUndefInRange(OpMask, 0, NumElts)) {
44369 KnownUndef.setAllBits();
44370 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
44371 }
44372 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
44373 KnownZero.setAllBits();
44374 return TLO.CombineTo(
44375 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
44376 }
44377 for (int Src = 0; Src != NumSrcs; ++Src)
44378 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
44379 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
44380
44381 // Attempt to simplify inputs.
44382 for (int Src = 0; Src != NumSrcs; ++Src) {
44383 // TODO: Support inputs of different types.
44384 if (OpInputs[Src].getValueType() != VT)
44385 continue;
44386
44387 int Lo = Src * NumElts;
44388 APInt SrcElts = APInt::getZero(NumElts);
44389 for (int i = 0; i != NumElts; ++i)
44390 if (DemandedElts[i]) {
44391 int M = OpMask[i] - Lo;
44392 if (0 <= M && M < NumElts)
44393 SrcElts.setBit(M);
44394 }
44395
44396 // TODO - Propagate input undef/zero elts.
44397 APInt SrcUndef, SrcZero;
44398 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
44399 TLO, Depth + 1))
44400 return true;
44401 }
44402
44403 // If we don't demand all elements, then attempt to combine to a simpler
44404 // shuffle.
44405 // We need to convert the depth to something combineX86ShufflesRecursively
44406 // can handle - so pretend it's Depth == 0 again, and reduce the max depth
44407 // to match. This prevents combineX86ShuffleChain from returning a
44408 // combined shuffle that's the same as the original root, causing an
44409 // infinite loop.
44410 if (!DemandedElts.isAllOnes()) {
44411 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
44412
44413 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
44414 for (int i = 0; i != NumElts; ++i)
44415 if (DemandedElts[i])
44416 DemandedMask[i] = i;
44417
44419 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), DemandedMask, {}, 0,
44421 /*AllowVariableCrossLaneMask=*/true,
44422 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget),
44423 TLO.DAG, SDLoc(Op), Subtarget);
44424 if (NewShuffle)
44425 return TLO.CombineTo(Op, NewShuffle);
44426 }
44427
44428 return false;
44429}
44430
44432 SDValue Op, const APInt &OriginalDemandedBits,
44433 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
44434 unsigned Depth) const {
44435 EVT VT = Op.getValueType();
44436 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
44437 unsigned Opc = Op.getOpcode();
44438 switch (Opc) {
44439 case X86ISD::VTRUNC: {
44440 KnownBits KnownOp;
44441 SDValue Src = Op.getOperand(0);
44442 MVT SrcVT = Src.getSimpleValueType();
44443
44444 // Simplify the input, using demanded bit information.
44445 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
44446 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
44447 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
44448 return true;
44449 break;
44450 }
44451 case X86ISD::PMULDQ:
44452 case X86ISD::PMULUDQ: {
44453 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
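// (pmuludq/pmuldq multiply the low 32 bits of each 64-bit lane, so bits
// [63:32] of every source element never affect the result.)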
44454 KnownBits KnownLHS, KnownRHS;
44455 SDValue LHS = Op.getOperand(0);
44456 SDValue RHS = Op.getOperand(1);
44457
44458 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
44459 // FIXME: Can we bound this better?
44460 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
44461 APInt DemandedMaskLHS = APInt::getAllOnes(64);
44462 APInt DemandedMaskRHS = APInt::getAllOnes(64);
44463
44464 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
44465 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
44466 DemandedMaskLHS = DemandedMask;
44467 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
44468 DemandedMaskRHS = DemandedMask;
44469
44470 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
44471 KnownLHS, TLO, Depth + 1))
44472 return true;
44473 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
44474 KnownRHS, TLO, Depth + 1))
44475 return true;
44476
44477 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
44478 KnownRHS = KnownRHS.trunc(32);
44479 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
44480 KnownRHS.getConstant().isOne()) {
44481 SDLoc DL(Op);
44482 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
44483 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
44484 }
44485
44486 // Aggressively peek through ops to get at the demanded low bits.
44488 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44490 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44491 if (DemandedLHS || DemandedRHS) {
44492 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
44493 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
44494 return TLO.CombineTo(
44495 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
44496 }
44497 break;
44498 }
44499 case X86ISD::ANDNP: {
44500 KnownBits Known2;
44501 SDValue Op0 = Op.getOperand(0);
44502 SDValue Op1 = Op.getOperand(1);
44503
44504 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
44505 Known, TLO, Depth + 1))
44506 return true;
44507
44508 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
44509 OriginalDemandedElts, Known2, TLO, Depth + 1))
44510 return true;
44511
44512 // If the RHS is a constant, see if we can simplify it.
44513 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
44514 OriginalDemandedElts, TLO))
44515 return true;
44516
44517 // ANDNP = (~Op0 & Op1);
44518 Known.One &= Known2.Zero;
44519 Known.Zero |= Known2.One;
44520 break;
44521 }
44522 case X86ISD::VSHLI: {
44523 SDValue Op0 = Op.getOperand(0);
44524 SDValue Op1 = Op.getOperand(1);
44525
44526 unsigned ShAmt = Op1->getAsZExtVal();
44527 if (ShAmt >= BitWidth)
44528 break;
44529
44530 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
44531
44532 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
44533 // single shift. We can do this if the bottom bits (which are shifted
44534 // out) are never demanded.
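// For example (illustrative): vshli(vsrli(X, 3), 8) with the low 8 bits of
// each element not demanded folds to vshli(X, 5), since Diff = 8 - 3 = 5 and
// the bits cleared by the inner logical shift are never observed.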
44535 if (Op0.getOpcode() == X86ISD::VSRLI &&
44536 OriginalDemandedBits.countr_zero() >= ShAmt) {
44537 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
44538 if (Shift2Amt < BitWidth) {
44539 int Diff = ShAmt - Shift2Amt;
44540 if (Diff == 0)
44541 return TLO.CombineTo(Op, Op0.getOperand(0));
44542
44543 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
44544 SDValue NewShift = TLO.DAG.getNode(
44545 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
44546 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
44547 return TLO.CombineTo(Op, NewShift);
44548 }
44549 }
44550
44551 // If we are only demanding sign bits then we can use the shift source directly.
44552 unsigned NumSignBits =
44553 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
44554 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
44555 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44556 return TLO.CombineTo(Op, Op0);
44557
44558 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44559 TLO, Depth + 1))
44560 return true;
44561
44562 Known <<= ShAmt;
44563
44564 // Low bits known zero.
44565 Known.Zero.setLowBits(ShAmt);
44566
44567 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44568 // Attempt to avoid multi-use ops if we don't need anything from them.
44569 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44570 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44571 SDValue NewOp =
44572 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44573 return TLO.CombineTo(Op, NewOp);
44574 }
44575 }
44576 return false;
44577 }
44578 case X86ISD::VSRLI: {
44579 SDValue Op0 = Op.getOperand(0);
44580 SDValue Op1 = Op.getOperand(1);
44581
44582 unsigned ShAmt = Op1->getAsZExtVal();
44583 if (ShAmt >= BitWidth)
44584 break;
44585
44586 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44587
44588 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44589 TLO, Depth + 1))
44590 return true;
44591
44592 Known >>= ShAmt;
44593
44594 // High bits known zero.
44595 Known.Zero.setHighBits(ShAmt);
44596
44597 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44598 // Attempt to avoid multi-use ops if we don't need anything from them.
44599 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44600 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44601 SDValue NewOp =
44602 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44603 return TLO.CombineTo(Op, NewOp);
44604 }
44605 }
44606 return false;
44607 }
44608 case X86ISD::VSRAI: {
44609 SDValue Op0 = Op.getOperand(0);
44610 SDValue Op1 = Op.getOperand(1);
44611
44612 unsigned ShAmt = Op1->getAsZExtVal();
44613 if (ShAmt >= BitWidth)
44614 break;
44615
44616 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44617
44618 // If we only want bits that already match the sign bit then we don't need
44619 // to shift.
44620 unsigned NumHiDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
44621 if (TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1) >=
44622 NumHiDemandedBits)
44623 return TLO.CombineTo(Op, Op0);
44624
44625 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
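// (A matching shl/sra pair acts as sign_extend_inreg, which is a no-op when
// X already has more than C1 sign bits.)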
44626 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
44627 SDValue Op00 = Op0.getOperand(0);
44628 unsigned NumSignBits =
44629 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
44630 if (ShAmt < NumSignBits)
44631 return TLO.CombineTo(Op, Op00);
44632 }
44633
44634 // If any of the demanded bits are produced by the sign extension, we also
44635 // demand the input sign bit.
44636 if (OriginalDemandedBits.countl_zero() < ShAmt)
44637 DemandedMask.setSignBit();
44638
44639 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44640 TLO, Depth + 1))
44641 return true;
44642
44643 Known >>= ShAmt;
44644
44645 // If the input sign bit is known to be zero, or if none of the top bits
44646 // are demanded, turn this into an unsigned shift right.
44647 if (Known.Zero[BitWidth - ShAmt - 1] ||
44648 OriginalDemandedBits.countl_zero() >= ShAmt)
44649 return TLO.CombineTo(
44650 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
44651
44652 // High bits are known one.
44653 if (Known.One[BitWidth - ShAmt - 1])
44654 Known.One.setHighBits(ShAmt);
44655
44656 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44657 // Attempt to avoid multi-use ops if we don't need anything from them.
44658 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44659 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44660 SDValue NewOp =
44661 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44662 return TLO.CombineTo(Op, NewOp);
44663 }
44664 }
44665 return false;
44666 }
44667 case X86ISD::BLENDI: {
44668 SDValue LHS = Op.getOperand(0);
44669 SDValue RHS = Op.getOperand(1);
44670 APInt Mask = getBLENDIBlendMask(Op);
44671
44672 APInt DemandedEltsLHS = OriginalDemandedElts & ~Mask;
44673 if (SimplifyDemandedBits(LHS, OriginalDemandedBits, DemandedEltsLHS, Known,
44674 TLO, Depth + 1))
44675 return true;
44676
44677 APInt DemandedEltsRHS = OriginalDemandedElts & Mask;
44678 if (SimplifyDemandedBits(RHS, OriginalDemandedBits, DemandedEltsRHS, Known,
44679 TLO, Depth + 1))
44680 return true;
44681
44682 // Attempt to avoid multi-use ops if we don't need anything from them.
44684 LHS, OriginalDemandedBits, DemandedEltsLHS, TLO.DAG, Depth + 1);
44686 RHS, OriginalDemandedBits, DemandedEltsRHS, TLO.DAG, Depth + 1);
44687 if (NewLHS || NewRHS) {
44688 NewLHS = NewLHS ? NewLHS : LHS;
44689 NewRHS = NewRHS ? NewRHS : RHS;
44690 return TLO.CombineTo(Op,
44691 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT,
44692 NewLHS, NewRHS, Op.getOperand(2)));
44693 }
44694 break;
44695 }
44696 case X86ISD::BLENDV: {
44697 SDValue Sel = Op.getOperand(0);
44698 SDValue LHS = Op.getOperand(1);
44699 SDValue RHS = Op.getOperand(2);
44700
44701 APInt SignMask = APInt::getSignMask(BitWidth);
44703 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44705 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44707 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44708
44709 if (NewSel || NewLHS || NewRHS) {
44710 NewSel = NewSel ? NewSel : Sel;
44711 NewLHS = NewLHS ? NewLHS : LHS;
44712 NewRHS = NewRHS ? NewRHS : RHS;
44713 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
44714 NewSel, NewLHS, NewRHS));
44715 }
44716 break;
44717 }
44718 case X86ISD::PEXTRB:
44719 case X86ISD::PEXTRW: {
44720 SDValue Vec = Op.getOperand(0);
44721 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
44722 MVT VecVT = Vec.getSimpleValueType();
44723 unsigned NumVecElts = VecVT.getVectorNumElements();
44724
44725 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44726 unsigned Idx = CIdx->getZExtValue();
44727 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
44728
44729 // If we demand no bits from the vector then we must have demanded
44730 // bits from the implicit zext - simplify to zero.
44731 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
44732 if (DemandedVecBits == 0)
44733 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44734
44735 APInt KnownUndef, KnownZero;
44736 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
44737 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
44738 KnownZero, TLO, Depth + 1))
44739 return true;
44740
44741 KnownBits KnownVec;
44742 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
44743 KnownVec, TLO, Depth + 1))
44744 return true;
44745
44747 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44748 return TLO.CombineTo(
44749 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
44750
44751 Known = KnownVec.zext(BitWidth);
44752 return false;
44753 }
44754 break;
44755 }
44756 case X86ISD::PINSRB:
44757 case X86ISD::PINSRW: {
44758 SDValue Vec = Op.getOperand(0);
44759 SDValue Scl = Op.getOperand(1);
44760 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44761 MVT VecVT = Vec.getSimpleValueType();
44762
44763 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44764 unsigned Idx = CIdx->getZExtValue();
44765 if (!OriginalDemandedElts[Idx])
44766 return TLO.CombineTo(Op, Vec);
44767
44768 KnownBits KnownVec;
44769 APInt DemandedVecElts(OriginalDemandedElts);
44770 DemandedVecElts.clearBit(Idx);
44771 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
44772 KnownVec, TLO, Depth + 1))
44773 return true;
44774
44775 KnownBits KnownScl;
44776 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
44777 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
44778 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44779 return true;
44780
44781 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
44782 Known = KnownVec.intersectWith(KnownScl);
44783 return false;
44784 }
44785 break;
44786 }
44787 case X86ISD::PACKSS:
44788 // PACKSS saturates to MIN/MAX integer values, so if we only want the
44789 // sign bit then we can just ask for the source operands' sign bits.
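// For example (illustrative): packssdw clamps each i32 into [-32768, 32767];
// the clamped value is negative exactly when the source is negative, so the
// result sign bit always equals the source element's sign bit.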
44790 // TODO - add known bits handling.
44791 if (OriginalDemandedBits.isSignMask()) {
44792 APInt DemandedLHS, DemandedRHS;
44793 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
44794
44795 KnownBits KnownLHS, KnownRHS;
44796 APInt SignMask = APInt::getSignMask(BitWidth * 2);
44797 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
44798 KnownLHS, TLO, Depth + 1))
44799 return true;
44800 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
44801 KnownRHS, TLO, Depth + 1))
44802 return true;
44803
44804 // Attempt to avoid multi-use ops if we don't need anything from them.
44806 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44808 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44809 if (DemandedOp0 || DemandedOp1) {
44810 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
44811 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
44812 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
44813 }
44814 }
44815 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44816 break;
44817 case X86ISD::VBROADCAST: {
44818 SDValue Src = Op.getOperand(0);
44819 MVT SrcVT = Src.getSimpleValueType();
44820 APInt DemandedElts = APInt::getOneBitSet(
44821 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
44822 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
44823 TLO, Depth + 1))
44824 return true;
44825 // If we don't need the upper bits, attempt to narrow the broadcast source.
44826 // Don't attempt this on AVX512 as it might affect broadcast folding.
44827 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
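// For example (illustrative): a v2i64 broadcast where only the low 32 bits
// of each lane are demanded becomes a v4i32 broadcast of the truncated
// scalar, bitcast back to v2i64.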
44828 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
44829 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
44830 Src->hasOneUse()) {
44831 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
44832 SDValue NewSrc =
44833 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
44834 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
44835 SDValue NewBcst =
44836 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
44837 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
44838 }
44839 break;
44840 }
44841 case X86ISD::PCMPGT:
44842 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44843 // iff we only need the sign bit then we can use R directly.
44844 if (OriginalDemandedBits.isSignMask() &&
44845 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44846 return TLO.CombineTo(Op, Op.getOperand(1));
44847 break;
44848 case X86ISD::MOVMSK: {
44849 SDValue Src = Op.getOperand(0);
44850 MVT SrcVT = Src.getSimpleValueType();
44851 unsigned SrcBits = SrcVT.getScalarSizeInBits();
44852 unsigned NumElts = SrcVT.getVectorNumElements();
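// Illustrative note: MOVMSK packs the sign bit of each source element into
// the low NumElts bits of the scalar result, so a v4f32 input yields a 4-bit
// mask in bits [3:0] and every higher result bit is known zero.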
44853
44854 // If we don't need the sign bits at all just return zero.
44855 if (OriginalDemandedBits.countr_zero() >= NumElts)
44856 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44857
44858 // See if we only demand bits from the lower 128-bit vector.
44859 if (SrcVT.is256BitVector() &&
44860 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
44861 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
44862 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44863 }
44864
44865 // Only demand the vector elements of the sign bits we need.
44866 APInt KnownUndef, KnownZero;
44867 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
44868 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
44869 TLO, Depth + 1))
44870 return true;
44871
44872 Known.Zero = KnownZero.zext(BitWidth);
44873 Known.Zero.setHighBits(BitWidth - NumElts);
44874
44875 // MOVMSK only uses the MSB from each vector element.
44876 KnownBits KnownSrc;
44877 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44878 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44879 Depth + 1))
44880 return true;
44881
44882 if (KnownSrc.One[SrcBits - 1])
44883 Known.One.setLowBits(NumElts);
44884 else if (KnownSrc.Zero[SrcBits - 1])
44885 Known.Zero.setLowBits(NumElts);
44886
44887 // Attempt to avoid multi-use ops if we don't need anything from it.
44888 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
44889 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44890 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44891 return false;
44892 }
44893 case X86ISD::TESTP: {
44894 SDValue Op0 = Op.getOperand(0);
44895 SDValue Op1 = Op.getOperand(1);
44896 MVT OpVT = Op0.getSimpleValueType();
44897 assert((OpVT.getVectorElementType() == MVT::f32 ||
44898 OpVT.getVectorElementType() == MVT::f64) &&
44899 "Illegal vector type for X86ISD::TESTP");
44900
44901 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
44902 KnownBits KnownSrc;
44903 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44904 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44905 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44906 AssumeSingleUse) ||
44907 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44908 AssumeSingleUse);
44909 }
44910 case X86ISD::CMOV: {
44911 KnownBits Known2;
44912 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
44913 OriginalDemandedElts, Known2, TLO, Depth + 1))
44914 return true;
44915 if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
44916 OriginalDemandedElts, Known, TLO, Depth + 1))
44917 return true;
44918
44919 // Only known if known in both the LHS and RHS.
44920 Known = Known.intersectWith(Known2);
44921 return false;
44922 }
44923 case X86ISD::BEXTR:
44924 case X86ISD::BEXTRI: {
44925 SDValue Op0 = Op.getOperand(0);
44926 SDValue Op1 = Op.getOperand(1);
44927
44928 // Only bottom 16-bits of the control bits are required.
44929 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44930 // NOTE: SimplifyDemandedBits won't do this for constants.
44931 uint64_t Val1 = Cst1->getZExtValue();
44932 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44933 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44934 SDLoc DL(Op);
44935 return TLO.CombineTo(
44936 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44937 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44938 }
44939
44940 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44941 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
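// Worked example: a BEXTR control value of 0x0804 encodes Shift = 4 and
// Length = 8, so only bits [11:4] of Op0 are demanded and the extracted
// field is zero-extended into the low bits of the result.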
44942
44943 // If the length is 0, the result is 0.
44944 if (Length == 0) {
44945 Known.setAllZero();
44946 return false;
44947 }
44948
44949 if ((Shift + Length) <= BitWidth) {
44950 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44951 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44952 return true;
44953
44954 Known = Known.extractBits(Length, Shift);
44955 Known = Known.zextOrTrunc(BitWidth);
44956 return false;
44957 }
44958 } else {
44959 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44960 KnownBits Known1;
44961 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44962 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44963 return true;
44964
44965 // If the length is 0, replace with 0.
44966 KnownBits LengthBits = Known1.extractBits(8, 8);
44967 if (LengthBits.isZero())
44968 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44969 }
44970
44971 break;
44972 }
44973 case X86ISD::PDEP: {
44974 SDValue Op0 = Op.getOperand(0);
44975 SDValue Op1 = Op.getOperand(1);
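// PDEP deposits the low bits of Op0 into the set-bit positions of the mask
// in Op1, e.g. PDEP(0b101, mask 0b11010) places the source bits at positions
// 1, 3 and 4 to give 0b10010; all other result bits are zero.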
44976
44977 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44978 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44979
44980 // If the demanded bits have leading zeroes, we don't demand those from the
44981 // mask.
44982 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44983 return true;
44984
44985 // The number of possible 1s in the mask determines the number of LSBs of
44986 // operand 0 used. Undemanded bits from the mask don't matter so filter
44987 // them before counting.
44988 KnownBits Known2;
44989 uint64_t Count = (~Known.Zero & LoMask).popcount();
44990 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
44991 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44992 return true;
44993
44994 // Zeroes are retained from the mask, but not ones.
44995 Known.One.clearAllBits();
44996 // The result will have at least as many trailing zeros as the non-mask
44997 // operand since bits can only map to the same or higher bit position.
44998 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
44999 return false;
45000 }
45001 case X86ISD::VPMADD52L:
45002 case X86ISD::VPMADD52H: {
45003 KnownBits KnownOp0, KnownOp1, KnownOp2;
45004 SDValue Op0 = Op.getOperand(0);
45005 SDValue Op1 = Op.getOperand(1);
45006 SDValue Op2 = Op.getOperand(2);
45007 // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of
45008 // operand 2).
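// Per 64-bit lane, VPMADD52L/VPMADD52H compute the low/high 52 bits of
// (Op0[51:0] * Op1[51:0]) and add the full 64-bit Op2, which is why only the
// low 52 bits of operands 0 and 1 can affect the result.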
45009 APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52);
45010 if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0,
45011 TLO, Depth + 1))
45012 return true;
45013
45014 if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1,
45015 TLO, Depth + 1))
45016 return true;
45017
45018 if (SimplifyDemandedBits(Op2, APInt::getAllOnes(64), OriginalDemandedElts,
45019 KnownOp2, TLO, Depth + 1))
45020 return true;
45021
45022 KnownBits KnownMul;
45023 KnownOp0 = KnownOp0.trunc(52);
45024 KnownOp1 = KnownOp1.trunc(52);
45025 KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1)
45026 : KnownBits::mulhu(KnownOp0, KnownOp1);
45027 KnownMul = KnownMul.zext(64);
45028
45029 // lo/hi(X * Y) + Z --> C + Z
45030 if (KnownMul.isConstant()) {
45031 SDLoc DL(Op);
45032 SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT);
45033 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2));
45034 }
45035
45036 Known = KnownBits::add(KnownMul, KnownOp2);
45037 return false;
45038 }
45039 }
45040
45041 return TargetLowering::SimplifyDemandedBitsForTargetNode(
45042 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
45043}
45044
45045SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45046 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
45047 SelectionDAG &DAG, unsigned Depth) const {
45048 int NumElts = DemandedElts.getBitWidth();
45049 unsigned Opc = Op.getOpcode();
45050 EVT VT = Op.getValueType();
45051
45052 switch (Opc) {
45053 case X86ISD::PINSRB:
45054 case X86ISD::PINSRW: {
45055 // If we don't demand the inserted element, return the base vector.
45056 SDValue Vec = Op.getOperand(0);
45057 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
45058 MVT VecVT = Vec.getSimpleValueType();
45059 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
45060 !DemandedElts[CIdx->getZExtValue()])
45061 return Vec;
45062 break;
45063 }
45064 case X86ISD::VSHLI: {
45065 // If we are only demanding sign bits then we can use the shift source
45066 // directly.
45067 SDValue Op0 = Op.getOperand(0);
45068 unsigned ShAmt = Op.getConstantOperandVal(1);
45069 unsigned BitWidth = DemandedBits.getBitWidth();
45070 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
45071 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
45072 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
45073 return Op0;
45074 break;
45075 }
45076 case X86ISD::VSRAI:
45077 // iff we only need the sign bit then we can use the source directly.
45078 // TODO: generalize where we only demand extended signbits.
45079 if (DemandedBits.isSignMask())
45080 return Op.getOperand(0);
45081 break;
45082 case X86ISD::PCMPGT:
45083 // icmp sgt(0, R) == ashr(R, BitWidth-1).
45084 // iff we only need the sign bit then we can use R directly.
45085 if (DemandedBits.isSignMask() &&
45086 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
45087 return Op.getOperand(1);
45088 break;
45089 case X86ISD::BLENDV: {
45090 // BLENDV: Cond (MSB) ? LHS : RHS
45091 SDValue Cond = Op.getOperand(0);
45092 SDValue LHS = Op.getOperand(1);
45093 SDValue RHS = Op.getOperand(2);
45094
45095 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
45096 if (CondKnown.isNegative())
45097 return LHS;
45098 if (CondKnown.isNonNegative())
45099 return RHS;
45100 break;
45101 }
45102 case X86ISD::ANDNP: {
45103 // ANDNP = (~LHS & RHS);
45104 SDValue LHS = Op.getOperand(0);
45105 SDValue RHS = Op.getOperand(1);
45106
45107 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
45108 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
45109
45110 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
45111 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
45112 // this context, so return RHS.
45113 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
45114 return RHS;
45115 break;
45116 }
45117 }
45118
45119 APInt ShuffleUndef, ShuffleZero;
45120 SmallVector<int, 16> ShuffleMask;
45121 SmallVector<SDValue, 16> ShuffleOps;
45122 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
45123 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
45124 // If all the demanded elts are from one operand and are inline,
45125 // then we can use the operand directly.
45126 int NumOps = ShuffleOps.size();
45127 if (ShuffleMask.size() == (unsigned)NumElts &&
45128 llvm::all_of(ShuffleOps, [VT](SDValue V) {
45129 return VT.getSizeInBits() == V.getValueSizeInBits();
45130 })) {
45131
45132 if (DemandedElts.isSubsetOf(ShuffleUndef))
45133 return DAG.getUNDEF(VT);
45134 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
45135 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
45136
45137 // Bitmask that indicates which ops have only been accessed 'inline'.
45138 APInt IdentityOp = APInt::getAllOnes(NumOps);
45139 for (int i = 0; i != NumElts; ++i) {
45140 int M = ShuffleMask[i];
45141 if (!DemandedElts[i] || ShuffleUndef[i])
45142 continue;
45143 int OpIdx = M / NumElts;
45144 int EltIdx = M % NumElts;
45145 if (M < 0 || EltIdx != i) {
45146 IdentityOp.clearAllBits();
45147 break;
45148 }
45149 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
45150 if (IdentityOp == 0)
45151 break;
45152 }
45153 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
45154 "Multiple identity shuffles detected");
45155
45156 if (IdentityOp != 0)
45157 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
45158 }
45159 }
45160
45161 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45162 Op, DemandedBits, DemandedElts, DAG, Depth);
45163}
45164
45165bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45166 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45167 bool PoisonOnly, unsigned Depth) const {
45168 unsigned NumElts = DemandedElts.getBitWidth();
45169
45170 switch (Op.getOpcode()) {
45172 case X86ISD::Wrapper:
45173 case X86ISD::WrapperRIP:
45174 return true;
45175 case X86ISD::PACKSS:
45176 case X86ISD::PACKUS: {
45177 APInt DemandedLHS, DemandedRHS;
45178 getPackDemandedElts(Op.getSimpleValueType(), DemandedElts, DemandedLHS,
45179 DemandedRHS);
45180 return (!DemandedLHS ||
45181 DAG.isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), DemandedLHS,
45182 PoisonOnly, Depth + 1)) &&
45183 (!DemandedRHS ||
45184 DAG.isGuaranteedNotToBeUndefOrPoison(Op.getOperand(1), DemandedRHS,
45185 PoisonOnly, Depth + 1));
45186 }
45187 case X86ISD::INSERTPS:
45188 case X86ISD::BLENDI:
45189 case X86ISD::PSHUFB:
45190 case X86ISD::PSHUFD:
45191 case X86ISD::UNPCKL:
45192 case X86ISD::UNPCKH:
45193 case X86ISD::VPERMILPV:
45194 case X86ISD::VPERMILPI:
45195 case X86ISD::VPERMV:
45196 case X86ISD::VPERMV3: {
45197 SmallVector<int, 8> Mask;
45198 SmallVector<SDValue, 2> Ops;
45199 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
45200 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
45201 APInt::getZero(NumElts));
45202 for (auto M : enumerate(Mask)) {
45203 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
45204 continue;
45205 if (M.value() == SM_SentinelUndef)
45206 return false;
45207 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
45208 "Shuffle mask index out of range");
45209 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
45210 }
45211 for (auto Op : enumerate(Ops))
45212 if (!DemandedSrcElts[Op.index()].isZero() &&
45213 !DAG.isGuaranteedNotToBeUndefOrPoison(
45214 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
45215 return false;
45216 return true;
45217 }
45218 break;
45219 }
45220 }
45221 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45222 Op, DemandedElts, DAG, PoisonOnly, Depth);
45223}
45224
45225bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
45226 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45227 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
45228
45229 switch (Op.getOpcode()) {
45230 // SSE bit logic.
45231 case X86ISD::FAND:
45232 case X86ISD::FOR:
45233 case X86ISD::FXOR:
45234 case X86ISD::FANDN:
45235 case X86ISD::ANDNP:
45236 case X86ISD::VPTERNLOG:
45237 return false;
45238 // SSE vector insert/extracts use modulo indices.
45239 case X86ISD::PINSRB:
45240 case X86ISD::PINSRW:
45241 case X86ISD::PEXTRB:
45242 case X86ISD::PEXTRW:
45243 return false;
45244 // SSE vector multiplies are either inbounds or saturate.
45245 case X86ISD::VPMADDUBSW:
45246 case X86ISD::VPMADDWD:
45247 return false;
45248 // SSE vector shifts handle out of bounds shift amounts.
45249 case X86ISD::VSHLI:
45250 case X86ISD::VSRLI:
45251 case X86ISD::VSRAI:
45252 return false;
45253 // SSE blends.
45254 case X86ISD::BLENDI:
45255 case X86ISD::BLENDV:
45256 return false;
45257 // SSE packs.
45258 case X86ISD::PACKSS:
45259 case X86ISD::PACKUS:
45260 return false;
45261 // SSE target shuffles.
45262 case X86ISD::INSERTPS:
45263 case X86ISD::PSHUFB:
45264 case X86ISD::PSHUFD:
45265 case X86ISD::UNPCKL:
45266 case X86ISD::UNPCKH:
45267 case X86ISD::VPERMILPV:
45268 case X86ISD::VPERMILPI:
45269 case X86ISD::VPERMV:
45270 case X86ISD::VPERMV3:
45271 return false;
45272 // SSE comparisons handle all icmp/fcmp cases.
45273 // TODO: Add CMPM/MM with test coverage.
45274 case X86ISD::CMPP:
45275 case X86ISD::PCMPEQ:
45276 case X86ISD::PCMPGT:
45277 return false;
45278 // SSE signbit extraction.
45279 case X86ISD::MOVMSK:
45280 return false;
45281 // GFNI instructions.
45282 case X86ISD::GF2P8AFFINEINVQB:
45283 case X86ISD::GF2P8AFFINEQB:
45284 case X86ISD::GF2P8MULB:
45285 return false;
45286 case ISD::INTRINSIC_WO_CHAIN:
45287 switch (Op->getConstantOperandVal(0)) {
45288 case Intrinsic::x86_sse2_pmadd_wd:
45289 case Intrinsic::x86_avx2_pmadd_wd:
45290 case Intrinsic::x86_avx512_pmaddw_d_512:
45291 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
45292 case Intrinsic::x86_avx2_pmadd_ub_sw:
45293 case Intrinsic::x86_avx512_pmaddubs_w_512:
45294 return false;
45295 case Intrinsic::x86_avx512_vpermi2var_d_128:
45296 case Intrinsic::x86_avx512_vpermi2var_d_256:
45297 case Intrinsic::x86_avx512_vpermi2var_d_512:
45298 case Intrinsic::x86_avx512_vpermi2var_hi_128:
45299 case Intrinsic::x86_avx512_vpermi2var_hi_256:
45300 case Intrinsic::x86_avx512_vpermi2var_hi_512:
45301 case Intrinsic::x86_avx512_vpermi2var_pd_128:
45302 case Intrinsic::x86_avx512_vpermi2var_pd_256:
45303 case Intrinsic::x86_avx512_vpermi2var_pd_512:
45304 case Intrinsic::x86_avx512_vpermi2var_ps_128:
45305 case Intrinsic::x86_avx512_vpermi2var_ps_256:
45306 case Intrinsic::x86_avx512_vpermi2var_ps_512:
45307 case Intrinsic::x86_avx512_vpermi2var_q_128:
45308 case Intrinsic::x86_avx512_vpermi2var_q_256:
45309 case Intrinsic::x86_avx512_vpermi2var_q_512:
45310 case Intrinsic::x86_avx512_vpermi2var_qi_128:
45311 case Intrinsic::x86_avx512_vpermi2var_qi_256:
45312 case Intrinsic::x86_avx512_vpermi2var_qi_512:
45313 return false;
45314 }
45315 }
45316 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
45317 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
45318}
45319
45320bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
45321 const APInt &DemandedElts,
45322 APInt &UndefElts,
45323 const SelectionDAG &DAG,
45324 unsigned Depth) const {
45325 unsigned NumElts = DemandedElts.getBitWidth();
45326 unsigned Opc = Op.getOpcode();
45327
45328 switch (Opc) {
45329 case X86ISD::VBROADCAST:
45330 case X86ISD::VBROADCAST_LOAD:
45331 UndefElts = APInt::getZero(NumElts);
45332 return true;
45333 }
45334
45335 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
45336 DAG, Depth);
45337}
45338
45339// Helper to peek through bitops/trunc/setcc to determine size of source vector.
45340// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
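// For example, a v8i1 value produced by (setcc (v8i32 a), (v8i32 b)) reports
// a 256-bit source, while the same compare on v8i16 operands reports 128
// bits; bitwise ops and selects are looked through recursively.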
45341static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
45342 bool AllowTruncate, unsigned Depth) {
45343 // Limit recursion.
45344 if (Depth >= SelectionDAG::MaxRecursionDepth)
45345 return false;
45346 switch (Src.getOpcode()) {
45347 case ISD::TRUNCATE:
45348 if (!AllowTruncate)
45349 return false;
45350 [[fallthrough]];
45351 case ISD::SETCC:
45352 return Src.getOperand(0).getValueSizeInBits() == Size;
45353 case ISD::FREEZE:
45354 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45355 Depth + 1);
45356 case ISD::AND:
45357 case ISD::XOR:
45358 case ISD::OR:
45359 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45360 Depth + 1) &&
45361 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45362 Depth + 1);
45363 case ISD::SELECT:
45364 case ISD::VSELECT:
45365 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
45366 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45367 Depth + 1) &&
45368 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate,
45369 Depth + 1);
45370 case ISD::BUILD_VECTOR:
45371 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
45372 ISD::isBuildVectorAllOnes(Src.getNode());
45373 }
45374 return false;
45375}
45376
45377// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
45378static unsigned getAltBitOpcode(unsigned Opcode) {
45379 switch(Opcode) {
45380 // clang-format off
45381 case ISD::AND: return X86ISD::FAND;
45382 case ISD::OR: return X86ISD::FOR;
45383 case ISD::XOR: return X86ISD::FXOR;
45384 case X86ISD::ANDNP: return X86ISD::FANDN;
45385 // clang-format on
45386 }
45387 llvm_unreachable("Unknown bitwise opcode");
45388}
45389
45390// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
45391static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
45392 const SDLoc &DL) {
45393 EVT SrcVT = Src.getValueType();
45394 if (SrcVT != MVT::v4i1)
45395 return SDValue();
45396
45397 switch (Src.getOpcode()) {
45398 case ISD::SETCC:
45399 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
45400 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
45401 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
45402 SDValue Op0 = Src.getOperand(0);
45403 if (ISD::isNormalLoad(Op0.getNode()))
45404 return DAG.getBitcast(MVT::v4f32, Op0);
45405 if (Op0.getOpcode() == ISD::BITCAST &&
45406 Op0.getOperand(0).getValueType() == MVT::v4f32)
45407 return Op0.getOperand(0);
45408 }
45409 break;
45410 case ISD::AND:
45411 case ISD::XOR:
45412 case ISD::OR: {
45413 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
45414 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
45415 if (Op0 && Op1)
45416 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
45417 Op1);
45418 break;
45419 }
45420 }
45421 return SDValue();
45422}
45423
45424// Helper to push sign extension of vXi1 SETCC result through bitops.
45425static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
45426 SDValue Src, const SDLoc &DL) {
45427 switch (Src.getOpcode()) {
45428 case ISD::SETCC:
45429 case ISD::FREEZE:
45430 case ISD::TRUNCATE:
45431 case ISD::BUILD_VECTOR:
45432 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45433 case ISD::AND:
45434 case ISD::XOR:
45435 case ISD::OR:
45436 return DAG.getNode(
45437 Src.getOpcode(), DL, SExtVT,
45438 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
45439 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
45440 case ISD::SELECT:
45441 case ISD::VSELECT:
45442 return DAG.getSelect(
45443 DL, SExtVT, Src.getOperand(0),
45444 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
45445 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
45446 }
45447 llvm_unreachable("Unexpected node type for vXi1 sign extension");
45448}
45449
45450// Try to match patterns such as
45451// (i16 bitcast (v16i1 x))
45452// ->
45453// (i16 movmsk (16i8 sext (v16i1 x)))
45454// before the illegal vector is scalarized on subtargets that don't have legal
45455// vxi1 types.
45456static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
45457 const SDLoc &DL,
45458 const X86Subtarget &Subtarget) {
45459 EVT SrcVT = Src.getValueType();
45460 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
45461 return SDValue();
45462
45463 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
45464 // legalization destroys the v4i32 type.
45465 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
45466 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
45467 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
45468 DAG.getBitcast(MVT::v4f32, V));
45469 return DAG.getZExtOrTrunc(V, DL, VT);
45470 }
45471 }
45472
45473 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
45474 // movmskb even with avx512. This will be better than truncating to vXi1 and
45475 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
45476 // vpcmpeqb/vpcmpgtb.
45477 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
45478 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
45479 Src.getOperand(0).getValueType() == MVT::v32i8 ||
45480 Src.getOperand(0).getValueType() == MVT::v64i8);
45481
45482 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
45483 // directly with vpmovmskb/vmovmskps/vmovmskpd.
45484 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
45485 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
45486 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
45487 EVT CmpVT = Src.getOperand(0).getValueType();
45488 EVT EltVT = CmpVT.getVectorElementType();
45489 if (CmpVT.getSizeInBits() <= 256 &&
45490 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
45491 PreferMovMsk = true;
45492 }
45493
45494 // With AVX512 vxi1 types are legal and we prefer using k-regs.
45495 // MOVMSK is supported in SSE2 or later.
45496 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
45497 return SDValue();
45498
45499 // If the upper ops of a concatenation are undef, then try to bitcast the
45500 // lower op and extend.
45501 SmallVector<SDValue, 4> SubSrcOps;
45502 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
45503 SubSrcOps.size() >= 2) {
45504 SDValue LowerOp = SubSrcOps[0];
45505 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
45506 if (LowerOp.getOpcode() == ISD::SETCC &&
45507 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
45508 EVT SubVT = VT.getIntegerVT(
45509 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
45510 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
45511 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
45512 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
45513 }
45514 }
45515 }
45516
45517 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
45518 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
45519 // v8i16 and v16i16.
45520 // For these two cases, we can shuffle the upper element bytes to a
45521 // consecutive sequence at the start of the vector and treat the results as
45522 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
45523 // for v16i16 this is not the case, because the shuffle is expensive, so we
45524 // avoid sign-extending to this type entirely.
45525 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
45526 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
45527 MVT SExtVT;
45528 bool PropagateSExt = false;
45529 switch (SrcVT.getSimpleVT().SimpleTy) {
45530 default:
45531 return SDValue();
45532 case MVT::v2i1:
45533 SExtVT = MVT::v2i64;
45534 break;
45535 case MVT::v4i1:
45536 SExtVT = MVT::v4i32;
45537 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
45538 // sign-extend to a 256-bit operation to avoid truncation.
45539 if (Subtarget.hasAVX() &&
45540 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2(), 0)) {
45541 SExtVT = MVT::v4i64;
45542 PropagateSExt = true;
45543 }
45544 break;
45545 case MVT::v8i1:
45546 SExtVT = MVT::v8i16;
45547 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
45548 // sign-extend to a 256-bit operation to match the compare.
45549 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
45550 // 256-bit because the shuffle is cheaper than sign extending the result of
45551 // the compare.
45552 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true, 0) ||
45553 checkBitcastSrcVectorSize(Src, 512, true, 0))) {
45554 SExtVT = MVT::v8i32;
45555 PropagateSExt = true;
45556 }
45557 break;
45558 case MVT::v16i1:
45559 SExtVT = MVT::v16i8;
45560 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
45561 // it is not profitable to sign-extend to 256-bit because this will
45562 // require an extra cross-lane shuffle which is more expensive than
45563 // truncating the result of the compare to 128-bits.
45564 break;
45565 case MVT::v32i1:
45566 SExtVT = MVT::v32i8;
45567 break;
45568 case MVT::v64i1:
45569 // If we have AVX512F, but not AVX512BW, and the input is truncated from
45570 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
45571 if (Subtarget.hasAVX512()) {
45572 if (Subtarget.hasBWI())
45573 return SDValue();
45574 SExtVT = MVT::v64i8;
45575 break;
45576 }
45577 // Split if this is a <64 x i8> comparison result.
45578 if (checkBitcastSrcVectorSize(Src, 512, false, 0)) {
45579 SExtVT = MVT::v64i8;
45580 break;
45581 }
45582 return SDValue();
45583 };
45584
45585 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
45586 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45587
45588 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
45589 V = getPMOVMSKB(DL, V, DAG, Subtarget);
45590 } else {
45591 if (SExtVT == MVT::v8i16) {
45592 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
45593 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
45594 }
45595 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
45596 }
45597
45598 EVT IntVT =
45599 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
45600 V = DAG.getZExtOrTrunc(V, DL, IntVT);
45601 return DAG.getBitcast(VT, V);
45602}
45603
45604// Convert a vXi1 constant build vector to the same width scalar integer.
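// For example, the v4i1 constant <1, 0, 1, 1> becomes the i4 immediate
// 0b1101, with element 0 mapped to bit 0.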
45605static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
45606 EVT SrcVT = Op.getValueType();
45607 assert(SrcVT.getVectorElementType() == MVT::i1 &&
45608 "Expected a vXi1 vector");
45609 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
45610 "Expected a constant build vector");
45611
45612 APInt Imm(SrcVT.getVectorNumElements(), 0);
45613 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
45614 SDValue In = Op.getOperand(Idx);
45615 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
45616 Imm.setBit(Idx);
45617 }
45618 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
45619 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
45620}
45621
45622static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
45623 TargetLowering::DAGCombinerInfo &DCI,
45624 const X86Subtarget &Subtarget) {
45625 using namespace SDPatternMatch;
45626 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
45627
45628 if (!DCI.isBeforeLegalizeOps())
45629 return SDValue();
45630
45631 // Only do this if we have k-registers.
45632 if (!Subtarget.hasAVX512())
45633 return SDValue();
45634
45635 EVT DstVT = N->getValueType(0);
45636 SDValue Op = N->getOperand(0);
45637 EVT SrcVT = Op.getValueType();
45638
45639 // Make sure we have a bitcast between mask registers and a scalar type.
45640 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45641 DstVT.isScalarInteger()) &&
45642 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
45643 SrcVT.isScalarInteger()))
45644 return SDValue();
45645
45646 SDValue LHS, RHS;
45647
45648 // Look for logic ops.
45650 return SDValue();
45651
45652 // If either operand was bitcast from DstVT, then perform logic with DstVT (at
45653 // least one of the getBitcast() will fold away).
45654 if (sd_match(LHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))) ||
45655 sd_match(RHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))))
45656 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45657 DAG.getBitcast(DstVT, LHS), DAG.getBitcast(DstVT, RHS));
45658
45659 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
45660 // Most of these have to move a constant from the scalar domain anyway.
45661 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
45662 RHS = combinevXi1ConstantToInteger(RHS, DAG);
45663 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45664 DAG.getBitcast(DstVT, LHS), RHS);
45665 }
45666
45667 return SDValue();
45668}
45669
45670static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
45671 const X86Subtarget &Subtarget) {
45672 SDLoc DL(BV);
45673 unsigned NumElts = BV->getNumOperands();
45674 SDValue Splat = BV->getSplatValue();
45675
45676 // Build MMX element from integer GPR or SSE float values.
45677 auto CreateMMXElement = [&](SDValue V) {
45678 if (V.isUndef())
45679 return DAG.getUNDEF(MVT::x86mmx);
45680 if (V.getValueType().isFloatingPoint()) {
45681 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
45682 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
45683 V = DAG.getBitcast(MVT::v2i64, V);
45684 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
45685 }
45686 V = DAG.getBitcast(MVT::i32, V);
45687 } else {
45688 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
45689 }
45690 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
45691 };
45692
45693 // Convert build vector ops to MMX data in the bottom elements.
45694 SmallVector<SDValue, 8> Ops;
45695
45696 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45697
45698 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
45699 if (Splat) {
45700 if (Splat.isUndef())
45701 return DAG.getUNDEF(MVT::x86mmx);
45702
45703 Splat = CreateMMXElement(Splat);
45704
45705 if (Subtarget.hasSSE1()) {
45706 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
45707 if (NumElts == 8)
45708 Splat = DAG.getNode(
45709 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45710 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
45711 TLI.getPointerTy(DAG.getDataLayout())),
45712 Splat, Splat);
45713
45714 // Use PSHUFW to repeat 16-bit elements.
45715 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
45716 return DAG.getNode(
45717 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45718 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
45719 TLI.getPointerTy(DAG.getDataLayout())),
45720 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
45721 }
45722 Ops.append(NumElts, Splat);
45723 } else {
45724 for (unsigned i = 0; i != NumElts; ++i)
45725 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
45726 }
45727
45728 // Use tree of PUNPCKLs to build up general MMX vector.
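// For example, eight i8 elements are merged pairwise with punpcklbw, the
// four results with punpcklwd, and the final pair with punpckldq.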
45729 while (Ops.size() > 1) {
45730 unsigned NumOps = Ops.size();
45731 unsigned IntrinOp =
45732 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
45733 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
45734 : Intrinsic::x86_mmx_punpcklbw));
45735 SDValue Intrin = DAG.getTargetConstant(
45736 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
45737 for (unsigned i = 0; i != NumOps; i += 2)
45738 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
45739 Ops[i], Ops[i + 1]);
45740 Ops.resize(NumOps / 2);
45741 }
45742
45743 return Ops[0];
45744}
45745
45746// Recursive function that attempts to find if a bool vector node was originally
45747// a vector/float/double that got truncated/extended/bitcast to/from a scalar
45748// integer. If so, replace the scalar ops with bool vector equivalents back down
45749// the chain.
45750static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
45751 SelectionDAG &DAG,
45752 const X86Subtarget &Subtarget,
45753 unsigned Depth = 0) {
45754 if (Depth >= SelectionDAG::MaxRecursionDepth)
45755 return SDValue(); // Limit search depth.
45756
45757 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45758 unsigned Opc = V.getOpcode();
45759 switch (Opc) {
45760 case ISD::BITCAST: {
45761 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
45762 SDValue Src = V.getOperand(0);
45763 EVT SrcVT = Src.getValueType();
45764 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
45765 return DAG.getBitcast(VT, Src);
45766 break;
45767 }
45768 case ISD::Constant: {
45769 auto *C = cast<ConstantSDNode>(V);
45770 if (C->isZero())
45771 return DAG.getConstant(0, DL, VT);
45772 if (C->isAllOnes())
45773 return DAG.getAllOnesConstant(DL, VT);
45774 break;
45775 }
45776 case ISD::TRUNCATE: {
45777 // If we find a suitable source, a truncated scalar becomes a subvector.
45778 SDValue Src = V.getOperand(0);
45779 EVT NewSrcVT =
45780 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
45781 if (TLI.isTypeLegal(NewSrcVT))
45782 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45783 Subtarget, Depth + 1))
45784 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
45785 DAG.getVectorIdxConstant(0, DL));
45786 break;
45787 }
45788 case ISD::ANY_EXTEND:
45789 case ISD::ZERO_EXTEND: {
45790 // If we find a suitable source, an extended scalar becomes a subvector.
45791 SDValue Src = V.getOperand(0);
45792 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
45793 Src.getScalarValueSizeInBits());
45794 if (TLI.isTypeLegal(NewSrcVT))
45795 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45796 Subtarget, Depth + 1))
45797 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
45798 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
45799 : DAG.getConstant(0, DL, VT),
45800 N0, DAG.getVectorIdxConstant(0, DL));
45801 break;
45802 }
45803 case ISD::OR:
45804 case ISD::XOR: {
45805 // If we find suitable sources, we can just move the op to the vector
45806 // domain.
45807 if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
45808 Subtarget, Depth + 1))
45809 if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
45810 Subtarget, Depth + 1))
45811 return DAG.getNode(Opc, DL, VT, N0, N1);
45812 break;
45813 }
45814 case ISD::SHL: {
45815 // If we find a suitable source, a SHL becomes a KSHIFTL.
45816 SDValue Src0 = V.getOperand(0);
45817 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
45818 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
45819 break;
45820
45821 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
45822 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
45823 Depth + 1))
45824 return DAG.getNode(
45825 X86ISD::KSHIFTL, DL, VT, N0,
45826 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
45827 break;
45828 }
45829 }
45830
45831 // Does the inner bitcast already exist?
45832 if (Depth > 0)
45833 if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
45834 return SDValue(Alt, 0);
45835
45836 return SDValue();
45837}
45838
45839static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
45840 TargetLowering::DAGCombinerInfo &DCI,
45841 const X86Subtarget &Subtarget) {
45842 SDValue N0 = N->getOperand(0);
45843 EVT VT = N->getValueType(0);
45844 EVT SrcVT = N0.getValueType();
45845 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45846
45847 // Try to match patterns such as
45848 // (i16 bitcast (v16i1 x))
45849 // ->
45850 // (i16 movmsk (16i8 sext (v16i1 x)))
45851 // before the setcc result is scalarized on subtargets that don't have legal
45852 // vxi1 types.
45853 if (DCI.isBeforeLegalize()) {
45854 SDLoc dl(N);
45855 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
45856 return V;
45857
45858 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45859 // type, widen both sides to avoid a trip through memory.
45860 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
45861 Subtarget.hasAVX512()) {
45862 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
45863 N0 = DAG.getBitcast(MVT::v8i1, N0);
45864 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
45865 DAG.getVectorIdxConstant(0, dl));
45866 }
45867
45868 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45869 // type, widen both sides to avoid a trip through memory.
45870 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
45871 Subtarget.hasAVX512()) {
45872 // Use zeros for the widening if we already have some zeroes. This can
45873 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
45874 // stream of this.
45875 // FIXME: It might make sense to detect a concat_vectors with a mix of
45876 // zeroes and undef and turn it into insert_subvector for i1 vectors as
45877 // a separate combine. What we can't do is canonicalize the operands of
45878 // such a concat or we'll get into a loop with SimplifyDemandedBits.
45879 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
45880 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
45881 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
45882 SrcVT = LastOp.getValueType();
45883 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45884 SmallVector<SDValue, 4> Ops(N0->ops());
45885 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
45886 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45887 N0 = DAG.getBitcast(MVT::i8, N0);
45888 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45889 }
45890 }
45891
45892 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45893 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
45894 Ops[0] = N0;
45895 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45896 N0 = DAG.getBitcast(MVT::i8, N0);
45897 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45898 }
45899 } else if (DCI.isAfterLegalizeDAG()) {
45900 // If we're bitcasting from iX to vXi1, see if the integer originally
45901 // began as a vXi1 and whether we can remove the bitcast entirely.
45902 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
45903 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
45904 if (SDValue V =
45905 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
45906 return V;
45907 }
45908 }
45909
45910 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
45911 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
45912 // due to insert_subvector legalization on KNL. By promoting the copy to i16
45913 // we can help with known bits propagation from the vXi1 domain to the
45914 // scalar domain.
45915 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
45916 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45917 N0.getOperand(0).getValueType() == MVT::v16i1 &&
45918 isNullConstant(N0.getOperand(1)))
45919 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
45920 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
45921
45922 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
45923 // and the vbroadcast_load are both integer or both fp. In some cases this
45924 // will remove the bitcast entirely.
45925 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
45926 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
45927 auto *BCast = cast<MemIntrinsicSDNode>(N0);
45928 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
45929 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45930 // Don't swap i8/i16 since we don't have fp types of that size.
45931 if (MemSize >= 32) {
45932 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
45933 : MVT::getIntegerVT(MemSize);
45934 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
45935 : MVT::getIntegerVT(SrcVTSize);
45936 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
45937
45938 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
45939 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45940 SDValue ResNode =
45941 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
45942 MemVT, BCast->getMemOperand());
45943 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
45944 return DAG.getBitcast(VT, ResNode);
45945 }
45946 }
45947
45948 // Attempt to peek through f16 bitcasted extractions hidden by truncation.
45949 if (VT == MVT::f16 && SrcVT == MVT::i16) {
45950 SDValue Src = peekThroughTruncates(N0);
45951 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45952 Src.getOperand(0).getValueSizeInBits() == 128 &&
45953 isNullConstant(Src.getOperand(1))) {
45954 SDLoc DL(N);
45955 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45956 DAG.getBitcast(MVT::v8f16, Src.getOperand(0)),
45957 DAG.getVectorIdxConstant(0, DL));
45958 }
45959 }
45960
45961 // Since MMX types are special and don't usually play with other vector types,
45962 // it's better to handle them early to be sure we emit efficient code by
45963 // avoiding store-load conversions.
45964 if (VT == MVT::x86mmx) {
45965 // Detect MMX constant vectors.
45966 APInt UndefElts;
45967 SmallVector<APInt, 1> EltBits;
45968 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
45969 /*AllowWholeUndefs*/ true,
45970 /*AllowPartialUndefs*/ true)) {
45971 SDLoc DL(N0);
45972 // Handle zero-extension of i32 with MOVD.
45973 if (EltBits[0].countl_zero() >= 32)
45974 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
45975 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
45976 // Else, bitcast to a double.
45977 // TODO - investigate supporting sext 32-bit immediates on x86_64.
45978 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
45979 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
45980 }
45981
45982 // Detect bitcasts to x86mmx low word.
45983 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45984 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
45985 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
45986 bool LowUndef = true, AllUndefOrZero = true;
45987 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
45988 SDValue Op = N0.getOperand(i);
45989 LowUndef &= Op.isUndef() || (i >= e/2);
45990 AllUndefOrZero &= isNullConstantOrUndef(Op);
45991 }
45992 if (AllUndefOrZero) {
45993 SDValue N00 = N0.getOperand(0);
45994 SDLoc dl(N00);
45995 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
45996 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
45997 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
45998 }
45999 }
46000
46001 // Detect bitcasts of 64-bit build vectors and convert to a
46002 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
46003 // lowest element.
46004 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
46005 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
46006 SrcVT == MVT::v8i8))
46007 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
46008
46009 // Detect bitcasts between element or subvector extraction to x86mmx.
46010 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
46011 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
46012 isNullConstant(N0.getOperand(1))) {
46013 SDValue N00 = N0.getOperand(0);
46014 if (N00.getValueType().is128BitVector())
46015 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
46016 DAG.getBitcast(MVT::v2i64, N00));
46017 }
46018
46019 // Detect bitcasts from FP_TO_SINT to x86mmx.
46020 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
46021 SDLoc DL(N0);
46022 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
46023 DAG.getUNDEF(MVT::v2i32));
46024 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
46025 DAG.getBitcast(MVT::v2i64, Res));
46026 }
46027 }
46028
46029 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
46030 // most of these to scalar anyway.
46031 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
46032 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
46033 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
46034 return combinevXi1ConstantToInteger(N0, DAG);
46035 }
46036
46037 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
46038 VT.getVectorElementType() == MVT::i1) {
46039 if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
46040 if (C->isAllOnes())
46041 return DAG.getConstant(1, SDLoc(N0), VT);
46042 if (C->isZero())
46043 return DAG.getConstant(0, SDLoc(N0), VT);
46044 }
46045 }
46046
46047 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
46048 // Turn it into a sign bit compare that produces a k-register. This avoids
46049 // a trip through a GPR.
46050 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
46051 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
46052 isPowerOf2_32(VT.getVectorNumElements())) {
46053 unsigned NumElts = VT.getVectorNumElements();
46054 SDValue Src = N0;
46055
46056 // Peek through truncate.
46057 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
46058 Src = N0.getOperand(0);
46059
46060 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
46061 SDValue MovmskIn = Src.getOperand(0);
46062 MVT MovmskVT = MovmskIn.getSimpleValueType();
46063 unsigned MovMskElts = MovmskVT.getVectorNumElements();
46064
46065 // We allow extra bits of the movmsk to be used since they are known zero.
46066 // We can't convert a VPMOVMSKB without avx512bw.
46067 if (MovMskElts <= NumElts &&
46068 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
46069 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
46070 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
46071 SDLoc dl(N);
46072 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
46073 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
46074 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
46075 if (EVT(CmpVT) == VT)
46076 return Cmp;
46077
46078 // Pad with zeroes up to original VT to replace the zeroes that were
46079 // being used from the MOVMSK.
46080 unsigned NumConcats = NumElts / MovMskElts;
46081 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
46082 Ops[0] = Cmp;
46083 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
46084 }
46085 }
46086 }
46087
46088 // Try to remove bitcasts from input and output of mask arithmetic to
46089 // remove GPR<->K-register crossings.
46090 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
46091 return V;
46092
46093 // bitcast(v1Ty insert_vector_elt(X, Y, 0)) --> Y
46094 if (N0.getOpcode() == ISD::INSERT_VECTOR_ELT && SrcVT.getScalarType() == VT &&
46095 SrcVT.getVectorNumElements() == 1)
46096 return N0.getOperand(1);
46097
46098 // Convert a bitcasted integer logic operation that has one bitcasted
46099 // floating-point operand into a floating-point logic operation. This may
46100 // create a load of a constant, but that is cheaper than materializing the
46101 // constant in an integer register and transferring it to an SSE register or
46102 // transferring the SSE operand to integer register and back.
46103 unsigned FPOpcode;
46104 switch (N0.getOpcode()) {
46105 // clang-format off
46106 case ISD::AND: FPOpcode = X86ISD::FAND; break;
46107 case ISD::OR: FPOpcode = X86ISD::FOR; break;
46108 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
46109 default: return SDValue();
46110 // clang-format on
46111 }
46112
46113 // Check if we have a bitcast from another integer type as well.
46114 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
46115 (Subtarget.hasSSE2() && VT == MVT::f64) ||
46116 (Subtarget.hasFP16() && VT == MVT::f16) ||
46117 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
46118 TLI.isTypeLegal(VT))))
46119 return SDValue();
46120
46121 SDValue LogicOp0 = N0.getOperand(0);
46122 SDValue LogicOp1 = N0.getOperand(1);
46123 SDLoc DL0(N0);
46124
46125 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
46126 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
46127 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
46128 LogicOp0.getOperand(0).getValueType() == VT &&
46129 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
46130 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
46131 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46132 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
46133 }
46134 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
46135 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
46136 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
46137 LogicOp1.getOperand(0).getValueType() == VT &&
46138 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
46139 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
46140 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46141 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
46142 }
46143
46144 return SDValue();
46145}
46146
46147// (mul (zext a), (sext b))
46148static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
46149 SDValue &Op1) {
46150 Op0 = Mul.getOperand(0);
46151 Op1 = Mul.getOperand(1);
46152
46153 // The sign-extended operand should be Op1, so swap if it is currently Op0.
46154 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
46155 std::swap(Op0, Op1);
46156
46157 auto IsFreeTruncation = [](SDValue &Op) -> bool {
46158 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
46159 Op.getOpcode() == ISD::SIGN_EXTEND) &&
46160 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
46161 return true;
46162
46163 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
46164 return (BV && BV->isConstant());
46165 };
46166
46167 // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
46168 // value, we need to check that Op0 is a zero-extended value. Op1 should be a
46169 // signed value, so we just check its number of significant bits.
46170 if ((IsFreeTruncation(Op0) &&
46171 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
46172 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
46173 return true;
46174
46175 return false;
46176}
46177
46178static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
46179 unsigned &LogBias, const SDLoc &DL,
46180 const X86Subtarget &Subtarget) {
46181 // Extend or truncate to MVT::i8 first.
46182 MVT Vi8VT =
46183 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
46184 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
46185 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
46186
46187 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
46188 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
46189 // The src A, B element type is i8, but the dst C element type is i32.
46190 // When we calculate the number of reduction stages we use the vXi8 source
46191 // type, so we need a log-bias of 2 to avoid 2 extra stages.
46192 LogBias = 2;
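// For example, a v16i8 source would nominally need log2(16) = 4 reduction
// stages, but VPDPBUSD already sums each group of 4 bytes into an i32 lane,
// so only log2(4) = 2 stages remain.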
46193
46194 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
46195 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
46196 RegSize = std::max(512u, RegSize);
46197
46198 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
46199 // fill in the missing vector elements with 0.
46200 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
46201 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
46202 Ops[0] = LHS;
46203 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46204 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46205 Ops[0] = RHS;
46206 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46207
46208 // Actually build the DotProduct, split as 256/512 bits for
46209 // AVXVNNI/AVX512VNNI.
46210 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46211 ArrayRef<SDValue> Ops) {
46212 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
46213 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
46214 };
46215 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
46216 SDValue Zero = DAG.getConstant(0, DL, DpVT);
46217
46218 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
46219 DpBuilder, /*CheckBWI=*/false, Subtarget.hasVNNI());
46220}
46221
46222// Create a PSADBW given two sources representable as zexts of vXi8.
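// PSADBW computes, per 64-bit lane, the sum of absolute differences of the
// eight corresponding byte elements, so with one all-zero source it acts as
// a horizontal add of eight zero-extended bytes into an i64 lane.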
46223static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &N0, const SDValue &N1,
46224 const SDLoc &DL, const X86Subtarget &Subtarget) {
46225 // Find the appropriate width for the PSADBW.
46226 EVT DstVT = N0.getValueType();
46227 EVT SrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8,
46228 DstVT.getVectorElementCount());
46229 unsigned RegSize = std::max(128u, (unsigned)SrcVT.getSizeInBits());
46230
46231 // Widen the vXi8 vectors, padding with zero vector elements.
46232 unsigned NumConcat = RegSize / SrcVT.getSizeInBits();
46233 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, SrcVT));
46234 Ops[0] = DAG.getZExtOrTrunc(N0, DL, SrcVT);
46235 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46236 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46237 Ops[0] = DAG.getZExtOrTrunc(N1, DL, SrcVT);
46238 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46239
46240 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
46241 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46242 ArrayRef<SDValue> Ops) {
46243 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
46244 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
46245 };
46246 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
46247 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {SadOp0, SadOp1},
46248 PSADBWBuilder);
46249}
46250
46251// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
46252// PHMINPOSUW.
46253static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
46254 const X86Subtarget &Subtarget) {
46255 // Bail without SSE41.
46256 if (!Subtarget.hasSSE41())
46257 return SDValue();
46258
46259 EVT ExtractVT = Extract->getValueType(0);
46260 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
46261 return SDValue();
46262
46263 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
46264 ISD::NodeType BinOp;
46265 SDValue Src = DAG.matchBinOpReduction(
46266 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
46267 if (!Src)
46268 return SDValue();
46269
46270 EVT SrcVT = Src.getValueType();
46271 EVT SrcSVT = SrcVT.getScalarType();
46272 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
46273 return SDValue();
46274
46275 SDLoc DL(Extract);
46276 SDValue MinPos = Src;
46277
46278 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
46279 while (SrcVT.getSizeInBits() > 128) {
46280 SDValue Lo, Hi;
46281 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
46282 SrcVT = Lo.getValueType();
46283 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
46284 }
46285 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
46286 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
46287 "Unexpected value type");
46288
46289 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
46290 // to flip the value accordingly.
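// For example, SMAX is handled by XORing each element with 0x7FFF before and
// after the UMIN-based PHMINPOSUW, which reverses the signed order into the
// unsigned order the instruction expects.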
46291 SDValue Mask;
46292 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
46293 if (BinOp == ISD::SMAX)
46294 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
46295 else if (BinOp == ISD::SMIN)
46296 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
46297 else if (BinOp == ISD::UMAX)
46298 Mask = DAG.getAllOnesConstant(DL, SrcVT);
46299
46300 if (Mask)
46301 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46302
46303 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
46304 // shuffling each upper element down and inserting zeros. This means that the
46305 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
46306 // ready for the PHMINPOS.
46307 if (ExtractVT == MVT::i8) {
46308 SDValue Upper = DAG.getVectorShuffle(
46309 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
46310 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
46311 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
46312 }
46313
46314 // Perform the PHMINPOS on a v8i16 vector.
46315 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
46316 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
46317 MinPos = DAG.getBitcast(SrcVT, MinPos);
46318
46319 if (Mask)
46320 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46321
46322 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
46323 DAG.getVectorIdxConstant(0, DL));
46324}
46325
46326// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
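// E.g. for a v4i32 comparison result X:
//   all_of  --> (MOVMSKPS X) == 0xF
//   any_of  --> (MOVMSKPS X) != 0
//   parity  --> PARITY(MOVMSKPS X)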
46327 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
46328 const X86Subtarget &Subtarget) {
46329 // Bail without SSE2.
46330 if (!Subtarget.hasSSE2())
46331 return SDValue();
46332
46333 EVT ExtractVT = Extract->getValueType(0);
46334 unsigned BitWidth = ExtractVT.getSizeInBits();
46335 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
46336 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
46337 return SDValue();
46338
46339 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
46340 ISD::NodeType BinOp;
46341 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
46342 if (!Match && ExtractVT == MVT::i1)
46343 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
46344 if (!Match)
46345 return SDValue();
46346
46347 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
46348 // which we don't support here for now.
46349 if (Match.getScalarValueSizeInBits() != BitWidth)
46350 return SDValue();
46351
46352 SDValue Movmsk;
46353 SDLoc DL(Extract);
46354 EVT MatchVT = Match.getValueType();
46355 unsigned NumElts = MatchVT.getVectorNumElements();
46356 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
46357 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46358 LLVMContext &Ctx = *DAG.getContext();
46359
46360 if (ExtractVT == MVT::i1) {
46361 // Special case for (pre-legalization) vXi1 reductions.
46362 if (NumElts > 64 || !isPowerOf2_32(NumElts))
46363 return SDValue();
46364 if (Match.getOpcode() == ISD::SETCC) {
46365 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
46366 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
46367 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
46368 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
46369 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
46370 X86::CondCode X86CC;
46371 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
46372 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
46373 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
46374 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
46375 DAG, X86CC))
46376 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
46377 getSETCC(X86CC, V, DL, DAG));
46378 }
46379 }
46380 if (TLI.isTypeLegal(MatchVT)) {
46381 // If this is a legal AVX512 predicate type then we can just bitcast.
46382 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46383 Movmsk = DAG.getBitcast(MovmskVT, Match);
46384 } else {
46385 // Use combineBitcastvxi1 to create the MOVMSK.
46386 while (NumElts > MaxElts) {
46387 SDValue Lo, Hi;
46388 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46389 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46390 NumElts /= 2;
46391 }
46392 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46393 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
46394 }
46395 if (!Movmsk)
46396 return SDValue();
46397 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
46398 } else {
46399 // FIXME: Better handling of k-registers or 512-bit vectors?
46400 unsigned MatchSizeInBits = Match.getValueSizeInBits();
46401 if (!(MatchSizeInBits == 128 ||
46402 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
46403 return SDValue();
46404
46405 // Make sure this isn't a vector of 1 element. The perf win from using
46406 // MOVMSK diminishes with fewer elements in the reduction, but it is
46407 // generally better to get the comparison over to the GPRs as soon as
46408 // possible to reduce the number of vector ops.
46409 if (Match.getValueType().getVectorNumElements() < 2)
46410 return SDValue();
46411
46412 // Check that we are extracting a reduction of all sign bits.
46413 if (DAG.ComputeNumSignBits(Match) != BitWidth)
46414 return SDValue();
46415
46416 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
46417 SDValue Lo, Hi;
46418 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46419 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46420 MatchSizeInBits = Match.getValueSizeInBits();
46421 }
46422
46423 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
46424 MVT MaskSrcVT;
46425 if (64 == BitWidth || 32 == BitWidth)
46426 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
46427 MatchSizeInBits / BitWidth);
46428 else
46429 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
46430
46431 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
46432 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
46433 NumElts = MaskSrcVT.getVectorNumElements();
46434 }
46435 assert((NumElts <= 32 || NumElts == 64) &&
46436 "Not expecting more than 64 elements");
46437
46438 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
46439 if (BinOp == ISD::XOR) {
46440 // parity -> (PARITY(MOVMSK X))
46441 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
46442 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
46443 }
46444
46445 SDValue CmpC;
46446 ISD::CondCode CondCode;
46447 if (BinOp == ISD::OR) {
46448 // any_of -> MOVMSK != 0
46449 CmpC = DAG.getConstant(0, DL, CmpVT);
46450 CondCode = ISD::CondCode::SETNE;
46451 } else {
46452 // all_of -> MOVMSK == ((1 << NumElts) - 1)
46453 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
46454 DL, CmpVT);
46455 CondCode = ISD::CondCode::SETEQ;
46456 }
46457
46458 // The setcc produces an i8 of 0/1, so extend that to the result width and
46459 // negate to get the final 0/-1 mask value.
46460 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
46461 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
46462 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
46463 return DAG.getNegative(Zext, DL, ExtractVT);
46464}
46465
46466 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
46467 const X86Subtarget &Subtarget) {
46468 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
46469 return SDValue();
46470
46471 EVT ExtractVT = Extract->getValueType(0);
46472 // Verify the type we're extracting is i32, as the output element type of
46473 // vpdpbusd is i32.
46474 if (ExtractVT != MVT::i32)
46475 return SDValue();
46476
46477 EVT VT = Extract->getOperand(0).getValueType();
46478 if (!isPowerOf2_32(VT.getVectorNumElements()))
46479 return SDValue();
46480
46481 // Match shuffle + add pyramid.
46482 ISD::NodeType BinOp;
46483 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46484
46485 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
46486 // done by vpdpbusd compute a signed 16-bit product that will be sign extended
46487 // before adding into the accumulator.
46488 // TODO:
46489 // We also need to verify that the multiply has at least 2x the number of bits
46490 // of the input. We shouldn't match
46491 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
46492 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
46493 // Root = Root.getOperand(0);
46494
46495 // If there was a match, we want Root to be a mul.
46496 if (!Root || Root.getOpcode() != ISD::MUL)
46497 return SDValue();
46498
46499 // Check whether we have an extend and mul pattern.
46500 SDValue LHS, RHS;
46501 if (!detectExtMul(DAG, Root, LHS, RHS))
46502 return SDValue();
46503
46504 // Create the dot product instruction.
46505 SDLoc DL(Extract);
46506 unsigned StageBias;
46507 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
46508
46509 // If the original vector was wider than 4 elements, sum over the results
46510 // in the DP vector.
46511 unsigned Stages = Log2_32(VT.getVectorNumElements());
46512 EVT DpVT = DP.getValueType();
46513
46514 if (Stages > StageBias) {
46515 unsigned DpElems = DpVT.getVectorNumElements();
46516
46517 for (unsigned i = Stages - StageBias; i > 0; --i) {
46518 SmallVector<int, 16> Mask(DpElems, -1);
46519 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46520 Mask[j] = MaskEnd + j;
46521
46522 SDValue Shuffle =
46523 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
46524 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
46525 }
46526 }
46527
46528 // Return the lowest ExtractSizeInBits bits.
46529 EVT ResVT =
46530 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46531 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
46532 DP = DAG.getBitcast(ResVT, DP);
46533 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
46534 Extract->getOperand(1));
46535}
46536
46537 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
46538 const X86Subtarget &Subtarget) {
46539 using namespace SDPatternMatch;
46540
46541 // PSADBW is only supported on SSE2 and up.
46542 if (!Subtarget.hasSSE2())
46543 return SDValue();
46544
46545 EVT ExtractVT = Extract->getValueType(0);
46546 if (ExtractVT != MVT::i8 && ExtractVT != MVT::i16 && ExtractVT != MVT::i32 &&
46547 ExtractVT != MVT::i64)
46548 return SDValue();
46549
46550 EVT VT = Extract->getOperand(0).getValueType();
46551 if (!isPowerOf2_32(VT.getVectorNumElements()))
46552 return SDValue();
46553
46554 // Match shuffle + add pyramid.
46555 ISD::NodeType BinOp;
46556 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46557 if (!Root)
46558 return SDValue();
46559
46560 // The operand is expected to be zero extended from i8.
46561 // To convert to i64 and above, an additional any/zero/sign
46562 // extend is expected.
46563 // A zero extend from 32 bits has no mathematical effect on the result.
46564 // The sign extend also behaves like a zero extend here
46565 // (it extends the sign bit, which is zero).
46566 // So it is correct to skip the sign/zero extend instruction.
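// E.g. a root of the form (zext (abs (sub (zext vXi8 a), (zext vXi8 b))))
// has the same element values as the inner absolute difference, so the outer
// extend can safely be looked through before matching.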
46567 if (Root.getOpcode() == ISD::SIGN_EXTEND ||
46568 Root.getOpcode() == ISD::ZERO_EXTEND ||
46569 Root.getOpcode() == ISD::ANY_EXTEND)
46570 Root = Root.getOperand(0);
46571
46572 // Check whether we have a vXi8 abdu pattern.
46573 // TODO: Just match ISD::ABDU once the DAG is topologically sorted.
46574 SDValue Src0, Src1;
46575 if (!sd_match(
46576 Root,
46577 m_AnyOf(
46578 m_SpecificVectorElementVT(
46579 MVT::i8, m_c_BinOp(ISD::ABDU, m_Value(Src0), m_Value(Src1))),
46580 m_SpecificVectorElementVT(
46581 MVT::i8, m_Sub(m_UMax(m_Value(Src0), m_Value(Src1)),
46582 m_UMin(m_Deferred(Src0), m_Deferred(Src1)))),
46583 m_Abs(
46584 m_Sub(m_AllOf(m_Value(Src0),
46585 m_ZExt(m_SpecificVectorElementVT(MVT::i8))),
46586 m_AllOf(m_Value(Src1),
46587 m_ZExt(m_SpecificVectorElementVT(MVT::i8))))))))
46588 return SDValue();
46589
46590 // Create the SAD instruction.
46591 SDLoc DL(Extract);
46592 SDValue SAD = createPSADBW(DAG, Src0, Src1, DL, Subtarget);
46593
46594 // If the original vector was wider than 8 elements, sum over the results
46595 // in the SAD vector.
46596 unsigned Stages = Log2_32(VT.getVectorNumElements());
46597 EVT SadVT = SAD.getValueType();
46598 if (Stages > 3) {
46599 unsigned SadElems = SadVT.getVectorNumElements();
46600
46601 for (unsigned i = Stages - 3; i > 0; --i) {
46602 SmallVector<int, 16> Mask(SadElems, -1);
46603 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46604 Mask[j] = MaskEnd + j;
46605
46606 SDValue Shuffle =
46607 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
46608 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
46609 }
46610 }
46611
46612 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
46613 // Return the lowest ExtractSizeInBits bits.
46614 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46615 SadVT.getSizeInBits() / ExtractSizeInBits);
46616 SAD = DAG.getBitcast(ResVT, SAD);
46617 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
46618 Extract->getOperand(1));
46619}
46620
46621// If this extract is from a loaded vector value and will be used as an
46622// integer, that requires a potentially expensive XMM -> GPR transfer.
46623// Additionally, if we can convert to a scalar integer load, that will likely
46624// be folded into a subsequent integer op.
46625// Note: SrcVec might not have a VecVT type, but it must be the same size.
46626// Note: Unlike the related fold for this in DAGCombiner, this is not limited
46627// to a single-use of the loaded vector. For the reasons above, we
46628// expect this to be profitable even if it creates an extra load.
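// For example, (i32 (extract_vector_elt (v4i32 (load %p)), 2)) becomes a
// scalar i32 load from %p + 8, which can then fold into a dependent integer
// instruction instead of going through an XMM -> GPR transfer.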
46629static SDValue
46630 combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
46631 const SDLoc &dl, SelectionDAG &DAG,
46632 TargetLowering::DAGCombinerInfo &DCI) {
46633 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46634 "Only EXTRACT_VECTOR_ELT supported so far");
46635
46636 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46637 EVT VT = N->getValueType(0);
46638
46639 bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) {
46640 return Use->getOpcode() == ISD::STORE ||
46641 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
46642 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
46643 });
46644
46645 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
46646 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
46647 VecVT.getVectorElementType() == VT &&
46648 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
46649 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
46650 SDValue NewPtr = TLI.getVectorElementPointer(
46651 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
46652 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
46653 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
46654 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
46655 SDValue Load =
46656 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
46657 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
46658 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
46659 return Load;
46660 }
46661
46662 return SDValue();
46663}
46664
46665// Attempt to peek through a target shuffle and extract the scalar from the
46666// source.
46667 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
46668 TargetLowering::DAGCombinerInfo &DCI,
46669 const X86Subtarget &Subtarget) {
46670 if (DCI.isBeforeLegalizeOps())
46671 return SDValue();
46672
46673 SDLoc dl(N);
46674 SDValue Src = N->getOperand(0);
46675 SDValue Idx = N->getOperand(1);
46676
46677 EVT VT = N->getValueType(0);
46678 EVT SrcVT = Src.getValueType();
46679 EVT SrcSVT = SrcVT.getVectorElementType();
46680 unsigned SrcEltBits = SrcSVT.getSizeInBits();
46681 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46682
46683 // Don't attempt this for boolean mask vectors or unknown extraction indices.
46684 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
46685 return SDValue();
46686
46687 const APInt &IdxC = N->getConstantOperandAPInt(1);
46688 if (IdxC.uge(NumSrcElts))
46689 return SDValue();
46690
46691 SDValue SrcBC = peekThroughBitcasts(Src);
46692
46693 // Handle extract(bitcast(broadcast(scalar_value))).
46694 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
46695 SDValue SrcOp = SrcBC.getOperand(0);
46696 EVT SrcOpVT = SrcOp.getValueType();
46697 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
46698 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
46699 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
46700 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
46701 // TODO support non-zero offsets.
46702 if (Offset == 0) {
46703 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
46704 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
46705 return SrcOp;
46706 }
46707 }
46708 }
46709
46710 // If we're extracting a single element from a broadcast load and there are
46711 // no other users, just create a single load.
46712 if (X86ISD::VBROADCAST_LOAD == SrcBC.getOpcode() &&
46713 SrcBC.hasOneUse()) {
46714 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
46715 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
46716 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
46717 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
46718 SDValue Load =
46719 DAG.getLoad(VT, dl, MemIntr->getChain(), MemIntr->getBasePtr(),
46720 MemIntr->getPointerInfo(), MemIntr->getBaseAlign(),
46721 MemIntr->getMemOperand()->getFlags());
46722 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
46723 return Load;
46724 }
46725 }
46726
46727 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
46728 // TODO: Move to DAGCombine?
46729 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
46730 SrcBC.getValueType().isInteger() &&
46731 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
46732 SrcBC.getScalarValueSizeInBits() ==
46733 SrcBC.getOperand(0).getValueSizeInBits()) {
46734 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
46735 if (IdxC.ult(Scale)) {
46736 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
46737 SDValue Scl = SrcBC.getOperand(0);
46738 EVT SclVT = Scl.getValueType();
46739 if (Offset) {
46740 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
46741 DAG.getShiftAmountConstant(Offset, SclVT, dl));
46742 }
46743 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
46744 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
46745 return Scl;
46746 }
46747 }
46748
46749 // Handle extract(truncate(x)) for 0'th index.
46750 // TODO: Treat this as a faux shuffle?
46751 // TODO: When can we use this for general indices?
46752 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
46753 (SrcVT.getSizeInBits() % 128) == 0) {
46754 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
46755 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
46756 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
46757 Idx);
46758 }
46759
46760 // We can only legally extract other elements from 128-bit vectors and in
46761 // certain circumstances, depending on SSE-level.
46762 // TODO: Investigate float/double extraction if it will be just stored.
46763 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
46764 unsigned Idx) {
46765 EVT VecSVT = VecVT.getScalarType();
46766 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
46767 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
46768 VecSVT == MVT::i64)) {
46769 unsigned EltSizeInBits = VecSVT.getSizeInBits();
46770 unsigned NumEltsPerLane = 128 / EltSizeInBits;
46771 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
46772 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
46773 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
46774 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
46775 Idx &= (NumEltsPerLane - 1);
46776 }
46777 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
46778 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
46779 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
46780 DAG.getBitcast(VecVT, Vec),
46781 DAG.getVectorIdxConstant(Idx, dl));
46782 }
46783 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
46784 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
46785 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
46786 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
46787 DAG.getTargetConstant(Idx, dl, MVT::i8));
46788 }
46789 return SDValue();
46790 };
46791
46792 // Resolve the target shuffle inputs and mask.
46793 SmallVector<int, 16> Mask;
46794 SmallVector<SDValue, 2> Ops;
46795 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
46796 return SDValue();
46797
46798 // Shuffle inputs must be the same size as the result.
46799 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
46800 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
46801 }))
46802 return SDValue();
46803
46804 // Attempt to narrow/widen the shuffle mask to the correct size.
46805 if (Mask.size() != NumSrcElts) {
46806 if ((NumSrcElts % Mask.size()) == 0) {
46807 SmallVector<int, 16> ScaledMask;
46808 int Scale = NumSrcElts / Mask.size();
46809 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
46810 Mask = std::move(ScaledMask);
46811 } else if ((Mask.size() % NumSrcElts) == 0) {
46812 // Simplify Mask based on demanded element.
46813 int ExtractIdx = (int)IdxC.getZExtValue();
46814 int Scale = Mask.size() / NumSrcElts;
46815 int Lo = Scale * ExtractIdx;
46816 int Hi = Scale * (ExtractIdx + 1);
46817 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
46818 if (i < Lo || Hi <= i)
46819 Mask[i] = SM_SentinelUndef;
46820
46821 SmallVector<int, 16> WidenedMask;
46822 while (Mask.size() > NumSrcElts &&
46823 canWidenShuffleElements(Mask, WidenedMask))
46824 Mask = std::move(WidenedMask);
46825 }
46826 }
46827
46828 // If narrowing/widening failed, see if we can extract+zero-extend.
46829 int ExtractIdx;
46830 EVT ExtractVT;
46831 if (Mask.size() == NumSrcElts) {
46832 ExtractIdx = Mask[IdxC.getZExtValue()];
46833 ExtractVT = SrcVT;
46834 } else {
46835 unsigned Scale = Mask.size() / NumSrcElts;
46836 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
46837 return SDValue();
46838 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
46839 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
46840 return SDValue();
46841 ExtractIdx = Mask[ScaledIdx];
46842 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
46843 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
46844 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
46845 "Failed to widen vector type");
46846 }
46847
46848 // If the shuffle source element is undef/zero then we can just accept it.
46849 if (ExtractIdx == SM_SentinelUndef)
46850 return DAG.getUNDEF(VT);
46851
46852 if (ExtractIdx == SM_SentinelZero)
46853 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
46854 : DAG.getConstant(0, dl, VT);
46855
46856 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
46857 ExtractIdx = ExtractIdx % Mask.size();
46858 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
46859 return DAG.getZExtOrTrunc(V, dl, VT);
46860
46861 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
46862 if (SDValue V = combineExtractFromVectorLoad(
46863 N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
46864 return V;
46865
46866 return SDValue();
46867}
46868
46869/// Extracting a scalar FP value from vector element 0 is free, so extract each
46870/// operand first, then perform the math as a scalar op.
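/// For example, (f32 (extract_vector_elt (fadd %x, %y), 0)) becomes
/// (fadd (extract_vector_elt %x, 0), (extract_vector_elt %y, 0)), which
/// selects to a scalar ADDSS instead of a full-width ADDPS.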
46871 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
46872 const X86Subtarget &Subtarget,
46873 TargetLowering::DAGCombinerInfo &DCI) {
46874 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
46875 SDValue Vec = ExtElt->getOperand(0);
46876 SDValue Index = ExtElt->getOperand(1);
46877 EVT VT = ExtElt->getValueType(0);
46878 EVT VecVT = Vec.getValueType();
46879
46880 // TODO: If this is a unary/expensive/expand op, allow extraction from a
46881 // non-zero element because the shuffle+scalar op will be cheaper?
46882 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
46883 return SDValue();
46884
46885 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
46886 // extract, the condition code), so deal with those as a special case.
46887 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
46888 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
46889 if (OpVT != MVT::f32 && OpVT != MVT::f64)
46890 return SDValue();
46891
46892 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
46893 SDLoc DL(ExtElt);
46894 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46895 Vec.getOperand(0), Index);
46896 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46897 Vec.getOperand(1), Index);
46898 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
46899 }
46900
46901 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
46902 VT != MVT::f64)
46903 return SDValue();
46904
46905 // Vector FP selects don't fit the pattern of FP math ops (because the
46906 // condition has a different type and we have to change the opcode), so deal
46907 // with those here.
46908 // FIXME: This is restricted to pre type legalization. If we loosen this we
46909 // need to convert vector bool to a scalar bool.
46910 if (DCI.isBeforeLegalize() && Vec.getOpcode() == ISD::VSELECT &&
46911 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
46912 Vec.getOperand(0).getOperand(0).getValueType() == VecVT &&
46913 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1) {
46914 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
46915 SDLoc DL(ExtElt);
46916 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
46917 Vec.getOperand(0).getValueType().getScalarType(),
46918 Vec.getOperand(0), Index);
46919 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46920 Vec.getOperand(1), Index);
46921 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46922 Vec.getOperand(2), Index);
46923 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
46924 }
46925
46926 // TODO: This switch could include FNEG and the x86-specific FP logic ops
46927 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
46928 // missed load folding and fma+fneg combining.
46929 switch (Vec.getOpcode()) {
46930 case ISD::FMA: // Begin 3 operands
46931 case ISD::FMAD:
46932 case ISD::FADD: // Begin 2 operands
46933 case ISD::FSUB:
46934 case ISD::FMUL:
46935 case ISD::FDIV:
46936 case ISD::FREM:
46937 case ISD::FCOPYSIGN:
46938 case ISD::FMINNUM:
46939 case ISD::FMAXNUM:
46940 case ISD::FMINNUM_IEEE:
46941 case ISD::FMAXNUM_IEEE:
46942 case ISD::FMAXIMUM:
46943 case ISD::FMINIMUM:
46944 case ISD::FMAXIMUMNUM:
46945 case ISD::FMINIMUMNUM:
46946 case X86ISD::FMAX:
46947 case X86ISD::FMIN:
46948 case ISD::FABS: // Begin 1 operand
46949 case ISD::FSQRT:
46950 case ISD::FRINT:
46951 case ISD::FCEIL:
46952 case ISD::FTRUNC:
46953 case ISD::FNEARBYINT:
46954 case ISD::FROUNDEVEN:
46955 case ISD::FROUND:
46956 case ISD::FFLOOR:
46957 case X86ISD::FRCP:
46958 case X86ISD::FRSQRT: {
46959 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46960 SDLoc DL(ExtElt);
46961 SmallVector<SDValue, 4> ExtOps;
46962 for (SDValue Op : Vec->ops())
46963 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
46964 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
46965 }
46966 default:
46967 return SDValue();
46968 }
46969 llvm_unreachable("All opcodes should return within switch");
46970}
46971
46972/// Try to convert a vector reduction sequence composed of binops and shuffles
46973/// into horizontal ops.
46974 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
46975 const X86Subtarget &Subtarget) {
46976 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
46977
46979 // We need at least SSE2 to do anything here.
46979 if (!Subtarget.hasSSE2())
46980 return SDValue();
46981
46982 ISD::NodeType Opc;
46983 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
46984 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
46985 if (!Rdx)
46986 return SDValue();
46987
46988 SDValue Index = ExtElt->getOperand(1);
46989 assert(isNullConstant(Index) &&
46990 "Reduction doesn't end in an extract from index 0");
46991
46992 EVT VT = ExtElt->getValueType(0);
46993 EVT VecVT = Rdx.getValueType();
46994 if (VecVT.getScalarType() != VT)
46995 return SDValue();
46996
46997 SDLoc DL(ExtElt);
46998 unsigned NumElts = VecVT.getVectorNumElements();
46999 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
47000
47001 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
47002 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
47003 if (V.getValueType() == MVT::v4i8) {
47004 if (ZeroExtend && Subtarget.hasSSE41()) {
47005 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
47006 DAG.getConstant(0, DL, MVT::v4i32),
47007 DAG.getBitcast(MVT::i32, V),
47008 DAG.getVectorIdxConstant(0, DL));
47009 return DAG.getBitcast(MVT::v16i8, V);
47010 }
47011 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
47012 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
47013 : DAG.getUNDEF(MVT::v4i8));
47014 }
47015 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
47016 DAG.getUNDEF(MVT::v8i8));
47017 };
47018
47019 // vXi8 mul reduction - promote to vXi16 mul reduction.
47020 if (Opc == ISD::MUL) {
47021 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
47022 return SDValue();
47023 if (VecVT.getSizeInBits() >= 128) {
47024 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
47025 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47026 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47027 Lo = DAG.getBitcast(WideVT, Lo);
47028 Hi = DAG.getBitcast(WideVT, Hi);
47029 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
47030 while (Rdx.getValueSizeInBits() > 128) {
47031 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47032 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
47033 }
47034 } else {
47035 Rdx = WidenToV16I8(Rdx, false);
47036 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
47037 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
47038 }
47039 if (NumElts >= 8)
47040 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47041 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47042 {4, 5, 6, 7, -1, -1, -1, -1}));
47043 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47044 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47045 {2, 3, -1, -1, -1, -1, -1, -1}));
47046 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47047 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47048 {1, -1, -1, -1, -1, -1, -1, -1}));
47049 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47050 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47051 }
47052
47053 // vXi8 add reduction - sub-128-bit vector.
47054 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
47055 Rdx = WidenToV16I8(Rdx, true);
47056 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47057 DAG.getConstant(0, DL, MVT::v16i8));
47058 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47059 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47060 }
47061
47062 // Must be a >=128-bit vector with pow2 elements.
47063 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
47064 return SDValue();
47065
47066 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
47067 if (VT == MVT::i8) {
47068 while (Rdx.getValueSizeInBits() > 128) {
47069 SDValue Lo, Hi;
47070 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47071 VecVT = Lo.getValueType();
47072 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47073 }
47074 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
47075
47076 SDValue Hi = DAG.getVectorShuffle(
47077 MVT::v16i8, DL, Rdx, Rdx,
47078 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
47079 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
47080 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47081 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
47082 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47083 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47084 }
47085
47086 // See if we can use vXi8 PSADBW add reduction for larger zext types.
47087 // If the source vector values are 0-255, then we can use PSADBW to
47088 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
47089 // TODO: See if it's worth avoiding vXi16/i32 truncations?
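// E.g. a v8i16 add reduction whose elements are known to be at most 255 can
// be packed down to v16i8 (PACKUS) and summed with a single PSADBW against
// zero, leaving just an extract of the low lane.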
47090 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
47091 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
47092 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
47093 Subtarget.hasAVX512())) {
47094 if (Rdx.getValueType() == MVT::v8i16) {
47095 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
47096 DAG.getUNDEF(MVT::v8i16));
47097 } else {
47098 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
47099 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
47100 if (ByteVT.getSizeInBits() < 128)
47101 Rdx = WidenToV16I8(Rdx, true);
47102 }
47103
47104 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
47105 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47106 ArrayRef<SDValue> Ops) {
47107 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
47108 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
47109 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
47110 };
47111 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
47112 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
47113
47114 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
47115 while (Rdx.getValueSizeInBits() > 128) {
47116 SDValue Lo, Hi;
47117 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47118 VecVT = Lo.getValueType();
47119 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47120 }
47121 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
47122
47123 if (NumElts > 8) {
47124 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
47125 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
47126 }
47127
47128 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
47129 Rdx = DAG.getBitcast(VecVT, Rdx);
47130 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47131 }
47132
47133 // Only use (F)HADD opcodes if they aren't microcoded or we are minimizing code size.
47134 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
47135 return SDValue();
47136
47137 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
47138
47139 // 256-bit horizontal instructions operate on 128-bit chunks rather than
47140 // across the whole vector, so we need an extract + hop preliminary stage.
47141 // This is the only step where the operands of the hop are not the same value.
47142 // TODO: We could extend this to handle 512-bit or even longer vectors.
47143 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
47144 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
47145 unsigned NumElts = VecVT.getVectorNumElements();
47146 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
47147 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
47148 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
47149 VecVT = Rdx.getValueType();
47150 }
47151 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
47152 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
47153 return SDValue();
47154
47155 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
47156 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
47157 for (unsigned i = 0; i != ReductionSteps; ++i)
47158 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
47159
47160 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47161}
47162
47163/// Detect vector gather/scatter index generation and convert it from being a
47164/// bunch of shuffles and extracts into a somewhat faster sequence.
47165/// For i686, the best sequence is apparently storing the value and loading
47166/// scalars back, while for x64 we should use 64-bit extracts and shifts.
47167 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
47168 TargetLowering::DAGCombinerInfo &DCI,
47169 const X86Subtarget &Subtarget) {
47170 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
47171 return NewOp;
47172
47173 SDValue InputVector = N->getOperand(0);
47174 SDValue EltIdx = N->getOperand(1);
47175 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
47176
47177 EVT SrcVT = InputVector.getValueType();
47178 EVT VT = N->getValueType(0);
47179 SDLoc dl(InputVector);
47180 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
47181 unsigned NumSrcElts = SrcVT.getVectorNumElements();
47182 unsigned NumEltBits = VT.getScalarSizeInBits();
47183 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47184
47185 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
47186 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47187
47188 // Integer Constant Folding.
47189 if (CIdx && VT.isInteger()) {
47190 APInt UndefVecElts;
47191 SmallVector<APInt, 16> EltBits;
47192 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
47193 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
47194 EltBits, /*AllowWholeUndefs*/ true,
47195 /*AllowPartialUndefs*/ false)) {
47196 uint64_t Idx = CIdx->getZExtValue();
47197 if (UndefVecElts[Idx])
47198 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47199 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
47200 }
47201
47202 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
47203 // Improves lowering of bool masks on Rust, which splits them into a byte array.
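// E.g. (i8 (extract_vector_elt (v2i8 (bitcast (v16i1 %k))), 1)) becomes
// (i8 (bitcast (v8i1 (extract_subvector %k, 8)))).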
47204 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
47205 SDValue Src = peekThroughBitcasts(InputVector);
47206 if (Src.getValueType().getScalarType() == MVT::i1 &&
47207 TLI.isTypeLegal(Src.getValueType())) {
47208 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
47209 SDValue Sub = DAG.getNode(
47210 ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
47211 DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl));
47212 return DAG.getBitcast(VT, Sub);
47213 }
47214 }
47215 }
47216
47217 if (IsPextr) {
47218 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
47219 DCI))
47220 return SDValue(N, 0);
47221
47222 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
47223 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
47224 InputVector.getOpcode() == X86ISD::PINSRW) &&
47225 InputVector.getOperand(2) == EltIdx) {
47226 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
47227 "Vector type mismatch");
47228 SDValue Scl = InputVector.getOperand(1);
47229 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
47230 return DAG.getZExtOrTrunc(Scl, dl, VT);
47231 }
47232
47233 // TODO - Remove this once we can handle the implicit zero-extension of
47234 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
47235 // combineBasicSADPattern.
47236 return SDValue();
47237 }
47238
47239 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
47240 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
47241 InputVector.getOpcode() == ISD::BITCAST &&
47242 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47243 isNullConstant(EltIdx) && InputVector.hasOneUse())
47244 return DAG.getBitcast(VT, InputVector);
47245
47246 // Detect mmx to i32 conversion through a v2i32 elt extract.
47247 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
47248 InputVector.getOpcode() == ISD::BITCAST &&
47249 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47250 isNullConstant(EltIdx) && InputVector.hasOneUse())
47251 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
47252 InputVector.getOperand(0));
47253
47254 // Check whether this extract is the root of a sum of absolute differences
47255 // pattern. This has to be done here because we really want it to happen
47256 // pre-legalization.
47257 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
47258 return SAD;
47259
47260 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
47261 return VPDPBUSD;
47262
47263 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
47264 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
47265 return Cmp;
47266
47267 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
47268 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
47269 return MinMax;
47270
47271 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
47272 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
47273 return V;
47274
47275 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
47276 return V;
47277
47278 if (CIdx)
47279 if (SDValue V = combineExtractFromVectorLoad(
47280 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
47281 dl, DAG, DCI))
47282 return V;
47283
47284 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
47285 // and then testing the relevant element.
47286 //
47287 // Note that we only combine extracts on the *same* result number, i.e.
47288 // t0 = merge_values a0, a1, a2, a3
47289 // i1 = extract_vector_elt t0, Constant:i64<2>
47290 // i1 = extract_vector_elt t0, Constant:i64<3>
47291 // but not
47292 // i1 = extract_vector_elt t0:1, Constant:i64<2>
47293 // since the latter would need its own MOVMSK.
47294 if (SrcVT.getScalarType() == MVT::i1) {
47295 bool IsVar = !CIdx;
47296 SmallVector<SDNode *, 16> BoolExtracts;
47297 unsigned ResNo = InputVector.getResNo();
47298 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
47299 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
47300 Use->getOperand(0).getResNo() == ResNo &&
47301 Use->getValueType(0) == MVT::i1) {
47302 BoolExtracts.push_back(Use);
47303 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
47304 return true;
47305 }
47306 return false;
47307 };
47308 // TODO: Can we drop the oneuse check for constant extracts?
47309 if (all_of(InputVector->users(), IsBoolExtract) &&
47310 (IsVar || BoolExtracts.size() > 1)) {
47311 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
47312 if (SDValue BC =
47313 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
47314 for (SDNode *Use : BoolExtracts) {
47315 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
47316 // Mask = 1 << MaskIdx
47317 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
47318 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
47319 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
47320 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
47321 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
47322 DCI.CombineTo(Use, Res);
47323 }
47324 return SDValue(N, 0);
47325 }
47326 }
47327 }
47328
47329 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
47330 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
47331 SDValue TruncSrc = InputVector.getOperand(0);
47332 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
47333 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
47334 SDValue NewExt =
47335 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
47336 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
47337 }
47338 }
47339
47340 return SDValue();
47341}
47342
47343// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
47344// This is more or less the reverse of combineBitcastvxi1.
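// E.g. (v8i16 (zext (v8i1 (bitcast (i8 %mask))))) is lowered by broadcasting
// %mask to every lane, AND-ing lane i with (1 << i), comparing against that
// bit and shifting the all-ones match down to give 0/1 per lane.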
47345 static SDValue combineToExtendBoolVectorInReg(
47346 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
47347 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
47348 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
47349 Opcode != ISD::ANY_EXTEND)
47350 return SDValue();
47351 if (!DCI.isBeforeLegalizeOps())
47352 return SDValue();
47353 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
47354 return SDValue();
47355
47356 EVT SVT = VT.getScalarType();
47357 EVT InSVT = N0.getValueType().getScalarType();
47358 unsigned EltSizeInBits = SVT.getSizeInBits();
47359
47360 // Input type must be extending a bool vector (bit-casted from a scalar
47361 // integer) to legal integer types.
47362 if (!VT.isVector())
47363 return SDValue();
47364 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
47365 return SDValue();
47366 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
47367 return SDValue();
47368
47369 SDValue N00 = N0.getOperand(0);
47370 EVT SclVT = N00.getValueType();
47371 if (!SclVT.isScalarInteger())
47372 return SDValue();
47373
47374 SDValue Vec;
47375 SmallVector<int> ShuffleMask;
47376 unsigned NumElts = VT.getVectorNumElements();
47377 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
47378
47379 // Broadcast the scalar integer to the vector elements.
47380 if (NumElts > EltSizeInBits) {
47381 // If the scalar integer is greater than the vector element size, then we
47382 // must split it down into sub-sections for broadcasting. For example:
47383 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
47384 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
47385 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
47386 unsigned Scale = NumElts / EltSizeInBits;
47387 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
47388 bool UseBroadcast = Subtarget.hasInt256() &&
47389 (!BroadcastVT.is128BitVector() || isa<LoadSDNode>(N00));
47390 Vec = UseBroadcast
47391 ? DAG.getSplat(BroadcastVT, DL, N00)
47392 : DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
47393 Vec = DAG.getBitcast(VT, Vec);
47394
47395 for (unsigned i = 0; i != Scale; ++i) {
47396 int Offset = UseBroadcast ? (i * EltSizeInBits) : 0;
47397 ShuffleMask.append(EltSizeInBits, i + Offset);
47398 }
47399 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
47400 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
47401 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
47402 // If we have register broadcast instructions, use the scalar size as the
47403 // element type for the shuffle. Then cast to the wider element type. The
47404 // widened bits won't be used, and this might allow the use of a broadcast
47405 // load.
47406 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
47407 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT,
47408 (NumElts * EltSizeInBits) / NumElts);
47409 Vec = DAG.getBitcast(VT, DAG.getSplat(BroadcastVT, DL, N00));
47410 } else {
47411 // For a smaller scalar integer, we can simply any-extend it to the vector
47412 // element size (we don't care about the upper bits) and broadcast it to all
47413 // elements.
47414 Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
47415 }
47416
47417 // Now, mask the relevant bit in each element.
47418 SmallVector<SDValue, 32> Bits;
47419 for (unsigned i = 0; i != NumElts; ++i) {
47420 int BitIdx = (i % EltSizeInBits);
47421 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
47422 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
47423 }
47424 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
47425 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
47426
47427 // Compare against the bitmask and extend the result.
47428 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
47429 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
47430 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
47431
47432 // For SEXT, this is now done, otherwise shift the result down for
47433 // zero-extension.
47434 if (Opcode == ISD::SIGN_EXTEND)
47435 return Vec;
47436 return DAG.getNode(ISD::SRL, DL, VT, Vec,
47437 DAG.getConstant(EltSizeInBits - 1, DL, VT));
47438}
47439
47440/// If both arms of a vector select are concatenated vectors, split the select,
47441/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
47442/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
47443/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
47444 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47445 const X86Subtarget &Subtarget) {
47446 unsigned Opcode = N->getOpcode();
47447 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
47448 return SDValue();
47449
47450 // TODO: Split 512-bit vectors too?
47451 EVT VT = N->getValueType(0);
47452 if (!VT.is256BitVector())
47453 return SDValue();
47454
47455 // TODO: Split as long as any 2 of the 3 operands are concatenated?
47456 SDValue Cond = N->getOperand(0);
47457 SDValue TVal = N->getOperand(1);
47458 SDValue FVal = N->getOperand(2);
47459 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
47460 !isFreeToSplitVector(TVal, DAG) || !isFreeToSplitVector(FVal, DAG))
47461 return SDValue();
47462
47463 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
47464 ArrayRef<SDValue> Ops) {
47465 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
47466 };
47467 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend,
47468 /*CheckBWI*/ false);
47469}
47470
47471 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG,
47472 const SDLoc &DL) {
47473 SDValue Cond = N->getOperand(0);
47474 SDValue LHS = N->getOperand(1);
47475 SDValue RHS = N->getOperand(2);
47476
47477 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
47478 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
47479 if (!TrueC || !FalseC)
47480 return SDValue();
47481
47482 // Don't do this for illegal integer types.
47483 EVT VT = N->getValueType(0);
47484 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
47485 return SDValue();
47486
47487 // We're going to use the condition bit in math or logic ops. We could allow
47488 // this with a wider condition value (post-legalization it becomes an i8),
47489 // but if nothing is creating selects that late, it doesn't matter.
47490 if (Cond.getValueType() != MVT::i1)
47491 return SDValue();
47492
47493 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
47494 // 3, 5, or 9 with i32/i64, so those get transformed too.
47495 // TODO: For constants that overflow or do not differ by power-of-2 or small
47496 // multiplier, convert to 'and' + 'add'.
47497 const APInt &TrueVal = TrueC->getAPIntValue();
47498 const APInt &FalseVal = FalseC->getAPIntValue();
47499
47500 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
47501 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
47502 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
47503 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47504 if (CC == ISD::SETEQ || CC == ISD::SETNE)
47505 return SDValue();
47506 }
47507
47508 bool OV;
47509 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
47510 if (OV)
47511 return SDValue();
47512
47513 APInt AbsDiff = Diff.abs();
47514 if (AbsDiff.isPowerOf2() ||
47515 ((VT == MVT::i32 || VT == MVT::i64) &&
47516 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
47517
47518 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
47519 // of the condition can usually be folded into a compare predicate, but even
47520 // without that, the sequence should be cheaper than a CMOV alternative.
47521 if (TrueVal.slt(FalseVal)) {
47522 Cond = DAG.getNOT(DL, Cond, MVT::i1);
47523 std::swap(TrueC, FalseC);
47524 }
47525
47526 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
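// E.g. select Cond, 23, 7 --> (zext(Cond) * 16) + 7, i.e. a shift and an add.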
47527 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
47528
47529 // Multiply condition by the difference if non-one.
47530 if (!AbsDiff.isOne())
47531 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
47532
47533 // Add the base if non-zero.
47534 if (!FalseC->isZero())
47535 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
47536
47537 return R;
47538 }
47539
47540 return SDValue();
47541}
47542
47543/// If this is a *dynamic* select (non-constant condition) and we can match
47544/// this node with one of the variable blend instructions, restructure the
47545/// condition so that blends can use the high (sign) bit of each element.
47546/// This function will also call SimplifyDemandedBits on already created
47547/// BLENDV to perform additional simplifications.
47548 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
47549 const SDLoc &DL,
47550 TargetLowering::DAGCombinerInfo &DCI,
47551 const X86Subtarget &Subtarget) {
47552 SDValue Cond = N->getOperand(0);
47553 if ((N->getOpcode() != ISD::VSELECT &&
47554 N->getOpcode() != X86ISD::BLENDV) ||
47555 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
47556 return SDValue();
47557
47558 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47559 unsigned BitWidth = Cond.getScalarValueSizeInBits();
47560 EVT VT = N->getValueType(0);
47561
47562 // We can only handle the cases where VSELECT is directly legal on the
47563 // subtarget. We custom lower VSELECT nodes with constant conditions and
47564 // this makes it hard to see whether a dynamic VSELECT will correctly
47565 // lower, so we both check the operation's status and explicitly handle the
47566 // cases where a *dynamic* blend will fail even though a constant-condition
47567 // blend could be custom lowered.
47568 // FIXME: We should find a better way to handle this class of problems.
47569 // Potentially, we should combine constant-condition vselect nodes
47570 // pre-legalization into shuffles and not mark as many types as custom
47571 // lowered.
47572 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
47573 return SDValue();
47574 // FIXME: We don't support i16-element blends currently. We could and
47575 // should support them by making *all* the bits in the condition be set
47576 // rather than just the high bit and using an i8-element blend.
47577 if (VT.getVectorElementType() == MVT::i16)
47578 return SDValue();
47579 // Dynamic blending was only available from SSE4.1 onward.
47580 if (VT.is128BitVector() && !Subtarget.hasSSE41())
47581 return SDValue();
47582 // Byte blends are only available in AVX2.
47583 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
47584 return SDValue();
47585 // There are no 512-bit blend instructions that use sign bits.
47586 if (VT.is512BitVector())
47587 return SDValue();
47588
47589 // Don't optimize before the condition has been transformed to a legal type
47590 // and don't ever optimize vector selects that map to AVX512 mask-registers.
47591 if (BitWidth < 8 || BitWidth > 64)
47592 return SDValue();
47593
47594 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
47595 for (SDUse &Use : Cond->uses())
47596 if ((Use.getUser()->getOpcode() != ISD::VSELECT &&
47597 Use.getUser()->getOpcode() != X86ISD::BLENDV) ||
47598 Use.getOperandNo() != 0)
47599 return false;
47600
47601 return true;
47602 };
47603
47604 APInt DemandedBits(APInt::getSignMask(BitWidth));
47605 
47606 if (OnlyUsedAsSelectCond(Cond)) {
47607 KnownBits Known;
47608 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
47609 !DCI.isBeforeLegalizeOps());
47610 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
47611 return SDValue();
47612
47613 // If we changed the computation somewhere in the DAG, this change will
47614 // affect all users of Cond. Update all the nodes so that we do not use
47615 // the generic VSELECT anymore. Otherwise, we may perform wrong
47616 // optimizations as we messed with the actual expectation for the vector
47617 // boolean values.
47618 for (SDNode *U : Cond->users()) {
47619 if (U->getOpcode() == X86ISD::BLENDV)
47620 continue;
47621
47622 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
47623 Cond, U->getOperand(1), U->getOperand(2));
47624 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
47625 DCI.AddToWorklist(U);
47626 }
47627 DCI.CommitTargetLoweringOpt(TLO);
47628 return SDValue(N, 0);
47629 }
47630
47631 // Otherwise we can still at least try to simplify multiple use bits.
47632 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
47633 return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
47634 N->getOperand(1), N->getOperand(2));
47635
47636 return SDValue();
47637}
47638
47639// Try to match:
47640// (or (and (M, (sub 0, X)), (pandn M, X)))
47641// which is a special case of:
47642// (select M, (sub 0, X), X)
47643// Per:
47644// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
47645// We know that, if fNegate is 0 or 1:
47646// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
47647//
47648// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
47649// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
47650// ( M ? -X : X) == ((X ^ M ) + (M & 1))
47651// This lets us transform our vselect to:
47652// (add (xor X, M), (and M, 1))
47653// And further to:
47654// (sub (xor X, M), M)
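// E.g. with X == 5: if M is all-ones, (5 ^ M) - M == -6 - (-1) == -5, while
// with M == 0, (5 ^ 0) - 0 == 5.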
47655 static SDValue combineLogicBlendIntoConditionalNegate(
47656 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
47657 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
47658 using namespace SDPatternMatch;
47659 EVT MaskVT = Mask.getValueType();
47660 assert(MaskVT.isInteger() &&
47661 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
47662 "Mask must be zero/all-bits");
47663
47664 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT ||
47665 !DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
47666 return SDValue();
47667
47668 SDValue V;
47669 if (!sd_match(Y, m_Neg(m_AllOf(m_Specific(X), m_Value(V)))) &&
47670 !sd_match(X, m_Neg(m_AllOf(m_Specific(Y), m_Value(V)))))
47671 return SDValue();
47672
47673 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
47674 SDValue SubOp2 = Mask;
47675
47676 // If the negate was on the false side of the select, then
47677 // the operands of the SUB need to be swapped. PR 27251.
47678 // This is because the pattern being matched above is
47679 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
47680 // but if the pattern matched was
47681 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
47682 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
47683 // pattern also needs to be a negation of the replacement pattern above.
47684 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
47685 // sub accomplishes the negation of the replacement pattern.
47686 if (V == Y)
47687 std::swap(SubOp1, SubOp2);
47688
47689 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
47690 return DAG.getBitcast(VT, Res);
47691}
47692
47693 static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47694 const X86Subtarget &Subtarget) {
47695 using namespace SDPatternMatch;
47696 if (!Subtarget.hasAVX512())
47697 return SDValue();
47698
47699 ISD::CondCode CC;
47700 SDValue Cond, X, Y, LHS, RHS;
47701 if (!sd_match(N, m_VSelect(m_OneUse(m_AllOf(m_Value(Cond),
47702 m_SetCC(m_Value(X), m_Value(Y),
47703 m_CondCode(CC)))),
47704 m_Value(LHS), m_Value(RHS))))
47705 return SDValue();
47706
47707 if (canCombineAsMaskOperation(LHS, Subtarget) ||
47708 !canCombineAsMaskOperation(RHS, Subtarget))
47709 return SDValue();
47710
47711 // Commute LHS and RHS to create an opportunity to select a mask instruction.
47712 // (vselect M, L, R) -> (vselect ~M, R, L)
47713 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, X.getValueType());
47714 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), X, Y, NewCC);
47715 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
47716}
47717
47718/// Do target-specific dag combines on SELECT and VSELECT nodes.
47719 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
47720 TargetLowering::DAGCombinerInfo &DCI,
47721 const X86Subtarget &Subtarget) {
47722 SDLoc DL(N);
47723 SDValue Cond = N->getOperand(0);
47724 SDValue LHS = N->getOperand(1);
47725 SDValue RHS = N->getOperand(2);
47726
47727 // Try simplification again because we use this function to optimize
47728 // BLENDV nodes that are not handled by the generic combiner.
47729 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
47730 return V;
47731
47732 // When AVX512 is available, the LHS operand of a select instruction can be
47733 // folded with a mask instruction, while the RHS operand can't. Commute the
47734 // LHS and RHS of the select instruction to create the opportunity for
47735 // folding.
47736 if (SDValue V = commuteSelect(N, DAG, DL, Subtarget))
47737 return V;
47738
47739 EVT VT = LHS.getValueType();
47740 EVT CondVT = Cond.getValueType();
47741 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47742 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
47743
47744 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
47745 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
47746 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
47747 if (CondVT.isVector() && CondVT.isInteger() &&
47748 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
47749 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
47750 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
47751 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
47752 DL, DAG, Subtarget))
47753 return V;
47754
47755 if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
47756 SmallVector<int, 64> CondMask;
47757 if (createShuffleMaskFromVSELECT(CondMask, Cond,
47758 N->getOpcode() == X86ISD::BLENDV)) {
47759 // Convert vselects with constant condition into shuffles.
47760 if (DCI.isBeforeLegalizeOps())
47761 return DAG.getVectorShuffle(VT, DL, LHS, RHS, CondMask);
47762
47763 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
47764 // by forcing the unselected elements to zero.
47765 // TODO: Can we handle more shuffles with this?
47766 if (LHS.hasOneUse() && RHS.hasOneUse()) {
47767 SmallVector<SDValue, 1> LHSOps, RHSOps;
47768 SmallVector<int, 64> LHSMask, RHSMask, ByteMask;
47769 SDValue LHSShuf = peekThroughOneUseBitcasts(LHS);
47770 SDValue RHSShuf = peekThroughOneUseBitcasts(RHS);
47771 if (LHSShuf.getOpcode() == X86ISD::PSHUFB &&
47772 RHSShuf.getOpcode() == X86ISD::PSHUFB &&
47773 scaleShuffleMaskElts(VT.getSizeInBits() / 8, CondMask, ByteMask) &&
47774 getTargetShuffleMask(LHSShuf, true, LHSOps, LHSMask) &&
47775 getTargetShuffleMask(RHSShuf, true, RHSOps, RHSMask)) {
47776 assert(ByteMask.size() == LHSMask.size() &&
47777 ByteMask.size() == RHSMask.size() && "Shuffle mask mismatch");
47778 for (auto [I, M] : enumerate(ByteMask)) {
47779 // getConstVector sets negative shuffle mask values as undef, so
47780 // ensure we hardcode SM_SentinelZero values to zero (0x80).
47781 if (M < (int)ByteMask.size()) {
47782 LHSMask[I] = isUndefOrZero(LHSMask[I]) ? 0x80 : LHSMask[I];
47783 RHSMask[I] = 0x80;
47784 } else {
47785 LHSMask[I] = 0x80;
47786 RHSMask[I] = isUndefOrZero(RHSMask[I]) ? 0x80 : RHSMask[I];
47787 }
47788 }
47789 MVT ByteVT = LHSShuf.getSimpleValueType();
47790 LHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, LHSOps[0],
47791 getConstVector(LHSMask, ByteVT, DAG, DL, true));
47792 RHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, RHSOps[0],
47793 getConstVector(RHSMask, ByteVT, DAG, DL, true));
47794 return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, ByteVT, LHS, RHS));
47795 }
47796 }
47797
47798 // Attempt to combine as shuffle.
47799 SDValue Op(N, 0);
47800 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47801 return Res;
47802 }
47803 }
47804
47805 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
47806 // instructions match the semantics of the common C idiom x<y?x:y but not
47807 // x<=y?x:y, because of how they handle negative zero (which can be
47808 // ignored in unsafe-math mode).
47809 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
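// e.g. for x<=y?x:y (SETOLE) with x = +0.0 and y = -0.0 the select yields
// +0.0, but the x86 MIN operation returns its second operand for a pair of
// zeros, giving -0.0, so the fold is only safe when signed zeros are ignorable.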
47810 if ((Cond.getOpcode() == ISD::SETCC ||
47811 Cond.getOpcode() == ISD::STRICT_FSETCCS) &&
47812 VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 &&
47813 !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
47814 ((VT != MVT::v8f16 && VT != MVT::v16f16) || Subtarget.hasVLX()) &&
47815 (Subtarget.hasSSE2() ||
47816 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
47817 bool IsStrict = Cond->isStrictFPOpcode();
47818 ISD::CondCode CC =
47819 cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
47820 SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0);
47821 SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1);
47822
47823 unsigned Opcode = 0;
47824 // Check for x CC y ? x : y.
47825 if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
47826 switch (CC) {
47827 default: break;
47828 case ISD::SETULT:
47829 // Converting this to a min would handle NaNs incorrectly, and swapping
47830 // the operands would cause it to handle comparisons between positive
47831 // and negative zero incorrectly.
47832 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47833 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47834 !(DAG.isKnownNeverZeroFloat(LHS) ||
47835 DAG.isKnownNeverZeroFloat(RHS)))
47836 break;
47837 std::swap(LHS, RHS);
47838 }
47839 Opcode = X86ISD::FMIN;
47840 break;
47841 case ISD::SETOLE:
47842 // Converting this to a min would handle comparisons between positive
47843 // and negative zero incorrectly.
47844 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47845 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47846 break;
47847 Opcode = X86ISD::FMIN;
47848 break;
47849 case ISD::SETULE:
47850 // Converting this to a min would handle both negative zeros and NaNs
47851 // incorrectly, but we can swap the operands to fix both.
47852 std::swap(LHS, RHS);
47853 [[fallthrough]];
47854 case ISD::SETOLT:
47855 case ISD::SETLT:
47856 case ISD::SETLE:
47857 Opcode = X86ISD::FMIN;
47858 break;
47859
47860 case ISD::SETOGE:
47861 // Converting this to a max would handle comparisons between positive
47862 // and negative zero incorrectly.
47863 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47864 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47865 break;
47866 Opcode = X86ISD::FMAX;
47867 break;
47868 case ISD::SETUGT:
47869 // Converting this to a max would handle NaNs incorrectly, and swapping
47870 // the operands would cause it to handle comparisons between positive
47871 // and negative zero incorrectly.
47872 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47873 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47874 !(DAG.isKnownNeverZeroFloat(LHS) ||
47875 DAG.isKnownNeverZeroFloat(RHS)))
47876 break;
47877 std::swap(LHS, RHS);
47878 }
47879 Opcode = X86ISD::FMAX;
47880 break;
47881 case ISD::SETUGE:
47882 // Converting this to a max would handle both negative zeros and NaNs
47883 // incorrectly, but we can swap the operands to fix both.
47884 std::swap(LHS, RHS);
47885 [[fallthrough]];
47886 case ISD::SETOGT:
47887 case ISD::SETGT:
47888 case ISD::SETGE:
47889 Opcode = X86ISD::FMAX;
47890 break;
47891 }
47892 // Check for x CC y ? y : x -- a min/max with reversed arms.
47893 } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
47894 switch (CC) {
47895 default: break;
47896 case ISD::SETOGE:
47897 // Converting this to a min would handle comparisons between positive
47898 // and negative zero incorrectly, and swapping the operands would
47899 // cause it to handle NaNs incorrectly.
47900 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47901 !(DAG.isKnownNeverZeroFloat(LHS) ||
47902 DAG.isKnownNeverZeroFloat(RHS))) {
47903 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47904 break;
47905 std::swap(LHS, RHS);
47906 }
47907 Opcode = X86ISD::FMIN;
47908 break;
47909 case ISD::SETUGT:
47910 // Converting this to a min would handle NaNs incorrectly.
47911 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47912 break;
47913 Opcode = X86ISD::FMIN;
47914 break;
47915 case ISD::SETUGE:
47916 // Converting this to a min would handle both negative zeros and NaNs
47917 // incorrectly, but we can swap the operands to fix both.
47918 std::swap(LHS, RHS);
47919 [[fallthrough]];
47920 case ISD::SETOGT:
47921 case ISD::SETGT:
47922 case ISD::SETGE:
47923 Opcode = X86ISD::FMIN;
47924 break;
47925
47926 case ISD::SETULT:
47927 // Converting this to a max would handle NaNs incorrectly.
47928 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47929 break;
47930 Opcode = X86ISD::FMAX;
47931 break;
47932 case ISD::SETOLE:
47933 // Converting this to a max would handle comparisons between positive
47934 // and negative zero incorrectly, and swapping the operands would
47935 // cause it to handle NaNs incorrectly.
47936 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47937 !DAG.isKnownNeverZeroFloat(LHS) &&
47938 !DAG.isKnownNeverZeroFloat(RHS)) {
47939 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47940 break;
47941 std::swap(LHS, RHS);
47942 }
47943 Opcode = X86ISD::FMAX;
47944 break;
47945 case ISD::SETULE:
47946 // Converting this to a max would handle both negative zeros and NaNs
47947 // incorrectly, but we can swap the operands to fix both.
47948 std::swap(LHS, RHS);
47949 [[fallthrough]];
47950 case ISD::SETOLT:
47951 case ISD::SETLT:
47952 case ISD::SETLE:
47953 Opcode = X86ISD::FMAX;
47954 break;
47955 }
47956 }
47957
47958 if (Opcode) {
47959 if (IsStrict) {
47960 SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? X86ISD::STRICT_FMIN
47961 : X86ISD::STRICT_FMAX,
47962 DL, {N->getValueType(0), MVT::Other},
47963 {Cond.getOperand(0), LHS, RHS});
47964 DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1));
47965 return Ret;
47966 }
47967 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47968 }
47969 }
47970
47971 // Some mask scalar intrinsics rely on checking if only one bit is set
47972 // and implement it in C code like this:
47973 // A[0] = (U & 1) ? A[0] : W[0];
47974 // This creates some redundant instructions that break pattern matching.
47975 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
47976 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47977 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
47978 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47979 SDValue AndNode = Cond.getOperand(0);
47980 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
47981 isNullConstant(Cond.getOperand(1)) &&
47982 isOneConstant(AndNode.getOperand(1))) {
47983 // LHS and RHS swapped due to
47984 // setcc outputting 1 when AND resulted in 0 and vice versa.
47985 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
47986 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
47987 }
47988 }
47989
47990 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
47991 // lowering on KNL. In this case we convert it to
47992 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
47993 // The same situation applies to all vectors of i8 and i16 without BWI.
47994 // Make sure we extend these even before type legalization gets a chance to
47995 // split wide vectors.
47996 // Since SKX these selects have a proper lowering.
47997 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
47998 CondVT.getVectorElementType() == MVT::i1 &&
47999 (VT.getVectorElementType() == MVT::i8 ||
48000 VT.getVectorElementType() == MVT::i16)) {
48001 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
48002 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
48003 }
48004
48005 // AVX512 - Extend select to merge with target shuffle.
48006 // select(mask, extract_subvector(shuffle(x)), y) -->
48007 // extract_subvector(select(widen(mask), shuffle(x), widen(y)))
48008 // TODO - support non target shuffles as well with canCombineAsMaskOperation.
48009 if (Subtarget.hasAVX512() && CondVT.isVector() &&
48010 CondVT.getVectorElementType() == MVT::i1) {
48011 auto SelectableOp = [&TLI](SDValue Op, SDValue Alt) {
48012 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
48013 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
48014 isNullConstant(Op.getOperand(1)) &&
48015 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
48016 Op.hasOneUse() && Op.getOperand(0).hasOneUse() &&
48017 (Op.getOperand(0).getOpcode() != X86ISD::VPERMV3 ||
48018 ISD::isBuildVectorAllZeros(Alt.getNode()));
48019 };
48020
48021 bool SelectableLHS = SelectableOp(LHS, RHS);
48022 bool SelectableRHS = SelectableOp(RHS, LHS);
48023 if (SelectableLHS || SelectableRHS) {
48024 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
48025 : RHS.getOperand(0).getValueType();
48026 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
48027 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
48028 VT.getSizeInBits());
48029 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
48030 VT.getSizeInBits());
48031 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
48032 DAG.getUNDEF(SrcCondVT), Cond,
48033 DAG.getVectorIdxConstant(0, DL));
48034 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
48035 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
48036 }
48037 }
48038
48039 if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL))
48040 return V;
48041
48042 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
48043 Cond.hasOneUse()) {
48044 EVT CondVT = Cond.getValueType();
48045 SDValue Cond0 = Cond.getOperand(0);
48046 SDValue Cond1 = Cond.getOperand(1);
48047 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
48048
48049 // Canonicalize min/max:
48050 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
48051 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
48052 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
48053 // the need for an extra compare against zero. e.g.
48054 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
48055 // subl %esi, %edi
48056 // testl %edi, %edi
48057 // movl $0, %eax
48058 // cmovgl %edi, %eax
48059 // =>
48060 // xorl %eax, %eax
48061 // subl %esi, %edi
48062 // cmovsl %eax, %edi
48063 //
48064 // We can also canonicalize
48065 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
48066 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
48067 // This allows the use of a test instruction for the compare.
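// e.g. (x u> 1) and (x u>= 1) only differ at x == 1, where both select arms
// produce 1 anyway, and (x u>= 1) is exactly (x != 0), which a TEST can
// evaluate without materializing the constant 1 for a CMP.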
48068 if (LHS == Cond0 && RHS == Cond1) {
48069 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
48070 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
48071 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
48072 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48073 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48074 }
48075 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
48076 ISD::CondCode NewCC = ISD::SETUGE;
48077 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48078 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48079 }
48080 }
48081
48082 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
48083 // fold eq + gt/lt nested selects into ge/le selects
48084 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
48085 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
48086 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
48087 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
48088 // .. etc ..
48089 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
48090 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
48091 SDValue InnerSetCC = RHS.getOperand(0);
48092 ISD::CondCode InnerCC =
48093 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
48094 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
48095 Cond0 == InnerSetCC.getOperand(0) &&
48096 Cond1 == InnerSetCC.getOperand(1)) {
48097 ISD::CondCode NewCC;
48098 switch (CC == ISD::SETEQ ? InnerCC : CC) {
48099 // clang-format off
48100 case ISD::SETGT: NewCC = ISD::SETGE; break;
48101 case ISD::SETLT: NewCC = ISD::SETLE; break;
48102 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
48103 case ISD::SETULT: NewCC = ISD::SETULE; break;
48104 default: NewCC = ISD::SETCC_INVALID; break;
48105 // clang-format on
48106 }
48107 if (NewCC != ISD::SETCC_INVALID) {
48108 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
48109 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
48110 }
48111 }
48112 }
48113 }
48114
48115 // Check if the first operand is all zeros and Cond type is vXi1.
48116 // If this an avx512 target we can improve the use of zero masking by
48117 // swapping the operands and inverting the condition.
48118 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
48119 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
48120 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
48121 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
48122 // Invert the cond to not(cond) : xor(op,allones)=not(op)
48123 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
48124 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
48125 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
48126 }
48127
48128 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
48129 // get split by legalization.
48130 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
48131 CondVT.getVectorElementType() == MVT::i1 &&
48132 TLI.isTypeLegal(VT.getScalarType())) {
48133 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
48134 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
48135 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
48136 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
48137 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
48138 }
48139 }
48140
48141 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
48142 // with out-of-bounds clamping.
48143
48144 // Unlike the generic SHL/SRL nodes, whose result is undefined when the shift
48145 // amount reaches the element bitwidth, AVX2's VSHLV/VSRLV fully define that
48146 // case: any per-element shift amount of bitwidth or more produces zero, for
48147 // both the variable left shift and the variable unsigned right shift, which
48148 // makes the explicit out-of-bounds select below redundant.
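// e.g. for v4i32 X with Amt = <3, 31, 32, 40>, VSHLV produces
// <X[0]<<3, X[1]<<31, 0, 0>, which is exactly the value computed by the
// guarded select(icmp_ult(Amt, 32), shl(X, Amt), 0) pattern matched below.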
48149 if (N->getOpcode() == ISD::VSELECT) {
48150 using namespace llvm::SDPatternMatch;
48151 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
48152 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
48153 if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
48154 supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) &&
48156 sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)),
48159 return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48160 : X86ISD::VSHLV,
48161 DL, VT, LHS.getOperand(0), LHS.getOperand(1));
48162 }
48163 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
48164 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
48165 if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) &&
48166 supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) &&
48168 sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)),
48171 return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48172 : X86ISD::VSHLV,
48173 DL, VT, RHS.getOperand(0), RHS.getOperand(1));
48174 }
48175 }
48176
48177 // Early exit check
48178 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
48179 return SDValue();
48180
48181 if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget))
48182 return V;
48183
48184 if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget))
48185 return V;
48186
48187 // select(~Cond, X, Y) -> select(Cond, Y, X)
48188 if (CondVT.getScalarType() != MVT::i1) {
48189 if (SDValue CondNot = IsNOT(Cond, DAG))
48190 return DAG.getNode(N->getOpcode(), DL, VT,
48191 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
48192
48193 // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
48194 if (Cond.getOpcode() == X86ISD::PCMPEQ &&
48195 Cond.getOperand(0).getOpcode() == ISD::AND &&
48196 ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
48197 isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
48198 Cond.getScalarValueSizeInBits(),
48199 /*AllowUndefs=*/true) &&
48200 Cond.hasOneUse()) {
48201 Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
48202 Cond.getOperand(0).getOperand(1));
48203 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48204 }
48205
48206 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
48207 // signbit.
48208 if (Cond.getOpcode() == X86ISD::PCMPGT &&
48209 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
48210 Cond.hasOneUse()) {
48211 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
48212 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
48213 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48214 }
48215 }
48216
48217 // Try to optimize vXi1 selects if both operands are either all constants or
48218 // bitcasts from scalar integer type. In that case we can convert the operands
48219 // to integer and use an integer select which will be converted to a CMOV.
48220 // We need to take a little bit of care to avoid creating an i64 type after
48221 // type legalization.
48222 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
48223 VT.getVectorElementType() == MVT::i1 &&
48224 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
48225 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
48226 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
48227 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
48228 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
48229
48230 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
48231 LHS.getOperand(0).getValueType() == IntVT)) &&
48232 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
48233 RHS.getOperand(0).getValueType() == IntVT))) {
48234 if (LHSIsConst)
48235 LHS = combinevXi1ConstantToInteger(LHS, DAG);
48236 else
48237 LHS = LHS.getOperand(0);
48238
48239 if (RHSIsConst)
48240 RHS = combinevXi1ConstantToInteger(RHS, DAG);
48241 else
48242 RHS = RHS.getOperand(0);
48243
48244 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
48245 return DAG.getBitcast(VT, Select);
48246 }
48247 }
48248 }
48249
48250 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
48251 // single bits, then invert the predicate and swap the select operands.
48252 // This can lower using a vector shift bit-hack rather than mask and compare.
48253 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
48254 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
48255 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
48256 Cond.getOperand(0).getOpcode() == ISD::AND &&
48257 isNullOrNullSplat(Cond.getOperand(1)) &&
48258 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
48259 Cond.getOperand(0).getValueType() == VT) {
48260 // The 'and' mask must be composed of power-of-2 constants.
48261 SDValue And = Cond.getOperand(0);
48262 auto *C = isConstOrConstSplat(And.getOperand(1));
48263 if (C && C->getAPIntValue().isPowerOf2()) {
48264 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
48265 SDValue NotCond =
48266 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
48267 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
48268 }
48269
48270 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
48271 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
48272 // 16-bit lacks a proper blendv.
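// e.g. with i32 elements and a mask constant of 4 (bit 2), the element is
// shifted left by 31 - 2 = 29 so that bit 2 lands in the sign bit, turning
// the mask-and-compare into a single (shl X, 29) < 0 sign-bit test.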
48273 unsigned EltBitWidth = VT.getScalarSizeInBits();
48274 bool CanShiftBlend =
48275 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
48276 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
48277 (Subtarget.hasXOP()));
48278 if (CanShiftBlend &&
48279 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
48280 return C->getAPIntValue().isPowerOf2();
48281 })) {
48282 // Create a left-shift constant to get the mask bits over to the sign-bit.
48283 SDValue Mask = And.getOperand(1);
48284 SmallVector<int, 32> ShlVals;
48285 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
48286 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
48287 ShlVals.push_back(EltBitWidth - 1 -
48288 MaskVal->getAPIntValue().exactLogBase2());
48289 }
48290 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
48291 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
48292 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
48293 SDValue NewCond =
48294 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
48295 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
48296 }
48297 }
48298
48299 return SDValue();
48300}
48301
48302/// Combine:
48303/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
48304/// to:
48305/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
48306/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
48307/// Note that this is only legal for some op/cc combinations.
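/// e.g. "if (atomic_fetch_add(&x, 1) < 0)" tests COND_S on the old value;
/// since x < 0 implies x + 1 <= 0 (and signed overflow cannot occur for a
/// negative x), the branch can instead test COND_LE on the EFLAGS already
/// produced by the "lock add" itself.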
48308 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
48309 SelectionDAG &DAG,
48310 const X86Subtarget &Subtarget) {
48311 // This combine only operates on CMP-like nodes.
48312 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48313 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48314 return SDValue();
48315
48316 // Can't replace the cmp if it has more uses than the one we're looking at.
48317 // FIXME: We would like to be able to handle this, but would need to make sure
48318 // all uses were updated.
48319 if (!Cmp.hasOneUse())
48320 return SDValue();
48321
48322 // This only applies to variations of the common case:
48323 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
48324 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
48325 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
48326 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
48327 // Using the proper condcodes (see below), overflow is checked for.
48328
48329 // FIXME: We can generalize both constraints:
48330 // - XOR/OR/AND (if they were made to survive AtomicExpand)
48331 // - LHS != 1
48332 // if the result is compared.
48333
48334 SDValue CmpLHS = Cmp.getOperand(0);
48335 SDValue CmpRHS = Cmp.getOperand(1);
48336 EVT CmpVT = CmpLHS.getValueType();
48337
48338 if (!CmpLHS.hasOneUse())
48339 return SDValue();
48340
48341 unsigned Opc = CmpLHS.getOpcode();
48342 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
48343 return SDValue();
48344
48345 SDValue OpRHS = CmpLHS.getOperand(2);
48346 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
48347 if (!OpRHSC)
48348 return SDValue();
48349
48350 APInt Addend = OpRHSC->getAPIntValue();
48351 if (Opc == ISD::ATOMIC_LOAD_SUB)
48352 Addend = -Addend;
48353
48354 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
48355 if (!CmpRHSC)
48356 return SDValue();
48357
48358 APInt Comparison = CmpRHSC->getAPIntValue();
48359 APInt NegAddend = -Addend;
48360
48361 // See if we can adjust the CC to make the comparison match the negated
48362 // addend.
48363 if (Comparison != NegAddend) {
48364 APInt IncComparison = Comparison + 1;
48365 if (IncComparison == NegAddend) {
48366 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
48367 Comparison = IncComparison;
48368 CC = X86::COND_AE;
48369 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
48370 Comparison = IncComparison;
48371 CC = X86::COND_L;
48372 }
48373 }
48374 APInt DecComparison = Comparison - 1;
48375 if (DecComparison == NegAddend) {
48376 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
48377 Comparison = DecComparison;
48378 CC = X86::COND_A;
48379 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
48380 Comparison = DecComparison;
48381 CC = X86::COND_LE;
48382 }
48383 }
48384 }
48385
48386 // If the addend is the negation of the comparison value, then we can do
48387 // a full comparison by emitting the atomic arithmetic as a locked sub.
48388 if (Comparison == NegAddend) {
48389 // The CC is fine, but we need to rewrite the LHS of the comparison as an
48390 // atomic sub.
48391 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
48392 auto AtomicSub = DAG.getAtomic(
48393 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
48394 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
48395 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
48396 AN->getMemOperand());
48397 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
48398 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48399 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48400 return LockOp;
48401 }
48402
48403 // We can handle comparisons with zero in a number of cases by manipulating
48404 // the CC used.
48405 if (!Comparison.isZero())
48406 return SDValue();
48407
48408 if (CC == X86::COND_S && Addend == 1)
48409 CC = X86::COND_LE;
48410 else if (CC == X86::COND_NS && Addend == 1)
48411 CC = X86::COND_G;
48412 else if (CC == X86::COND_G && Addend == -1)
48413 CC = X86::COND_GE;
48414 else if (CC == X86::COND_LE && Addend == -1)
48415 CC = X86::COND_L;
48416 else
48417 return SDValue();
48418
48419 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
48420 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48421 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48422 return LockOp;
48423}
48424
48425// Check whether we're just testing the signbit, and whether we can simplify
48426// this by tracking where the signbit came from.
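// e.g. if the flags come from OR(X, Y) and Y is known to have a clear sign
// bit, only X can contribute the sign, so COND_S/COND_NS can be evaluated as
// a TEST of X against the sign mask with COND_NE/COND_E.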
48427 static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
48428 SelectionDAG &DAG) {
48429 if (CC != X86::COND_S && CC != X86::COND_NS)
48430 return SDValue();
48431
48432 if (!Cmp.hasOneUse())
48433 return SDValue();
48434
48435 SDValue Src;
48436 if (Cmp.getOpcode() == X86ISD::CMP) {
48437 // CMP(X,0) -> signbit test
48438 if (!isNullConstant(Cmp.getOperand(1)))
48439 return SDValue();
48440 Src = Cmp.getOperand(0);
48441 // Peek through a SRA node as we just need the signbit.
48442 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
48443 // TODO: Use SimplifyDemandedBits instead of just SRA?
48444 if (Src.getOpcode() != ISD::SRA || !Src.hasOneUse())
48445 return SDValue();
48446 Src = Src.getOperand(0);
48447 } else if (Cmp.getOpcode() == X86ISD::OR) {
48448 // OR(X,Y) -> see if only one operand contributes to the signbit.
48449 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
48450 if (DAG.SignBitIsZero(Cmp.getOperand(0)))
48451 Src = Cmp.getOperand(1);
48452 else if (DAG.SignBitIsZero(Cmp.getOperand(1)))
48453 Src = Cmp.getOperand(0);
48454 else
48455 return SDValue();
48456 } else {
48457 return SDValue();
48458 }
48459
48460 // Replace with a TEST on the MSB.
48461 SDLoc DL(Cmp);
48462 MVT SrcVT = Src.getSimpleValueType();
48463 APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits());
48464
48465 // If Src came from a SIGN_EXTEND_INREG or SHL (probably from an expanded
48466 // SIGN_EXTEND_INREG), then peek through and adjust the TEST bit.
48467 if (Src.getOpcode() == ISD::SHL) {
48468 if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) {
48469 Src = Src.getOperand(0);
48470 BitMask.lshrInPlace(*ShiftAmt);
48471 }
48472 } else if (Src.getOpcode() == ISD::SIGN_EXTEND_INREG) {
48473 EVT ExtVT = cast<VTSDNode>(Src.getOperand(1))->getVT();
48474 Src = Src.getOperand(0);
48475 BitMask.lshrInPlace(BitMask.getBitWidth() - ExtVT.getScalarSizeInBits());
48476 }
48477
48478 SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src,
48479 DAG.getConstant(BitMask, DL, SrcVT));
48480 CC = CC == X86::COND_S ? X86::COND_NE : X86::COND_E;
48481 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Mask,
48482 DAG.getConstant(0, DL, SrcVT));
48483}
48484
48485// Check whether a boolean test is testing a boolean value generated by
48486// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
48487// code.
48488//
48489// Simplify the following patterns:
48490// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
48491// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
48492// to (Op EFLAGS Cond)
48493//
48494// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
48495// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
48496// to (Op EFLAGS !Cond)
48497//
48498// where Op could be BRCOND or CMOV.
48499//
48500 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
48501 // This combine only operates on CMP-like nodes.
48502 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48503 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48504 return SDValue();
48505
48506 // Quit if not used as a boolean value.
48507 if (CC != X86::COND_E && CC != X86::COND_NE)
48508 return SDValue();
48509
48510 // Check CMP operands. One of them should be 0 or 1 and the other should be
48511 // a SetCC or extended from it.
48512 SDValue Op1 = Cmp.getOperand(0);
48513 SDValue Op2 = Cmp.getOperand(1);
48514
48515 SDValue SetCC;
48516 const ConstantSDNode* C = nullptr;
48517 bool needOppositeCond = (CC == X86::COND_E);
48518 bool checkAgainstTrue = false; // Is it a comparison against 1?
48519
48520 if ((C = dyn_cast<ConstantSDNode>(Op1)))
48521 SetCC = Op2;
48522 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
48523 SetCC = Op1;
48524 else // Quit if neither operand is a constant.
48525 return SDValue();
48526
48527 if (C->getZExtValue() == 1) {
48528 needOppositeCond = !needOppositeCond;
48529 checkAgainstTrue = true;
48530 } else if (C->getZExtValue() != 0)
48531 // Quit if the constant is neither 0 nor 1.
48532 return SDValue();
48533
48534 bool truncatedToBoolWithAnd = false;
48535 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
48536 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
48537 SetCC.getOpcode() == ISD::TRUNCATE ||
48538 SetCC.getOpcode() == ISD::AND) {
48539 if (SetCC.getOpcode() == ISD::AND) {
48540 int OpIdx = -1;
48541 if (isOneConstant(SetCC.getOperand(0)))
48542 OpIdx = 1;
48543 if (isOneConstant(SetCC.getOperand(1)))
48544 OpIdx = 0;
48545 if (OpIdx < 0)
48546 break;
48547 SetCC = SetCC.getOperand(OpIdx);
48548 truncatedToBoolWithAnd = true;
48549 } else
48550 SetCC = SetCC.getOperand(0);
48551 }
48552
48553 switch (SetCC.getOpcode()) {
48554 case X86ISD::SETCC_CARRY:
48555 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
48556 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
48557 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
48558 // truncated to i1 using 'and'.
48559 if (checkAgainstTrue && !truncatedToBoolWithAnd)
48560 break;
48562 "Invalid use of SETCC_CARRY!");
48563 [[fallthrough]];
48564 case X86ISD::SETCC:
48565 // Set the condition code or opposite one if necessary.
48566 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
48567 if (needOppositeCond)
48568 CC = X86::GetOppositeBranchCondition(CC);
48569 return SetCC.getOperand(1);
48570 case X86ISD::CMOV: {
48571 // Check whether false/true value has canonical one, i.e. 0 or 1.
48572 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
48573 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
48574 // Quit if true value is not a constant.
48575 if (!TVal)
48576 return SDValue();
48577 // Quit if false value is not a constant.
48578 if (!FVal) {
48579 SDValue Op = SetCC.getOperand(0);
48580 // Skip 'zext' or 'trunc' node.
48581 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
48582 Op.getOpcode() == ISD::TRUNCATE)
48583 Op = Op.getOperand(0);
48584 // A special case for rdrand/rdseed, where 0 is set if false cond is
48585 // found.
48586 if ((Op.getOpcode() != X86ISD::RDRAND &&
48587 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
48588 return SDValue();
48589 }
48590 // Quit if false value is not the constant 0 or 1.
48591 bool FValIsFalse = true;
48592 if (FVal && FVal->getZExtValue() != 0) {
48593 if (FVal->getZExtValue() != 1)
48594 return SDValue();
48595 // If FVal is 1, opposite cond is needed.
48596 needOppositeCond = !needOppositeCond;
48597 FValIsFalse = false;
48598 }
48599 // Quit if TVal is not the constant opposite of FVal.
48600 if (FValIsFalse && TVal->getZExtValue() != 1)
48601 return SDValue();
48602 if (!FValIsFalse && TVal->getZExtValue() != 0)
48603 return SDValue();
48604 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
48605 if (needOppositeCond)
48606 CC = X86::GetOppositeBranchCondition(CC);
48607 return SetCC.getOperand(3);
48608 }
48609 }
48610
48611 return SDValue();
48612}
48613
48614/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
48615/// Match:
48616/// (X86or (X86setcc) (X86setcc))
48617/// (X86cmp (and (X86setcc) (X86setcc)), 0)
48618 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
48619 X86::CondCode &CC1, SDValue &Flags,
48620 bool &isAnd) {
48621 if (Cond->getOpcode() == X86ISD::CMP) {
48622 if (!isNullConstant(Cond->getOperand(1)))
48623 return false;
48624
48625 Cond = Cond->getOperand(0);
48626 }
48627
48628 isAnd = false;
48629
48630 SDValue SetCC0, SetCC1;
48631 switch (Cond->getOpcode()) {
48632 default: return false;
48633 case ISD::AND:
48634 case X86ISD::AND:
48635 isAnd = true;
48636 [[fallthrough]];
48637 case ISD::OR:
48638 case X86ISD::OR:
48639 SetCC0 = Cond->getOperand(0);
48640 SetCC1 = Cond->getOperand(1);
48641 break;
48642 };
48643
48644 // Make sure we have SETCC nodes, using the same flags value.
48645 if (SetCC0.getOpcode() != X86ISD::SETCC ||
48646 SetCC1.getOpcode() != X86ISD::SETCC ||
48647 SetCC0->getOperand(1) != SetCC1->getOperand(1))
48648 return false;
48649
48650 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
48651 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
48652 Flags = SetCC0->getOperand(1);
48653 return true;
48654}
48655
48656// When legalizing carry, we create carries via add X, -1
48657// If that comes from an actual carry, via setcc, we use the
48658// carry directly.
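// A carry value C in {0, 1} added to -1 (all ones) yields CF == C:
//   0 + 0xFFFFFFFF       -> 0xFFFFFFFF with CF = 0
//   1 + 0xFFFFFFFF wraps -> 0x00000000 with CF = 1
// so the flags that produced the carry can be reused directly.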
48659 static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
48660 if (EFLAGS.getOpcode() == X86ISD::ADD) {
48661 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
48662 bool FoundAndLSB = false;
48663 SDValue Carry = EFLAGS.getOperand(0);
48664 while (Carry.getOpcode() == ISD::TRUNCATE ||
48665 Carry.getOpcode() == ISD::ZERO_EXTEND ||
48666 (Carry.getOpcode() == ISD::AND &&
48667 isOneConstant(Carry.getOperand(1)))) {
48668 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
48669 Carry = Carry.getOperand(0);
48670 }
48671 if (Carry.getOpcode() == X86ISD::SETCC ||
48672 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
48673 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
48674 uint64_t CarryCC = Carry.getConstantOperandVal(0);
48675 SDValue CarryOp1 = Carry.getOperand(1);
48676 if (CarryCC == X86::COND_B)
48677 return CarryOp1;
48678 if (CarryCC == X86::COND_A) {
48679 // Try to convert COND_A into COND_B in an attempt to facilitate
48680 // materializing "setb reg".
48681 //
48682 // Do not flip "e > c", where "c" is a constant, because Cmp
48683 // instruction cannot take an immediate as its first operand.
48684 //
48685 if (CarryOp1.getOpcode() == X86ISD::SUB &&
48686 CarryOp1.getNode()->hasOneUse() &&
48687 CarryOp1.getValueType().isInteger() &&
48688 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
48689 SDValue SubCommute =
48690 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
48691 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
48692 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
48693 }
48694 }
48695 // If this is a check of the z flag of an add with 1, switch to the
48696 // C flag.
48697 if (CarryCC == X86::COND_E &&
48698 CarryOp1.getOpcode() == X86ISD::ADD &&
48699 isOneConstant(CarryOp1.getOperand(1)))
48700 return CarryOp1;
48701 } else if (FoundAndLSB) {
48702 SDLoc DL(Carry);
48703 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
48704 if (Carry.getOpcode() == ISD::SRL) {
48705 BitNo = Carry.getOperand(1);
48706 Carry = Carry.getOperand(0);
48707 }
48708 return getBT(Carry, BitNo, DL, DAG);
48709 }
48710 }
48711 }
48712
48713 return SDValue();
48714}
48715
48716 /// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
48717/// to avoid the inversion.
48718 static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
48719 SelectionDAG &DAG,
48720 const X86Subtarget &Subtarget) {
48721 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
48722 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
48723 EFLAGS.getOpcode() != X86ISD::TESTP)
48724 return SDValue();
48725
48726 // PTEST/TESTP sets EFLAGS as:
48727 // TESTZ: ZF = (Op0 & Op1) == 0
48728 // TESTC: CF = (~Op0 & Op1) == 0
48729 // TESTNZC: ZF == 0 && CF == 0
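// e.g. TESTC on (~X, Y) computes CF = (~(~X) & Y) == 0 = (X & Y) == 0, which
// is the ZF that TESTZ computes on (X, Y), so stripping a NOT from an operand
// only requires swapping the carry-based and zero-based condition codes.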
48730 MVT VT = EFLAGS.getSimpleValueType();
48731 SDValue Op0 = EFLAGS.getOperand(0);
48732 SDValue Op1 = EFLAGS.getOperand(1);
48733 MVT OpVT = Op0.getSimpleValueType();
48734 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48735
48736 // TEST*(~X,Y) == TEST*(X,Y)
48737 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
48738 X86::CondCode InvCC;
48739 switch (CC) {
48740 case X86::COND_B:
48741 // testc -> testz.
48742 InvCC = X86::COND_E;
48743 break;
48744 case X86::COND_AE:
48745 // !testc -> !testz.
48746 InvCC = X86::COND_NE;
48747 break;
48748 case X86::COND_E:
48749 // testz -> testc.
48750 InvCC = X86::COND_B;
48751 break;
48752 case X86::COND_NE:
48753 // !testz -> !testc.
48754 InvCC = X86::COND_AE;
48755 break;
48756 case X86::COND_A:
48757 case X86::COND_BE:
48758 // testnzc -> testnzc (no change).
48759 InvCC = CC;
48760 break;
48761 default:
48762 InvCC = X86::COND_INVALID;
48763 break;
48764 }
48765
48766 if (InvCC != X86::COND_INVALID) {
48767 CC = InvCC;
48768 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48769 DAG.getBitcast(OpVT, NotOp0), Op1);
48770 }
48771 }
48772
48773 if (CC == X86::COND_B || CC == X86::COND_AE) {
48774 // TESTC(X,~X) == TESTC(X,-1)
48775 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48776 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
48777 SDLoc DL(EFLAGS);
48778 return DAG.getNode(
48779 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
48780 DAG.getBitcast(OpVT,
48781 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
48782 }
48783 }
48784 // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X)
48785 if (EFLAGS.getOpcode() == X86ISD::PTEST &&
48786 ISD::isBuildVectorAllOnes(Op1.getNode())) {
48787 SDValue BC0 = peekThroughBitcasts(Op0);
48788 if (BC0.getOpcode() == X86ISD::PCMPEQ &&
48789 ISD::isBuildVectorAllZeros(BC0.getOperand(1).getNode())) {
48790 SDLoc DL(EFLAGS);
48791 CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE);
48792 SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0));
48793 return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X);
48794 }
48795 }
48796 }
48797
48798 if (CC == X86::COND_E || CC == X86::COND_NE) {
48799 // TESTZ(X,~Y) == TESTC(Y,X)
48800 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48801 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48802 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48803 DAG.getBitcast(OpVT, NotOp1), Op0);
48804 }
48805
48806 if (Op0 == Op1) {
48807 SDValue BC = peekThroughBitcasts(Op0);
48808 EVT BCVT = BC.getValueType();
48809
48810 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
48811 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
48812 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48813 DAG.getBitcast(OpVT, BC.getOperand(0)),
48814 DAG.getBitcast(OpVT, BC.getOperand(1)));
48815 }
48816
48817 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
48818 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
48819 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48820 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48821 DAG.getBitcast(OpVT, BC.getOperand(0)),
48822 DAG.getBitcast(OpVT, BC.getOperand(1)));
48823 }
48824
48825 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
48826 // to more efficiently extract the sign bits and compare that.
48827 // TODO: Handle TESTC with comparison inversion.
48828 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
48829 // TESTP/MOVMSK combines to make sure its never worse than PTEST?
48830 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
48831 unsigned EltBits = BCVT.getScalarSizeInBits();
48832 if (DAG.ComputeNumSignBits(BC) == EltBits) {
48833 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
48834 APInt SignMask = APInt::getSignMask(EltBits);
48835 if (SDValue Res =
48836 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
48837 // For vXi16 cases we need to use pmovmskb and extract every other
48838 // sign bit.
48839 SDLoc DL(EFLAGS);
48840 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
48841 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
48842 MVT FloatVT =
48843 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
48844 Res = DAG.getBitcast(FloatVT, Res);
48845 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
48846 } else if (EltBits == 16) {
48847 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
48848 Res = DAG.getBitcast(MovmskVT, Res);
48849 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48850 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
48851 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48852 } else {
48853 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48854 }
48855 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
48856 DAG.getConstant(0, DL, MVT::i32));
48857 }
48858 }
48859 }
48860 }
48861
48862 // TESTZ(-1,X) == TESTZ(X,X)
48863 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
48864 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
48865
48866 // TESTZ(X,-1) == TESTZ(X,X)
48867 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
48868 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
48869
48870 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
48871 // TODO: Add COND_NE handling?
48872 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
48873 SDValue Src0 = peekThroughBitcasts(Op0);
48874 SDValue Src1 = peekThroughBitcasts(Op1);
48875 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
48876 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
48877 peekThroughBitcasts(Src0.getOperand(1)), true);
48878 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
48879 peekThroughBitcasts(Src1.getOperand(1)), true);
48880 if (Src0 && Src1) {
48881 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
48882 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48883 DAG.getBitcast(OpVT2, Src0),
48884 DAG.getBitcast(OpVT2, Src1));
48885 }
48886 }
48887 }
48888 }
48889
48890 return SDValue();
48891}
48892
48893// Attempt to simplify the MOVMSK input based on the comparison type.
48894 static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
48895 SelectionDAG &DAG,
48896 const X86Subtarget &Subtarget) {
48897 // Handle eq/ne against zero (any_of).
48898 // Handle eq/ne against -1 (all_of).
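// MOVMSK packs the per-element sign bits into the low NumElts bits of a
// scalar, so comparing against 0 asks "is any sign bit set" (any_of) and
// comparing against the NumElts-bit all-ones mask asks "are all sign bits
// set" (all_of).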
48899 if (!(CC == X86::COND_E || CC == X86::COND_NE))
48900 return SDValue();
48901 if (EFLAGS.getValueType() != MVT::i32)
48902 return SDValue();
48903 unsigned CmpOpcode = EFLAGS.getOpcode();
48904 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
48905 return SDValue();
48906 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
48907 if (!CmpConstant)
48908 return SDValue();
48909 const APInt &CmpVal = CmpConstant->getAPIntValue();
48910
48911 SDValue CmpOp = EFLAGS.getOperand(0);
48912 unsigned CmpBits = CmpOp.getValueSizeInBits();
48913 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
48914
48915 // Peek through any truncate.
48916 if (CmpOp.getOpcode() == ISD::TRUNCATE)
48917 CmpOp = CmpOp.getOperand(0);
48918
48919 // Bail if we don't find a MOVMSK.
48920 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
48921 return SDValue();
48922
48923 SDValue Vec = CmpOp.getOperand(0);
48924 MVT VecVT = Vec.getSimpleValueType();
48925 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
48926 "Unexpected MOVMSK operand");
48927 unsigned NumElts = VecVT.getVectorNumElements();
48928 unsigned NumEltBits = VecVT.getScalarSizeInBits();
48929
48930 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
48931 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
48932 NumElts <= CmpBits && CmpVal.isMask(NumElts);
48933 if (!IsAnyOf && !IsAllOf)
48934 return SDValue();
48935
48936 // TODO: Check more combining cases.
48937 // We use the number of uses of the CMP to decide whether to combine here.
48938 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" combines
48939 // have test coverage that depends on this one-use constraint.
48940 bool IsOneUse = CmpOp.getNode()->hasOneUse();
48941
48942 // See if we can peek through to a vector with a wider element type, if the
48943 // signbits extend down to all the sub-elements as well.
48944 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
48945 // potential SimplifyDemandedBits/Elts cases.
48946 // If we looked through a truncate that discarded bits, we can't do this
48947 // transform.
48948 // FIXME: We could do this transform for truncates that discarded bits by
48949 // inserting an AND mask between the new MOVMSK and the CMP.
48950 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
48951 SDValue BC = peekThroughBitcasts(Vec);
48952 MVT BCVT = BC.getSimpleValueType();
48953 unsigned BCNumElts = BCVT.getVectorNumElements();
48954 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
48955 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
48956 BCNumEltBits > NumEltBits &&
48957 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
48958 SDLoc DL(EFLAGS);
48959 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
48960 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48961 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
48962 DAG.getConstant(CmpMask, DL, MVT::i32));
48963 }
48964 }
48965
48966 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
48967 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
48968 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
48969 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
48970 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
48971 SmallVector<SDValue> Ops;
48972 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
48973 Ops.size() == 2) {
48974 SDLoc DL(EFLAGS);
48975 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
48976 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
48977 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
48978 DAG.getBitcast(SubVT, Ops[0]),
48979 DAG.getBitcast(SubVT, Ops[1]));
48980 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
48981 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48982 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
48983 DAG.getConstant(CmpMask, DL, MVT::i32));
48984 }
48985 }
48986
48987 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48988 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48989 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48990 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
48991 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
48992 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
48993 SDValue BC = peekThroughBitcasts(Vec);
48994 // Ensure MOVMSK was testing every signbit of BC.
48995 if (BC.getValueType().getVectorNumElements() <= NumElts) {
48996 if (BC.getOpcode() == X86ISD::PCMPEQ) {
48997 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
48998 BC.getOperand(0), BC.getOperand(1));
48999 V = DAG.getBitcast(TestVT, V);
49000 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49001 }
49002 // Check for 256-bit split vector cases.
49003 if (BC.getOpcode() == ISD::AND &&
49004 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
49005 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
49006 SDValue LHS = BC.getOperand(0);
49007 SDValue RHS = BC.getOperand(1);
49008 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
49009 LHS.getOperand(0), LHS.getOperand(1));
49010 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
49011 RHS.getOperand(0), RHS.getOperand(1));
49012 LHS = DAG.getBitcast(TestVT, LHS);
49013 RHS = DAG.getBitcast(TestVT, RHS);
49014 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
49015 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49016 }
49017 }
49018 }
49019
49020 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
49021 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
49022 // sign bits prior to the comparison with zero unless we know that
49023 // the vXi16 splats the sign bit down to the lower i8 half.
49024 // TODO: Handle all_of patterns.
49025 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
49026 SDValue VecOp0 = Vec.getOperand(0);
49027 SDValue VecOp1 = Vec.getOperand(1);
49028 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
49029 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
49030 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
49031 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
49032 SDLoc DL(EFLAGS);
49033 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
49034 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49035 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
49036 if (!SignExt0) {
49037 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
49038 DAG.getConstant(0xAAAA, DL, MVT::i16));
49039 }
49040 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49041 DAG.getConstant(0, DL, MVT::i16));
49042 }
49043 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
49044 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
49045 if (CmpBits >= 16 && Subtarget.hasInt256() &&
49046 (IsAnyOf || (SignExt0 && SignExt1))) {
49047 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
49048 SDLoc DL(EFLAGS);
49049 SDValue Result = peekThroughBitcasts(Src);
49050 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
49051 Result.getValueType().getVectorNumElements() <= NumElts) {
49052 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
49053 Result.getOperand(0), Result.getOperand(1));
49054 V = DAG.getBitcast(MVT::v4i64, V);
49055 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49056 }
49057 Result = DAG.getBitcast(MVT::v32i8, Result);
49058 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49059 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
49060 if (!SignExt0 || !SignExt1) {
49061 assert(IsAnyOf &&
49062 "Only perform v16i16 signmasks for any_of patterns");
49063 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
49064 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
49065 }
49066 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49067 DAG.getConstant(CmpMask, DL, MVT::i32));
49068 }
49069 }
49070 }
49071
49072 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
49073 // Since we peek through a bitcast, we need to be careful if the base vector
49074 // type has smaller elements than the MOVMSK type. In that case, even if
49075 // all the elements are demanded by the shuffle mask, only the "high"
49076 // elements which have highbits that align with highbits in the MOVMSK vec
49077 // elements are actually demanded. A simplification of spurious operations
49078 // on the "low" elements take place during other simplifications.
49079 //
49080 // For example:
49081 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))): even though all the elements are
49082 // demanded, the result can change because the shuffle swaps values around.
49083 //
49084 // To address this, we check that we can scale the shuffle mask to MOVMSK
49085 // element width (this will ensure "high" elements match). Its slightly overly
49086 // conservative, but fine for an edge case fold.
49087 SmallVector<int, 32> ShuffleMask;
49088 SmallVector<SDValue, 2> ShuffleInputs;
49089 if (NumElts <= CmpBits &&
49090 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
49091 ShuffleMask, DAG) &&
49092 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
49093 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
49094 canScaleShuffleElements(ShuffleMask, NumElts)) {
49095 SDLoc DL(EFLAGS);
49096 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
49097 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49098 Result =
49099 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
49100 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
49101 }
49102
49103 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
49104 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
49105 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
49106 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
49107 // iff every element is referenced.
49108 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
49109 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
49110 (NumEltBits == 32 || NumEltBits == 64)) {
49111 SDLoc DL(EFLAGS);
49112 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
49113 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
49114 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
49115 SDValue LHS = Vec;
49116 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
49117 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
49118 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
49119 DAG.getBitcast(FloatVT, LHS),
49120 DAG.getBitcast(FloatVT, RHS));
49121 }
49122
49123 return SDValue();
49124}
49125
49126/// Optimize an EFLAGS definition used according to the condition code \p CC
49127/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
49128/// uses of chain values.
49129 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
49130 SelectionDAG &DAG,
49131 const X86Subtarget &Subtarget) {
49132 if (CC == X86::COND_B)
49133 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
49134 return Flags;
49135
49136 if (SDValue R = checkSignTestSetCCCombine(EFLAGS, CC, DAG))
49137 return R;
49138
49139 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
49140 return R;
49141
49142 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
49143 return R;
49144
49145 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
49146 return R;
49147
49148 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
49149}
49150
49151/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
49152 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
49153 TargetLowering::DAGCombinerInfo &DCI,
49154 const X86Subtarget &Subtarget) {
49155 SDLoc DL(N);
49156 EVT VT = N->getValueType(0);
49157 SDValue FalseOp = N->getOperand(0);
49158 SDValue TrueOp = N->getOperand(1);
49159 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
49160 SDValue Cond = N->getOperand(3);
49161
49162 // cmov X, X, ?, ? --> X
49163 if (TrueOp == FalseOp)
49164 return TrueOp;
49165
49166 // Try to simplify the EFLAGS and condition code operands.
49167 // We can't always do this as FCMOV only supports a subset of X86 cond.
49168 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
49169 if (!(FalseOp.getValueType() == MVT::f80 ||
49170 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
49171 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
49172 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
49173 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
49174 Flags};
49175 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49176 }
49177 }
49178
49179 // If this is a select between two integer constants, try to do some
49180 // optimizations. Note that the operands are ordered the opposite of SELECT
49181 // operands.
49182 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
49183 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
49184 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
49185 // larger than FalseC (the false value).
49186 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
49187 CC = X86::GetOppositeBranchCondition(CC);
49188 std::swap(TrueC, FalseC);
49189 std::swap(TrueOp, FalseOp);
49190 }
49191
49192 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
49193 // This is efficient for any integer data type (including i8/i16) and
49194 // shift amount.
49195 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
49196 Cond = getSETCC(CC, Cond, DL, DAG);
49197
49198 // Zero extend the condition if needed.
49199 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
49200
49201 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
49202 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
49203 DAG.getConstant(ShAmt, DL, MVT::i8));
49204 return Cond;
49205 }
49206
49207 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
49208 // for any integer data type, including i8/i16.
49209 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
49210 Cond = getSETCC(CC, Cond, DL, DAG);
49211
49212 // Zero extend the condition if needed.
49213 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
49214 FalseC->getValueType(0), Cond);
49215 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49216 SDValue(FalseC, 0));
49217 return Cond;
49218 }
49219
49220 // Optimize cases that will turn into an LEA instruction. This requires
49221 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
49222 if (VT == MVT::i32 || VT == MVT::i64) {
49223 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
49224 assert(Diff.getBitWidth() == VT.getSizeInBits() &&
49225 "Implicit constant truncation");
49226
49227 bool isFastMultiplier = false;
49228 if (Diff.ult(10)) {
49229 switch (Diff.getZExtValue()) {
49230 default: break;
49231 case 1: // result = add base, cond
49232 case 2: // result = lea base( , cond*2)
49233 case 3: // result = lea base(cond, cond*2)
49234 case 4: // result = lea base( , cond*4)
49235 case 5: // result = lea base(cond, cond*4)
49236 case 8: // result = lea base( , cond*8)
49237 case 9: // result = lea base(cond, cond*8)
49238 isFastMultiplier = true;
49239 break;
49240 }
49241 }
49242
49243 if (isFastMultiplier) {
49244 Cond = getSETCC(CC, Cond, DL ,DAG);
49245 // Zero extend the condition if needed.
49246 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
49247 Cond);
49248 // Scale the condition by the difference.
49249 if (Diff != 1)
49250 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
49251 DAG.getConstant(Diff, DL, Cond.getValueType()));
49252
49253 // Add the base if non-zero.
49254 if (FalseC->getAPIntValue() != 0)
49255 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49256 SDValue(FalseC, 0));
49257 return Cond;
49258 }
49259 }
49260 }
49261 }
49262
49263 // Handle these cases:
49264 // (select (x != c), e, c) -> (select (x != c), e, x),
49265 // (select (x == c), c, e) -> (select (x == c), x, e)
49266 // where c is an integer constant, and the "select" is the combination
49267 // of CMOV and CMP.
49268 //
49269 // The rationale for this change is that a conditional move from a constant
49270 // needs two instructions, whereas a conditional move from a register needs
49271 // only one instruction.
49272 //
49273 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
49274 // some instruction-combining opportunities. This opt needs to be
49275 // postponed as late as possible.
49276 //
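  // For example (illustrative):
  //   (select (x == 7), 7, e)  -->  (select (x == 7), x, e)
  // so the CMOV reads x from a register instead of materializing the
  // immediate 7 with an extra instruction.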
49277 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
49278 // the DCI.xxxx conditions are provided to postpone the optimization as
49279 // late as possible.
49280
49281 ConstantSDNode *CmpAgainst = nullptr;
49282 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
49283 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
49284 !isa<ConstantSDNode>(Cond.getOperand(0))) {
49285
49286 if (CC == X86::COND_NE &&
49287 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
49288 CC = X86::COND_E;
49289 std::swap(TrueOp, FalseOp);
49290 }
49291
49292 if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
49293 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
49294 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
49295 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49296 }
49297 }
49298 }
49299
49300 // Transform:
49301 //
49302 // (cmov 1 T (uge T 2))
49303 //
49304 // to:
49305 //
49306 // (adc T 0 (sub T 1))
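  // Worked example (illustrative): both forms compute max(T, 1) unsigned:
  //   T == 0: SUB 0,1 sets CF, so ADC computes 0 + 0 + 1 = 1
  //   T >= 1: SUB T,1 clears CF, so ADC computes T + 0 + 0 = T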
49307 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
49308 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
49309 SDValue Cond0 = Cond.getOperand(0);
49310 if (Cond0.getOpcode() == ISD::TRUNCATE)
49311 Cond0 = Cond0.getOperand(0);
49312 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
49313 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
49314 EVT CondVT = Cond->getValueType(0);
49315 // Subtract 1 and generate a carry.
49316 SDValue NewSub =
49317 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
49318 DAG.getConstant(1, DL, CondVT));
49319 SDValue EFLAGS(NewSub.getNode(), 1);
49320 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), TrueOp,
49321 DAG.getConstant(0, DL, VT), EFLAGS);
49322 }
49323 }
49324
49325 // Fold and/or of setcc's to double CMOV:
49326 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
49327 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
49328 //
49329 // This combine lets us generate:
49330 // cmovcc1 (jcc1 if we don't have CMOV)
49331 // cmovcc2 (same)
49332 // instead of:
49333 // setcc1
49334 // setcc2
49335 // and/or
49336 // cmovne (jne if we don't have CMOV)
49337 // When we can't use the CMOV instruction, it might increase branch
49338 // mispredicts.
49339 // When we can use CMOV, or when there is no mispredict, this improves
49340 // throughput and reduces register pressure.
49341 //
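  // For example (illustrative), with Cond = (or (setcc cc1), (setcc cc2)):
  //   the inner CMOV picks T when cc1 holds, else F,
  //   the outer CMOV picks T when cc2 holds, else the inner result,
  // so T is selected whenever either condition holds, without materializing
  // the setcc results.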
49342 if (CC == X86::COND_NE) {
49343 SDValue Flags;
49344 X86::CondCode CC0, CC1;
49345 bool isAndSetCC;
49346 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
49347 if (isAndSetCC) {
49348 std::swap(FalseOp, TrueOp);
49349 CC0 = X86::GetOppositeBranchCondition(CC0);
49350 CC1 = X86::GetOppositeBranchCondition(CC1);
49351 }
49352
49353 SDValue LOps[] = {FalseOp, TrueOp,
49354 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
49355 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, VT, LOps);
49356 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
49357 Flags};
49358 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49359 return CMOV;
49360 }
49361 }
49362
49363 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
49364 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
49365 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
49366 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
49367 // Or (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X)
49368 // TODO: Or (CMOV (BSF ?, X), Y, (X == 0)) -> (BSF Y, X)
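  // Worked example (illustrative) of the CTTZ fold with C1 = 64, C2 = 32:
  //   (X != 0) ? (cttz X) + 32 : 64
  //     --> (add (cmov 32, (cttz X), (X != 0)), 32)
  // which still yields 64 when X == 0, but the constant add now sits outside
  // the CMOV.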
49369 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
49370 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
49371 SDValue Add = TrueOp;
49372 SDValue Const = FalseOp;
49373 // Canonicalize the condition code for easier matching and output.
49374 if (CC == X86::COND_E)
49375 std::swap(Add, Const);
49376
49377 // TODO: ADD BSF support, but requires changes to the "REP BSF" CTTZ hack.
49378 if (Subtarget.hasBitScanPassThrough() && Add.getOpcode() == X86ISD::BSR &&
49379 Add.getResNo() == 0 && Add.hasOneUse() &&
49380 Add.getOperand(1) == Cond.getOperand(0)) {
49381 return DAG.getNode(Add.getOpcode(), DL, Add->getVTList(), Const,
49382 Add.getOperand(1));
49383 }
49384
49385 // We might have replaced the constant in the cmov with the LHS of the
49386 // compare. If so change it to the RHS of the compare.
49387 if (Const == Cond.getOperand(0))
49388 Const = Cond.getOperand(1);
49389
49390 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
49391 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
49392 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
49393 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
49394 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
49395 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
49396 // This should constant fold.
49397 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
49398 SDValue CMov =
49399 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
49400 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
49401 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
49402 }
49403 }
49404
49405 return SDValue();
49406}
49407
49408/// Different mul shrinking modes.
49409enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
49410
49411static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
49412 EVT VT = N->getOperand(0).getValueType();
49413 if (VT.getScalarSizeInBits() != 32)
49414 return false;
49415
49416 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
49417 unsigned SignBits[2] = {1, 1};
49418 bool IsPositive[2] = {false, false};
49419 for (unsigned i = 0; i < 2; i++) {
49420 SDValue Opd = N->getOperand(i);
49421
49422 SignBits[i] = DAG.ComputeNumSignBits(Opd);
49423 IsPositive[i] = DAG.SignBitIsZero(Opd);
49424 }
49425
49426 bool AllPositive = IsPositive[0] && IsPositive[1];
49427 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
49428 // When ranges are from -128 ~ 127, use MULS8 mode.
49429 if (MinSignBits >= 25)
49430 Mode = ShrinkMode::MULS8;
49431 // When ranges are from 0 ~ 255, use MULU8 mode.
49432 else if (AllPositive && MinSignBits >= 24)
49433 Mode = ShrinkMode::MULU8;
49434 // When ranges are from -32768 ~ 32767, use MULS16 mode.
49435 else if (MinSignBits >= 17)
49436 Mode = ShrinkMode::MULS16;
49437 // When ranges are from 0 ~ 65535, use MULU16 mode.
49438 else if (AllPositive && MinSignBits >= 16)
49439 Mode = ShrinkMode::MULU16;
49440 else
49441 return false;
49442 return true;
49443}
49444
49445/// When the operands of vector mul are extended from smaller size values,
49446/// like i8 and i16, the type of mul may be shrunk to generate more
49447/// efficient code. Two typical patterns are handled:
49448/// Pattern1:
49449/// %2 = sext/zext <N x i8> %1 to <N x i32>
49450/// %4 = sext/zext <N x i8> %3 to <N x i32>
49451/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49452/// %5 = mul <N x i32> %2, %4
49453///
49454/// Pattern2:
49455/// %2 = zext/sext <N x i16> %1 to <N x i32>
49456/// %4 = zext/sext <N x i16> %3 to <N x i32>
49457/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49458/// %5 = mul <N x i32> %2, %4
49459///
49460/// There are four mul shrinking modes:
49461/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
49462/// -128 to 128, and the scalar value range of %4 is also -128 to 128,
49463/// generate pmullw+sext32 for it (MULS8 mode).
49464/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
49465/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
49466/// generate pmullw+zext32 for it (MULU8 mode).
49467/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
49468/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
49469/// generate pmullw+pmulhw for it (MULS16 mode).
49470/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
49471/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
49472/// generate pmullw+pmulhuw for it (MULU16 mode).
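/// For example (illustrative), in MULU16 mode on <8 x i32> operands that were
/// zero-extended from <8 x i16>:
///   pmullw  produces the low 16 bits of each 16x16 product,
///   pmulhuw produces the high 16 bits,
/// and punpcklwd/punpckhwd interleave the two halves back into i32 results.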
49473static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49474 const X86Subtarget &Subtarget) {
49475 // Check for legality.
49476 // pmullw/pmulhw require SSE2.
49477 if (!Subtarget.hasSSE2())
49478 return SDValue();
49479
49480 // Check for profitability
49481 // pmulld is supported since SSE41. It is better to use pmulld
49482 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
49483 // the expansion.
49484 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
49485 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
49486 return SDValue();
49487
49489 if (!canReduceVMulWidth(N, DAG, Mode))
49490 return SDValue();
49491
49492 SDValue N0 = N->getOperand(0);
49493 SDValue N1 = N->getOperand(1);
49494 EVT VT = N->getOperand(0).getValueType();
49495 unsigned NumElts = VT.getVectorNumElements();
49496 if ((NumElts % 2) != 0)
49497 return SDValue();
49498
49499 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
49500
49501 // Shrink the operands of mul.
49502 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
49503 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
49504
49505 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
49506 // lower part is needed.
49507 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
49508 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
49509 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
49510 : ISD::SIGN_EXTEND,
49511 DL, VT, MulLo);
49512
49513 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
49514 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
49515 // the higher part is also needed.
49516 SDValue MulHi =
49517 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
49518 ReducedVT, NewN0, NewN1);
49519
49520 // Repack the lower part and higher part result of mul into a wider
49521 // result.
49522 // Generate shuffle functioning as punpcklwd.
49523 SmallVector<int, 16> ShuffleMask(NumElts);
49524 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49525 ShuffleMask[2 * i] = i;
49526 ShuffleMask[2 * i + 1] = i + NumElts;
49527 }
49528 SDValue ResLo =
49529 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49530 ResLo = DAG.getBitcast(ResVT, ResLo);
49531 // Generate shuffle functioning as punpckhwd.
49532 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49533 ShuffleMask[2 * i] = i + NumElts / 2;
49534 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
49535 }
49536 SDValue ResHi =
49537 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49538 ResHi = DAG.getBitcast(ResVT, ResHi);
49539 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
49540}
49541
49542static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
49543 EVT VT, const SDLoc &DL) {
49544
49545 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
49546 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49547 DAG.getConstant(Mult, DL, VT));
49548 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
49549 DAG.getConstant(Shift, DL, MVT::i8));
49550 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49551 N->getOperand(0));
49552 return Result;
49553 };
49554
49555 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
49556 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49557 DAG.getConstant(Mul1, DL, VT));
49558 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
49559 DAG.getConstant(Mul2, DL, VT));
49560 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49561 N->getOperand(0));
49562 return Result;
49563 };
49564
49565 switch (MulAmt) {
49566 default:
49567 break;
49568 case 11:
49569 // mul x, 11 => add ((shl (mul x, 5), 1), x)
49570 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
49571 case 21:
49572 // mul x, 21 => add ((shl (mul x, 5), 2), x)
49573 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
49574 case 41:
49575 // mul x, 41 => add ((shl (mul x, 5), 3), x)
49576 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
49577 case 22:
49578 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
49579 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49580 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
49581 case 19:
49582 // mul x, 19 => add ((shl (mul x, 9), 1), x)
49583 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
49584 case 37:
49585 // mul x, 37 => add ((shl (mul x, 9), 2), x)
49586 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
49587 case 73:
49588 // mul x, 73 => add ((shl (mul x, 9), 3), x)
49589 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
49590 case 13:
49591 // mul x, 13 => add ((shl (mul x, 3), 2), x)
49592 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
49593 case 23:
49594 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
49595 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
49596 case 26:
49597 // mul x, 26 => add ((mul (mul x, 5), 5), x)
49598 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
49599 case 28:
49600 // mul x, 28 => add ((mul (mul x, 9), 3), x)
49601 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
49602 case 29:
49603 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
49604 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49605 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
49606 }
49607
49608 // Another trick. If this is a power of 2 plus 2, 4 or 8, we can use a shift
49609 // followed by a single LEA.
49610 // First check if this is a sum of two powers of 2 because that's easy. Then
49611 // count the trailing zeros to find the low power of 2.
49612 // TODO: We can do this even without LEA at a cost of two shifts and an add.
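  // For example (illustrative): MulAmt == 20 == 16 + 4 becomes
  //   (add (shl x, 4), (shl x, 2))
  // where the final scaled add can be a single LEA.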
49613 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
49614 unsigned ScaleShift = llvm::countr_zero(MulAmt);
49615 if (ScaleShift >= 1 && ScaleShift < 4) {
49616 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
49617 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49618 DAG.getConstant(ShiftAmt, DL, MVT::i8));
49619 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49620 DAG.getConstant(ScaleShift, DL, MVT::i8));
49621 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
49622 }
49623 }
49624
49625 return SDValue();
49626}
49627
49628// If the upper 17 bits of either operand's elements are zero (or can safely
49629// be zeroed) and both operands sign-extend from i16, then we can use PMADDWD,
49630// which is always at least as quick as PMULLD, except on KNL.
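// For example (illustrative): if both v4i32 operands hold values below 0x8000
// (upper 17 bits zero), then viewed as v8i16 each i32 lane is (lo, 0) and
//   pmaddwd lane = lo0*lo1 + 0*0
// which is exactly the 32-bit product of that lane.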
49631static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
49632 SelectionDAG &DAG,
49633 const X86Subtarget &Subtarget) {
49634 if (!Subtarget.hasSSE2())
49635 return SDValue();
49636
49637 if (Subtarget.isPMADDWDSlow())
49638 return SDValue();
49639
49640 EVT VT = N->getValueType(0);
49641
49642 // Only support vXi32 vectors.
49643 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
49644 return SDValue();
49645
49646 // Make sure the type is legal or can split/widen to a legal type.
49647 // With AVX512 but without BWI, we would need to split v32i16.
49648 unsigned NumElts = VT.getVectorNumElements();
49649 if (NumElts == 1 || !isPowerOf2_32(NumElts))
49650 return SDValue();
49651
49652 // With AVX512 but without BWI, we would need to split v32i16.
49653 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
49654 return SDValue();
49655
49656 SDValue N0 = N->getOperand(0);
49657 SDValue N1 = N->getOperand(1);
49658
49659 // If we are zero/sign extending two steps without SSE4.1, it's better to
49660 // reduce the vmul width instead.
49661 if (!Subtarget.hasSSE41() &&
49662 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
49663 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49664 (N1.getOpcode() == ISD::ZERO_EXTEND &&
49665 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
49666 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
49667 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49668 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49669 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
49670 return SDValue();
49671
49672 // If we are sign extending a wide vector without SSE4.1, it's better to
49673 // reduce the vmul width instead.
49674 if (!Subtarget.hasSSE41() &&
49675 (N0.getOpcode() == ISD::SIGN_EXTEND &&
49676 N0.getOperand(0).getValueSizeInBits() > 128) &&
49677 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49678 N1.getOperand(0).getValueSizeInBits() > 128))
49679 return SDValue();
49680
49681 // Sign bits must extend down to the lowest i16.
49682 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
49683 DAG.ComputeMaxSignificantBits(N0) > 16)
49684 return SDValue();
49685
49686 // At least one of the elements must be zero in the upper 17 bits, or can be
49687 // safely made zero without altering the final result.
49688 auto GetZeroableOp = [&](SDValue Op) {
49689 APInt Mask17 = APInt::getHighBitsSet(32, 17);
49690 if (DAG.MaskedValueIsZero(Op, Mask17))
49691 return Op;
49692 // Mask off the upper 16 bits of sign-extended constants.
49693 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
49694 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
49695 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
49696 SDValue Src = Op.getOperand(0);
49697 // Convert sext(vXi16) to zext(vXi16).
49698 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
49699 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49700 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
49701 // which will expand the extension.
49702 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
49703 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
49704 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
49705 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49706 }
49707 }
49708 // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
49709 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
49710 N->isOnlyUserOf(Op.getNode())) {
49711 SDValue Src = Op.getOperand(0);
49712 if (Src.getScalarValueSizeInBits() == 16)
49713 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
49714 }
49715 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
49716 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
49717 N->isOnlyUserOf(Op.getNode())) {
49718 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
49719 Op.getOperand(1));
49720 }
49721 return SDValue();
49722 };
49723 SDValue ZeroN0 = GetZeroableOp(N0);
49724 SDValue ZeroN1 = GetZeroableOp(N1);
49725 if (!ZeroN0 && !ZeroN1)
49726 return SDValue();
49727 N0 = ZeroN0 ? ZeroN0 : N0;
49728 N1 = ZeroN1 ? ZeroN1 : N1;
49729
49730 // Use SplitOpsAndApply to handle AVX splitting.
49731 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49732 ArrayRef<SDValue> Ops) {
49733 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
49734 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
49735 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
49736 DAG.getBitcast(OpVT, Ops[0]),
49737 DAG.getBitcast(OpVT, Ops[1]));
49738 };
49739 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
49740}
49741
49742static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49743 const X86Subtarget &Subtarget) {
49744 if (!Subtarget.hasSSE2())
49745 return SDValue();
49746
49747 EVT VT = N->getValueType(0);
49748
49749 // Only support vXi64 vectors.
49750 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
49751 VT.getVectorNumElements() < 2 ||
49752 !isPowerOf2_32(VT.getVectorNumElements()))
49753 return SDValue();
49754
49755 SDValue N0 = N->getOperand(0);
49756 SDValue N1 = N->getOperand(1);
49757
49758 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
49759 // 32 bits. We can lower with this if the sign bits stretch that far.
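  // For example (illustrative): if both v2i64 operands are sign extensions of
  // i32 values, the signed 32x32->64 product computed by pmuldq equals the
  // full i64 multiply, so no 64-bit multiply expansion is needed.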
49760 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
49761 DAG.ComputeNumSignBits(N1) > 32) {
49762 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49763 ArrayRef<SDValue> Ops) {
49764 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
49765 };
49766 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
49767 /*CheckBWI*/ false);
49768 }
49769
49770 // If the upper bits are zero we can use a single pmuludq.
49771 APInt Mask = APInt::getHighBitsSet(64, 32);
49772 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
49773 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49774 ArrayRef<SDValue> Ops) {
49775 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
49776 };
49777 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
49778 /*CheckBWI*/ false);
49779 }
49780
49781 return SDValue();
49782}
49783
49784static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
49785 TargetLowering::DAGCombinerInfo &DCI,
49786 const X86Subtarget &Subtarget) {
49787 EVT VT = N->getValueType(0);
49788 SDLoc DL(N);
49789
49790 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
49791 return V;
49792
49793 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
49794 return V;
49795
49796 if (DCI.isBeforeLegalize() && VT.isVector())
49797 return reduceVMULWidth(N, DL, DAG, Subtarget);
49798
49799 if (VT != MVT::i64 && VT != MVT::i32 &&
49800 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
49801 return SDValue();
49802
49803 KnownBits Known1 = DAG.computeKnownBits(N->getOperand(1));
49804 if (!Known1.isConstant())
49805 return SDValue();
49806
49807 const APInt &C = Known1.getConstant();
49808 if (C.isZero())
49809 return DAG.getConstant(0, DL, VT);
49810
49811 if (C.isAllOnes())
49812 return DAG.getNegative(N->getOperand(0), DL, VT);
49813
49814 if (isPowerOf2_64(C.getZExtValue()))
49815 return SDValue();
49816
49817 // Optimize a single multiply with constant into two operations in order to
49818 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
49819 if (!MulConstantOptimization)
49820 return SDValue();
49821
49822 // An imul is usually smaller than the alternative sequence.
49823 if (DAG.getMachineFunction().getFunction().hasMinSize())
49824 return SDValue();
49825
49826 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
49827 return SDValue();
49828
49829 int64_t SignMulAmt = C.getSExtValue();
49830 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
49831 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
49832
49833 SDValue NewMul = SDValue();
49834 if (VT == MVT::i64 || VT == MVT::i32) {
49835 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
49836 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49837 DAG.getConstant(AbsMulAmt, DL, VT));
49838 if (SignMulAmt < 0)
49839 NewMul = DAG.getNegative(NewMul, DL, VT);
49840
49841 return NewMul;
49842 }
49843
49844 uint64_t MulAmt1 = 0;
49845 uint64_t MulAmt2 = 0;
49846 if ((AbsMulAmt % 9) == 0) {
49847 MulAmt1 = 9;
49848 MulAmt2 = AbsMulAmt / 9;
49849 } else if ((AbsMulAmt % 5) == 0) {
49850 MulAmt1 = 5;
49851 MulAmt2 = AbsMulAmt / 5;
49852 } else if ((AbsMulAmt % 3) == 0) {
49853 MulAmt1 = 3;
49854 MulAmt2 = AbsMulAmt / 3;
49855 }
49856
49857 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
49858 if (MulAmt2 &&
49859 (isPowerOf2_64(MulAmt2) ||
49860 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
49861
49862 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
49863 N->user_begin()->getOpcode() == ISD::ADD))
49864 // If the second multiplier is a power of 2, issue it first. We want the multiply
49865 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
49866 // use is an add. Only do this for positive multiply amounts since the
49867 // negate would prevent it from being used as an address mode anyway.
49868 std::swap(MulAmt1, MulAmt2);
49869
49870 if (isPowerOf2_64(MulAmt1))
49871 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49872 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
49873 else
49874 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49875 DAG.getConstant(MulAmt1, DL, VT));
49876
49877 if (isPowerOf2_64(MulAmt2))
49878 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
49879 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
49880 else
49881 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
49882 DAG.getConstant(MulAmt2, DL, VT));
49883
49884 // Negate the result.
49885 if (SignMulAmt < 0)
49886 NewMul = DAG.getNegative(NewMul, DL, VT);
49887 } else if (!Subtarget.slowLEA())
49888 NewMul = combineMulSpecial(C.getZExtValue(), N, DAG, VT, DL);
49889 }
49890 if (!NewMul) {
49891 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
49892 if (isPowerOf2_64(AbsMulAmt - 1)) {
49893 // (mul x, 2^N + 1) => (add (shl x, N), x)
49894 NewMul = DAG.getNode(
49895 ISD::ADD, DL, VT, N->getOperand(0),
49896 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49897 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
49898 if (SignMulAmt < 0)
49899 NewMul = DAG.getNegative(NewMul, DL, VT);
49900 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
49901 // (mul x, 2^N - 1) => (sub (shl x, N), x)
49902 NewMul =
49903 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49904 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
49905 // To negate, reverse the operands of the subtract.
49906 if (SignMulAmt < 0)
49907 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
49908 else
49909 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
49910 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
49911 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49912 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
49913 NewMul =
49914 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49915 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
49916 NewMul = DAG.getNode(
49917 ISD::ADD, DL, VT, NewMul,
49918 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49919 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
49920 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49921 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
49922 NewMul =
49923 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49924 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
49925 NewMul = DAG.getNode(
49926 ISD::SUB, DL, VT, NewMul,
49927 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49928 } else if (SignMulAmt >= 0 && VT.isVector() &&
49929 Subtarget.fastImmVectorShift()) {
49930 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
49931 uint64_t ShiftAmt1;
49932 std::optional<unsigned> Opc;
49933 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
49934 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
49935 Opc = ISD::ADD;
49936 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
49937 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
49938 Opc = ISD::SUB;
49939 }
49940
49941 if (Opc) {
49942 SDValue Shift1 =
49943 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49944 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
49945 SDValue Shift2 =
49946 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49947 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
49948 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
49949 }
49950 }
49951 }
49952
49953 return NewMul;
49954}
49955
49956// Try to form a MULHU or MULHS node by looking for
49957// (srl (mul ext, ext), 16)
49958// TODO: This is X86 specific because we want to be able to handle wide types
49959// before type legalization. But we can only do it if the vector will be
49960// legalized via widening/splitting. Type legalization can't handle promotion
49961// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49962// combiner.
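// For example (illustrative):
//   (srl (mul (zext v8i16 X), (zext v8i16 Y)), 16)
// becomes (zext (mulhu X, Y)), which selects to a single pmulhuw.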
49963static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
49964 const SDLoc &DL,
49965 const X86Subtarget &Subtarget) {
49966 using namespace SDPatternMatch;
49967 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
49968 "SRL or SRA node is required here!");
49969
49970 if (!Subtarget.hasSSE2())
49971 return SDValue();
49972
49973 // Input type should be at least vXi32.
49974 EVT VT = N->getValueType(0);
49975 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
49976 return SDValue();
49977
49978 // The operation must be a multiply shifted right by 16.
49979 SDValue LHS, RHS;
49980 if (!sd_match(N->getOperand(1), m_SpecificInt(16)) ||
49981 !sd_match(N->getOperand(0), m_OneUse(m_Mul(m_Value(LHS), m_Value(RHS)))))
49982 return SDValue();
49983
49984 unsigned ExtOpc = LHS.getOpcode();
49985 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
49986 RHS.getOpcode() != ExtOpc)
49987 return SDValue();
49988
49989 // Peek through the extends.
49990 LHS = LHS.getOperand(0);
49991 RHS = RHS.getOperand(0);
49992
49993 // Ensure the input types match.
49994 EVT MulVT = LHS.getValueType();
49995 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
49996 return SDValue();
49997
49998 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
49999 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
50000
50001 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
50002 return DAG.getNode(ExtOpc, DL, VT, Mulh);
50003}
50004
50005static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
50006 const X86Subtarget &Subtarget) {
50007 using namespace llvm::SDPatternMatch;
50008 SDValue N0 = N->getOperand(0);
50009 SDValue N1 = N->getOperand(1);
50010 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
50011 EVT VT = N0.getValueType();
50012 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50013 SDLoc DL(N);
50014
50015 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50016 // with out-of-bounds clamping.
50017 if (N0.getOpcode() == ISD::VSELECT &&
50018 supportedVectorVarShift(VT, Subtarget, ISD::SHL)) {
50019 SDValue Cond = N0.getOperand(0);
50020 SDValue N00 = N0.getOperand(1);
50021 SDValue N01 = N0.getOperand(2);
50022 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
50023 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
50024 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50025 m_SpecificCondCode(ISD::SETULT)))) {
50026 return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
50027 }
50028 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
50029 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
50030 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50031 m_SpecificCondCode(ISD::SETUGE)))) {
50032 return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1);
50033 }
50034 }
50035
50036 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
50037 // since the result of setcc_c is all zero's or all ones.
50038 if (VT.isInteger() && !VT.isVector() &&
50039 N1C && N0.getOpcode() == ISD::AND &&
50040 N0.getOperand(1).getOpcode() == ISD::Constant) {
50041 SDValue N00 = N0.getOperand(0);
50042 APInt Mask = N0.getConstantOperandAPInt(1);
50043 Mask <<= N1C->getAPIntValue();
50044 bool MaskOK = false;
50045 // We can handle cases concerning bit-widening nodes containing setcc_c if
50046 // we carefully interrogate the mask to make sure the transform is
50047 // semantics-preserving.
50048 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
50049 // of the underlying setcc_c operation if the setcc_c was zero extended.
50050 // Consider the following example:
50051 // zext(setcc_c) -> i32 0x0000FFFF
50052 // c1 -> i32 0x0000FFFF
50053 // c2 -> i32 0x00000001
50054 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
50055 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
50056 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
50057 MaskOK = true;
50058 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
50059 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50060 MaskOK = true;
50061 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
50062 N00.getOpcode() == ISD::ANY_EXTEND) &&
50063 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50064 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
50065 }
50066 if (MaskOK && Mask != 0)
50067 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
50068 }
50069
50070 return SDValue();
50071}
50072
50073static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
50074 const X86Subtarget &Subtarget) {
50075 using namespace llvm::SDPatternMatch;
50076 SDValue N0 = N->getOperand(0);
50077 SDValue N1 = N->getOperand(1);
50078 EVT VT = N0.getValueType();
50079 unsigned Size = VT.getSizeInBits();
50080 SDLoc DL(N);
50081
50082 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50083 return V;
50084
50085 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
50086 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA)) {
50087 SDValue ShrAmtVal;
50088 if (sd_match(N1, m_UMin(m_Value(ShrAmtVal),
50089 m_SpecificInt(VT.getScalarSizeInBits() - 1))))
50090 return DAG.getNode(X86ISD::VSRAV, DL, VT, N0, ShrAmtVal);
50091 }
50092
50093 // fold (SRA (SHL X, ShlConst), SraConst)
50094 // into (SHL (sext_in_reg X), ShlConst - SraConst)
50095 // or (sext_in_reg X)
50096 // or (SRA (sext_in_reg X), SraConst - ShlConst)
50097 // depending on relation between SraConst and ShlConst.
50098 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
50099 // us to do the sext_in_reg from the corresponding bit.
50100
50101 // Sign extensions on X86 are MOVs (movsx). The MOVs have the same code size
50102 // as the SHIFTs above (only a shift by 1 has smaller code size).
50103 // However, the MOVs have two advantages over a SHIFT:
50104 // 1. MOVs can write to a register that differs from the source.
50105 // 2. MOVs accept memory operands.
50106
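  // For example (illustrative), with i32 X:
  //   (sra (shl X, 24), 24) --> (sext_in_reg X, i8)            ; movsx
  //   (sra (shl X, 24), 26) --> (sra (sext_in_reg X, i8), 2)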
50107 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
50108 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
50109 N0.getOperand(1).getOpcode() != ISD::Constant)
50110 return SDValue();
50111
50112 SDValue N00 = N0.getOperand(0);
50113 SDValue N01 = N0.getOperand(1);
50114 APInt ShlConst = N01->getAsAPIntVal();
50115 APInt SraConst = N1->getAsAPIntVal();
50116 EVT CVT = N1.getValueType();
50117
50118 if (CVT != N01.getValueType())
50119 return SDValue();
50120 if (SraConst.isNegative())
50121 return SDValue();
50122
50123 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
50124 unsigned ShiftSize = SVT.getSizeInBits();
50125 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
50126 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
50127 continue;
50128 SDValue NN =
50129 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
50130 if (SraConst.eq(ShlConst))
50131 return NN;
50132 if (SraConst.ult(ShlConst))
50133 return DAG.getNode(ISD::SHL, DL, VT, NN,
50134 DAG.getConstant(ShlConst - SraConst, DL, CVT));
50135 return DAG.getNode(ISD::SRA, DL, VT, NN,
50136 DAG.getConstant(SraConst - ShlConst, DL, CVT));
50137 }
50138 return SDValue();
50139}
50140
50141static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
50142 TargetLowering::DAGCombinerInfo &DCI,
50143 const X86Subtarget &Subtarget) {
50144 using namespace llvm::SDPatternMatch;
50145 SDValue N0 = N->getOperand(0);
50146 SDValue N1 = N->getOperand(1);
50147 EVT VT = N0.getValueType();
50148 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50149 SDLoc DL(N);
50150
50151 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50152 return V;
50153
50154 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50155 // with out-of-bounds clamping.
50156 if (N0.getOpcode() == ISD::VSELECT &&
50157 supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
50158 SDValue Cond = N0.getOperand(0);
50159 SDValue N00 = N0.getOperand(1);
50160 SDValue N01 = N0.getOperand(2);
50161 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
50162 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
50163 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50164 m_SpecificCondCode(ISD::SETULT)))) {
50165 return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
50166 }
50167 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
50168 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
50169 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50170 m_SpecificCondCode(ISD::SETUGE)))) {
50171 return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1);
50172 }
50173 }
50174
50175 // Only do this on the last DAG combine as it can interfere with other
50176 // combines.
50177 if (!DCI.isAfterLegalizeDAG())
50178 return SDValue();
50179
50180 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
50181 // TODO: This is a generic DAG combine that became an x86-only combine to
50182 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
50183 // and-not ('andn').
50184 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
50185 return SDValue();
50186
50187 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
50188 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
50189 if (!ShiftC || !AndC)
50190 return SDValue();
50191
50192 // If we can shrink the constant mask below 8-bits or 32-bits, then this
50193 // transform should reduce code size. It may also enable secondary transforms
50194 // from improved known-bits analysis or instruction selection.
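  // For example (illustrative): (srl (and X, 0xFF00), 8) needs a 16-bit AND
  // immediate, but the inverted form (and (srl X, 8), 0xFF) only needs an
  // 8-bit immediate.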
50195 APInt MaskVal = AndC->getAPIntValue();
50196
50197 // If this can be matched by a zero extend, don't optimize.
50198 if (MaskVal.isMask()) {
50199 unsigned TO = MaskVal.countr_one();
50200 if (TO >= 8 && isPowerOf2_32(TO))
50201 return SDValue();
50202 }
50203
50204 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
50205 unsigned OldMaskSize = MaskVal.getSignificantBits();
50206 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
50207 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
50208 (OldMaskSize > 32 && NewMaskSize <= 32)) {
50209 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
50210 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
50211 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
50212 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
50213 }
50214 return SDValue();
50215}
50216
50217static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
50218 const X86Subtarget &Subtarget) {
50219 unsigned Opcode = N->getOpcode();
50220 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
50221
50222 SDLoc DL(N);
50223 EVT VT = N->getValueType(0);
50224 SDValue N0 = N->getOperand(0);
50225 SDValue N1 = N->getOperand(1);
50226 EVT SrcVT = N0.getValueType();
50227
50228 SDValue BC0 =
50229 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
50230 SDValue BC1 =
50231 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
50232
50233 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
50234 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
50235 // truncation trees that help us avoid lane crossing shuffles.
50236 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
50237 // TODO: We don't handle vXf64 shuffles yet.
50238 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50239 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
50240 SmallVector<SDValue> ShuffleOps;
50241 SmallVector<int> ShuffleMask, ScaledMask;
50242 SDValue Vec = peekThroughBitcasts(BCSrc);
50243 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
50245 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
50246 // shuffle to a v4X64 width - we can probably relax this in the future.
50247 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
50248 ShuffleOps[0].getValueType().is256BitVector() &&
50249 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
50250 SDValue Lo, Hi;
50251 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50252 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
50253 Lo = DAG.getBitcast(SrcVT, Lo);
50254 Hi = DAG.getBitcast(SrcVT, Hi);
50255 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
50256 Res = DAG.getBitcast(ShufVT, Res);
50257 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
50258 return DAG.getBitcast(VT, Res);
50259 }
50260 }
50261 }
50262 }
50263
50264 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
50265 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50266 // If either/both ops are a shuffle that can scale to v2x64,
50267 // then see if we can perform this as a v4x32 post shuffle.
50268 SmallVector<SDValue> Ops0, Ops1;
50269 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
50270 bool IsShuf0 =
50271 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50272 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50273 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50274 bool IsShuf1 =
50275 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50276 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
50277 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50278 if (IsShuf0 || IsShuf1) {
50279 if (!IsShuf0) {
50280 Ops0.assign({BC0});
50281 ScaledMask0.assign({0, 1});
50282 }
50283 if (!IsShuf1) {
50284 Ops1.assign({BC1});
50285 ScaledMask1.assign({0, 1});
50286 }
50287
50288 SDValue LHS, RHS;
50289 int PostShuffle[4] = {-1, -1, -1, -1};
50290 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
50291 if (M < 0)
50292 return true;
50293 Idx = M % 2;
50294 SDValue Src = Ops[M / 2];
50295 if (!LHS || LHS == Src) {
50296 LHS = Src;
50297 return true;
50298 }
50299 if (!RHS || RHS == Src) {
50300 Idx += 2;
50301 RHS = Src;
50302 return true;
50303 }
50304 return false;
50305 };
50306 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
50307 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
50308 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
50309 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
50310 LHS = DAG.getBitcast(SrcVT, LHS);
50311 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
50312 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50313 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
50314 Res = DAG.getBitcast(ShufVT, Res);
50315 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
50316 return DAG.getBitcast(VT, Res);
50317 }
50318 }
50319 }
50320
50321 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
50322 if (VT.is256BitVector() && Subtarget.hasInt256()) {
50323 SmallVector<int> Mask0, Mask1;
50324 SmallVector<SDValue> Ops0, Ops1;
50325 SmallVector<int, 2> ScaledMask0, ScaledMask1;
50326 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50327 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50328 !Ops0.empty() && !Ops1.empty() &&
50329 all_of(Ops0,
50330 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50331 all_of(Ops1,
50332 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50333 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50334 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
50335 SDValue Op00 = peekThroughBitcasts(Ops0.front());
50336 SDValue Op10 = peekThroughBitcasts(Ops1.front());
50337 SDValue Op01 = peekThroughBitcasts(Ops0.back());
50338 SDValue Op11 = peekThroughBitcasts(Ops1.back());
50339 if ((Op00 == Op11) && (Op01 == Op10)) {
50340 std::swap(Op10, Op11);
50341 ShuffleVectorSDNode::commuteMask(ScaledMask1);
50342 }
50343 if ((Op00 == Op10) && (Op01 == Op11)) {
50344 const int Map[4] = {0, 2, 1, 3};
50345 SmallVector<int, 4> ShuffleMask(
50346 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
50347 Map[ScaledMask1[1]]});
50348 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
50349 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
50350 DAG.getBitcast(SrcVT, Op01));
50351 Res = DAG.getBitcast(ShufVT, Res);
50352 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
50353 return DAG.getBitcast(VT, Res);
50354 }
50355 }
50356 }
50357
50358 return SDValue();
50359}
50360
50361static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
50362 TargetLowering::DAGCombinerInfo &DCI,
50363 const X86Subtarget &Subtarget) {
50364 unsigned Opcode = N->getOpcode();
50365 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
50366 "Unexpected pack opcode");
50367
50368 EVT VT = N->getValueType(0);
50369 SDValue N0 = N->getOperand(0);
50370 SDValue N1 = N->getOperand(1);
50371 unsigned NumDstElts = VT.getVectorNumElements();
50372 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
50373 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
50374 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
50375 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
50376 "Unexpected PACKSS/PACKUS input type");
50377
50378 bool IsSigned = (X86ISD::PACKSS == Opcode);
50379
50380 // Constant Folding.
50381 APInt UndefElts0, UndefElts1;
50382 SmallVector<APInt, 32> EltBits0, EltBits1;
50383 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
50384 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
50385 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
50386 /*AllowWholeUndefs*/ true,
50387 /*AllowPartialUndefs*/ true) &&
50388 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
50389 /*AllowWholeUndefs*/ true,
50390 /*AllowPartialUndefs*/ true)) {
50391 unsigned NumLanes = VT.getSizeInBits() / 128;
50392 unsigned NumSrcElts = NumDstElts / 2;
50393 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
50394 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
50395
50396 APInt Undefs(NumDstElts, 0);
50397 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
50398 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
50399 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
50400 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
50401 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
50402 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
50403
50404 if (UndefElts[SrcIdx]) {
50405 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
50406 continue;
50407 }
50408
50409 APInt &Val = EltBits[SrcIdx];
50410 if (IsSigned) {
50411 // PACKSS: Truncate signed value with signed saturation.
50412 // Source values less than dst minint are saturated to minint.
50413 // Source values greater than dst maxint are saturated to maxint.
50414 Val = Val.truncSSat(DstBitsPerElt);
50415 } else {
50416 // PACKUS: Truncate signed value with unsigned saturation.
50417 // Source values less than zero are saturated to zero.
50418 // Source values greater than dst maxuint are saturated to maxuint.
50419 // NOTE: This is different from APInt::truncUSat.
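        // For example (illustrative), packing i16 -> u8 with PACKUS:
        //   -1   -> 0x00  (negative saturates to zero)
        //   300  -> 0xFF  (above 255 saturates to all-ones)
        //   100  -> 0x64  (in range, plain truncation)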
50420 if (Val.isIntN(DstBitsPerElt))
50421 Val = Val.trunc(DstBitsPerElt);
50422 else if (Val.isNegative())
50423 Val = APInt::getZero(DstBitsPerElt);
50424 else
50425 Val = APInt::getAllOnes(DstBitsPerElt);
50426 }
50427 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
50428 }
50429 }
50430
50431 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
50432 }
50433
50434 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
50435 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50436 return V;
50437
50438 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
50439 // Currently limit this to allsignbits cases only.
50440 if (IsSigned &&
50441 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
50442 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
50443 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
50444 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
50445 if (Not0 && Not1) {
50446 SDLoc DL(N);
50447 MVT SrcVT = N0.getSimpleValueType();
50448 SDValue Pack =
50449 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
50450 DAG.getBitcast(SrcVT, Not1));
50451 return DAG.getNOT(DL, Pack, VT);
50452 }
50453 }
50454
50455 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
50456 // truncate to create a larger truncate.
50457 if (Subtarget.hasAVX512() &&
50458 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
50459 N0.getOperand(0).getValueType() == MVT::v8i32) {
50460 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
50461 (!IsSigned &&
50462 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
50463 if (Subtarget.hasVLX())
50464 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
50465
50466 // Widen input to v16i32 so we can truncate that.
50467 SDLoc dl(N);
50468 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
50469 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
50470 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
50471 }
50472 }
50473
50474 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
50475 if (VT.is128BitVector()) {
50476 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
50477 SDValue Src0, Src1;
50478 if (N0.getOpcode() == ExtOpc &&
50479 N0.getOperand(0).getValueType().is64BitVector() &&
50480 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50481 Src0 = N0.getOperand(0);
50482 }
50483 if (N1.getOpcode() == ExtOpc &&
50484 N1.getOperand(0).getValueType().is64BitVector() &&
50485 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50486 Src1 = N1.getOperand(0);
50487 }
50488 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
50489 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
50490 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
50491 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
50492 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
50493 }
50494
50495 // Try again with pack(*_extend_vector_inreg, undef).
50496 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
50497 : ISD::ZERO_EXTEND_VECTOR_INREG;
50498 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
50499 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
50500 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
50501 DAG);
50502 }
50503
50504 // Attempt to combine as shuffle.
50505 SDValue Op(N, 0);
50506 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50507 return Res;
50508
50509 return SDValue();
50510}
50511
50512static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
50513 TargetLowering::DAGCombinerInfo &DCI,
50514 const X86Subtarget &Subtarget) {
50515 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
50516 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
50517 "Unexpected horizontal add/sub opcode");
50518
50519 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
50520 MVT VT = N->getSimpleValueType(0);
50521 SDValue LHS = N->getOperand(0);
50522 SDValue RHS = N->getOperand(1);
50523
50524 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
50525 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
50526 LHS.getOpcode() == RHS.getOpcode() &&
50527 LHS.getValueType() == RHS.getValueType() &&
50528 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
50529 SDValue LHS0 = LHS.getOperand(0);
50530 SDValue LHS1 = LHS.getOperand(1);
50531 SDValue RHS0 = RHS.getOperand(0);
50532 SDValue RHS1 = RHS.getOperand(1);
50533 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
50534 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
50535 SDLoc DL(N);
50536 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
50537 LHS0.isUndef() ? LHS1 : LHS0,
50538 RHS0.isUndef() ? RHS1 : RHS0);
50539 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
50540 Res = DAG.getBitcast(ShufVT, Res);
50541 SDValue NewLHS =
50542 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50543 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
50544 SDValue NewRHS =
50545 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50546 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
50547 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
50548 DAG.getBitcast(VT, NewRHS));
50549 }
50550 }
50551 }
50552
50553 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
50554 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50555 return V;
50556
50557 return SDValue();
50558}
50559
50560static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
50561 TargetLowering::DAGCombinerInfo &DCI,
50562 const X86Subtarget &Subtarget) {
50563 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
50564 X86ISD::VSRL == N->getOpcode()) &&
50565 "Unexpected shift opcode");
50566 EVT VT = N->getValueType(0);
50567 SDValue N0 = N->getOperand(0);
50568 SDValue N1 = N->getOperand(1);
50569
50570 // Shift zero -> zero.
50571 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50572 return DAG.getConstant(0, SDLoc(N), VT);
50573
50574 // Detect constant shift amounts.
50575 APInt UndefElts;
50576 SmallVector<APInt, 32> EltBits;
50577 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
50578 /*AllowWholeUndefs*/ true,
50579 /*AllowPartialUndefs*/ false)) {
50580 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
50581 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
50582 EltBits[0].getZExtValue(), DAG);
50583 }
50584
50585 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50586 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
50587 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
50588 return SDValue(N, 0);
50589
50590 return SDValue();
50591}
50592
50593static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
50594 TargetLowering::DAGCombinerInfo &DCI,
50595 const X86Subtarget &Subtarget) {
50596 unsigned Opcode = N->getOpcode();
50597 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
50598 X86ISD::VSRLI == Opcode) &&
50599 "Unexpected shift opcode");
50600 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
50601 EVT VT = N->getValueType(0);
50602 SDValue N0 = N->getOperand(0);
50603 SDValue N1 = N->getOperand(1);
50604 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50605 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
50606 "Unexpected value type");
50607 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
50608
50609 // (shift undef, X) -> 0
50610 if (N0.isUndef())
50611 return DAG.getConstant(0, SDLoc(N), VT);
50612
50613 // Out of range logical bit shifts are guaranteed to be zero.
50614 // Out of range arithmetic bit shifts splat the sign bit.
50615 unsigned ShiftVal = N->getConstantOperandVal(1);
50616 if (ShiftVal >= NumBitsPerElt) {
50617 if (LogicalShift)
50618 return DAG.getConstant(0, SDLoc(N), VT);
50619 ShiftVal = NumBitsPerElt - 1;
50620 }
50621
50622 // (shift X, 0) -> X
50623 if (!ShiftVal)
50624 return N0;
50625
50626 // (shift 0, C) -> 0
50627 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50628 // N0 is all zeros or undef. We guarantee that the bits shifted into the
50629 // result are all zeros, not undef.
50630 return DAG.getConstant(0, SDLoc(N), VT);
50631
50632 // (VSRAI -1, C) -> -1
50633 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
50634 // N0 is all ones or undef. We guarantee that the bits shifted into the
50635 // result are all ones, not undef.
50636 return DAG.getAllOnesConstant(SDLoc(N), VT);
50637
50638 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
50639 unsigned NewShiftVal = Amt0 + Amt1;
50640 if (NewShiftVal >= NumBitsPerElt) {
50641 // Out of range logical bit shifts are guaranteed to be zero.
50642 // Out of range arithmetic bit shifts splat the sign bit.
50643 if (LogicalShift)
50644 return DAG.getConstant(0, SDLoc(N), VT);
50645 NewShiftVal = NumBitsPerElt - 1;
50646 }
50647 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
50648 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
50649 };
50650
50651 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
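  // e.g. (VSRLI (VSRLI X, 2), 3) -> (VSRLI X, 5); MergeShifts zeroes or clamps
  // the result if the combined amount reaches the element width.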
50652 if (Opcode == N0.getOpcode())
50653 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
50654
50655 // (shl (add X, X), C) -> (shl X, (C + 1))
50656 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
50657 N0.getOperand(0) == N0.getOperand(1))
50658 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
50659
50660 // We can decode 'whole byte' logical bit shifts as shuffles.
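  // e.g. a v2i64 VSRLI by 32 moves each 64-bit lane down by four bytes with
  // zero fill, so it can take part in the recursive shuffle combine below.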
50661 if (LogicalShift && (ShiftVal % 8) == 0) {
50662 SDValue Op(N, 0);
50663 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50664 return Res;
50665 }
50666
50667 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
50668 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
50669 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
50670 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
50671 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
50672 N0.getOpcode() == X86ISD::PSHUFD &&
50673 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
50674 N0->hasOneUse()) {
50675     SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
50676 if (BC.getOpcode() == X86ISD::VSHLI &&
50677 BC.getScalarValueSizeInBits() == 64 &&
50678 BC.getConstantOperandVal(1) == 63) {
50679 SDLoc DL(N);
50680 SDValue Src = BC.getOperand(0);
50681 Src = DAG.getBitcast(VT, Src);
50682 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
50683 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
50684 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
50685 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
50686 return Src;
50687 }
50688 }
50689
50690 auto TryConstantFold = [&](SDValue V) {
50691 APInt UndefElts;
50692 SmallVector<APInt, 32> EltBits;
50693 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
50694 /*AllowWholeUndefs*/ true,
50695 /*AllowPartialUndefs*/ true))
50696 return SDValue();
50697 assert(EltBits.size() == VT.getVectorNumElements() &&
50698 "Unexpected shift value type");
50699 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
50700     // created an undef input due to no input bits being demanded, but the
50701     // user still expects 0 in the other bits.
50702 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
50703 APInt &Elt = EltBits[i];
50704 if (UndefElts[i])
50705 Elt = 0;
50706 else if (X86ISD::VSHLI == Opcode)
50707 Elt <<= ShiftVal;
50708 else if (X86ISD::VSRAI == Opcode)
50709 Elt.ashrInPlace(ShiftVal);
50710 else
50711 Elt.lshrInPlace(ShiftVal);
50712 }
50713 // Reset undef elements since they were zeroed above.
50714 UndefElts = 0;
50715 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
50716 };
50717
50718 // Constant Folding.
50719 if (N->isOnlyUserOf(N0.getNode())) {
50720 if (SDValue C = TryConstantFold(N0))
50721 return C;
50722
50723 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
50724 // Don't break NOT patterns.
50725     SDValue BC = peekThroughOneUseBitcasts(N0);
50726 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
50727 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
50728         !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
50729 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
50730 SDLoc DL(N);
50731 SDValue LHS = DAG.getNode(Opcode, DL, VT,
50732 DAG.getBitcast(VT, BC.getOperand(0)), N1);
50733 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
50734 }
50735 }
50736 }
50737
50738 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50739 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
50740 DCI))
50741 return SDValue(N, 0);
50742
50743 return SDValue();
50744}
50745
50746 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
50747                                    TargetLowering::DAGCombinerInfo &DCI,
50748 const X86Subtarget &Subtarget) {
50749 EVT VT = N->getValueType(0);
50750 unsigned Opcode = N->getOpcode();
50751 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
50752 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
50753 Opcode == ISD::INSERT_VECTOR_ELT) &&
50754 "Unexpected vector insertion");
50755
50756 SDValue Vec = N->getOperand(0);
50757 SDValue Scl = N->getOperand(1);
50758 SDValue Idx = N->getOperand(2);
50759
50760 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
50761 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
50762 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
50763
50764 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
50765 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50766 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50767 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
50768 APInt::getAllOnes(NumBitsPerElt), DCI))
50769 return SDValue(N, 0);
50770 }
50771
50772 // Attempt to combine insertion patterns to a shuffle.
50773 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
50774 SDValue Op(N, 0);
50775 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50776 return Res;
50777 }
50778
50779 return SDValue();
50780}
50781
50782/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
50783/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
50784/// OR -> CMPNEQSS.
50785 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
50786                                    TargetLowering::DAGCombinerInfo &DCI,
50787 const X86Subtarget &Subtarget) {
50788 unsigned opcode;
50789
50790 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
50791 // we're requiring SSE2 for both.
50792 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
50793 SDValue N0 = N->getOperand(0);
50794 SDValue N1 = N->getOperand(1);
50795 SDValue CMP0 = N0.getOperand(1);
50796 SDValue CMP1 = N1.getOperand(1);
50797 SDLoc DL(N);
50798
50799 // The SETCCs should both refer to the same CMP.
50800 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
50801 return SDValue();
50802
50803 SDValue CMP00 = CMP0->getOperand(0);
50804 SDValue CMP01 = CMP0->getOperand(1);
50805 EVT VT = CMP00.getValueType();
50806
50807 if (VT == MVT::f32 || VT == MVT::f64 ||
50808 (VT == MVT::f16 && Subtarget.hasFP16())) {
50809 bool ExpectingFlags = false;
50810 // Check for any users that want flags:
50811 for (const SDNode *U : N->users()) {
50812 if (ExpectingFlags)
50813 break;
50814
50815 switch (U->getOpcode()) {
50816 default:
50817 case ISD::BR_CC:
50818 case ISD::BRCOND:
50819 case ISD::SELECT:
50820 ExpectingFlags = true;
50821 break;
50822 case ISD::CopyToReg:
50823 case ISD::SIGN_EXTEND:
50824 case ISD::ZERO_EXTEND:
50825 case ISD::ANY_EXTEND:
50826 break;
50827 }
50828 }
50829
50830 if (!ExpectingFlags) {
50831 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
50832 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
50833
50834 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
50835 X86::CondCode tmp = cc0;
50836 cc0 = cc1;
50837 cc1 = tmp;
50838 }
50839
50840 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
50841 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
50842 // FIXME: need symbolic constants for these magic numbers.
50843 // See X86ATTInstPrinter.cpp:printSSECC().
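          // In the SSE/AVX comparison predicate encoding, immediate 0 selects
          // CMPEQ and immediate 4 selects CMPNEQ.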
50844 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
50845 if (Subtarget.hasAVX512()) {
50846 SDValue FSetCC =
50847 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
50848 DAG.getTargetConstant(x86cc, DL, MVT::i8));
50849 // Need to fill with zeros to ensure the bitcast will produce zeroes
50850 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
50851 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
50852 DAG.getConstant(0, DL, MVT::v16i1),
50853 FSetCC, DAG.getVectorIdxConstant(0, DL));
50854 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
50855 N->getSimpleValueType(0));
50856 }
50857 SDValue OnesOrZeroesF =
50858 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
50859 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
50860
50861 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
50862 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
50863
50864 if (is64BitFP && !Subtarget.is64Bit()) {
50865 // On a 32-bit target, we cannot bitcast the 64-bit float to a
50866 // 64-bit integer, since that's not a legal type. Since
50867 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
50868 // bits, but can do this little dance to extract the lowest 32 bits
50869 // and work with those going forward.
50870 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL,
50871 MVT::v2f64, OnesOrZeroesF);
50872 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
50873 OnesOrZeroesF =
50874 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32,
50875 DAG.getVectorIdxConstant(0, DL));
50876 IntVT = MVT::i32;
50877 }
50878
50879 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
50880 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
50881 DAG.getConstant(1, DL, IntVT));
50882 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
50883 ANDed);
50884 return OneBitOfTruth;
50885 }
50886 }
50887 }
50888 }
50889 return SDValue();
50890}
50891
50892/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
50893 static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL,
50894 SelectionDAG &DAG) {
50895 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50896
50897 MVT VT = N->getSimpleValueType(0);
50898 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
50899 return SDValue();
50900
50901 SDValue X, Y;
50902 SDValue N0 = N->getOperand(0);
50903 SDValue N1 = N->getOperand(1);
50904
50905 if (SDValue Not = IsNOT(N0, DAG)) {
50906 X = Not;
50907 Y = N1;
50908 } else if (SDValue Not = IsNOT(N1, DAG)) {
50909 X = Not;
50910 Y = N0;
50911 } else
50912 return SDValue();
50913
50914 X = DAG.getBitcast(VT, X);
50915 Y = DAG.getBitcast(VT, Y);
50916 return DAG.getNode(X86ISD::ANDNP, DL, VT, X, Y);
50917}
50918
50919/// Try to fold:
50920/// and (vector_shuffle<Z,...,Z>
50921/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
50922/// ->
50923/// andnp (vector_shuffle<Z,...,Z>
50924/// (insert_vector_elt undef, X, Z), undef), Y
50925 static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
50926 const X86Subtarget &Subtarget) {
50927 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50928
50929 EVT VT = N->getValueType(0);
50930 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
50931 // value and require extra moves.
50932 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50933 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
50934 return SDValue();
50935
50936 auto GetNot = [&DAG](SDValue V) {
50937     auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
50938 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
50939     // end-users are ISD::AND, including cases such as
50940 // (and(extract_vector_element(SVN), Y)).
50941 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
50942 !SVN->getOperand(1).isUndef()) {
50943 return SDValue();
50944 }
50945 SDValue IVEN = SVN->getOperand(0);
50946 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
50947 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
50948 return SDValue();
50949 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
50950 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
50951 return SDValue();
50952 SDValue Src = IVEN.getOperand(1);
50953 if (SDValue Not = IsNOT(Src, DAG)) {
50954 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
50955 SDValue NotIVEN =
50956           DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
50957 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
50958 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
50959 SVN->getOperand(1), SVN->getMask());
50960 }
50961 return SDValue();
50962 };
50963
50964 SDValue X, Y;
50965 SDValue N0 = N->getOperand(0);
50966 SDValue N1 = N->getOperand(1);
50967 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50968
50969 if (SDValue Not = GetNot(N0)) {
50970 X = Not;
50971 Y = N1;
50972 } else if (SDValue Not = GetNot(N1)) {
50973 X = Not;
50974 Y = N0;
50975 } else
50976 return SDValue();
50977
50978 X = DAG.getBitcast(VT, X);
50979 Y = DAG.getBitcast(VT, Y);
50980 SDLoc DL(N);
50981
50982 // We do not split for SSE at all, but we need to split vectors for AVX1 and
50983 // AVX2.
50984 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
50986 SDValue LoX, HiX;
50987 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
50988 SDValue LoY, HiY;
50989 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
50990 EVT SplitVT = LoX.getValueType();
50991 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
50992 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
50993 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
50994 }
50995
50996 if (TLI.isTypeLegal(VT))
50997 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
50998
50999 return SDValue();
51000}
51001
51002// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
51003// logical operations, like in the example below.
51004// or (and (truncate x, truncate y)),
51005// (xor (truncate z, build_vector (constants)))
51006// Given a target type \p VT, we generate
51007// or (and x, y), (xor z, zext(build_vector (constants)))
51008 // where x, y and z are of type \p VT. We can do so if each operand is either a
51009 // truncate from VT, can be recursively promoted, or (for the second operand)
51010 // is a vector of constants or an existing extension we can extend further.
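// e.g. and (trunc v8i32 X to v8i16), (trunc v8i32 Y to v8i16) is rebuilt as
// and X, Y of type v8i32, so the caller can remove the surrounding casts.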
51011 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
51012                                      SelectionDAG &DAG,
51013 const X86Subtarget &Subtarget,
51014 unsigned Depth) {
51015 // Limit recursion to avoid excessive compile times.
51016   if (Depth >= SelectionDAG::MaxRecursionDepth)
51017 return SDValue();
51018
51019 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
51020 return SDValue();
51021
51022 SDValue N0 = N.getOperand(0);
51023 SDValue N1 = N.getOperand(1);
51024
51025 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51026 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
51027 return SDValue();
51028
51029 if (SDValue NN0 =
51030 PromoteMaskArithmetic(N0, DL, VT, DAG, Subtarget, Depth + 1))
51031 N0 = NN0;
51032 else {
51033 // The left side has to be a 'trunc'.
51034 bool LHSTrunc = N0.getOpcode() == ISD::TRUNCATE &&
51035 N0.getOperand(0).getValueType() == VT;
51036 if (LHSTrunc)
51037 N0 = N0.getOperand(0);
51038 else
51039 return SDValue();
51040 }
51041
51042 if (SDValue NN1 =
51043 PromoteMaskArithmetic(N1, DL, VT, DAG, Subtarget, Depth + 1))
51044 N1 = NN1;
51045 else {
51046 // The right side has to be a 'trunc', a (foldable) constant or an
51047 // existing extension we can extend further.
51048 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
51049 N1.getOperand(0).getValueType() == VT;
51050 if (RHSTrunc)
51051 N1 = N1.getOperand(0);
51052 else if (ISD::isExtVecInRegOpcode(N1.getOpcode()) && VT.is256BitVector() &&
51053 Subtarget.hasInt256() && N1.hasOneUse())
51054 N1 = DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0));
51055 else if (SDValue Cst =
51056                  DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
51057 N1 = Cst;
51058 else
51059 return SDValue();
51060 }
51061
51062 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
51063}
51064
51065// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
51066// register. In most cases we actually compare or select YMM-sized registers
51067// and mixing the two types creates horrible code. This method optimizes
51068// some of the transition sequences.
51069// Even with AVX-512 this is still useful for removing casts around logical
51070// operations on vXi1 mask types.
51071 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
51072 SelectionDAG &DAG,
51073 const X86Subtarget &Subtarget) {
51074 EVT VT = N.getValueType();
51075 assert(VT.isVector() && "Expected vector type");
51076 assert((N.getOpcode() == ISD::ANY_EXTEND ||
51077 N.getOpcode() == ISD::ZERO_EXTEND ||
51078 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
51079
51080 SDValue Narrow = N.getOperand(0);
51081 EVT NarrowVT = Narrow.getValueType();
51082
51083 // Generate the wide operation.
51084 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, 0);
51085 if (!Op)
51086 return SDValue();
51087 switch (N.getOpcode()) {
51088 default: llvm_unreachable("Unexpected opcode");
51089 case ISD::ANY_EXTEND:
51090 return Op;
51091 case ISD::ZERO_EXTEND:
51092 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
51093 case ISD::SIGN_EXTEND:
51094 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
51095 Op, DAG.getValueType(NarrowVT));
51096 }
51097}
51098
51099static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
51100 unsigned FPOpcode;
51101 switch (Opcode) {
51102 // clang-format off
51103 default: llvm_unreachable("Unexpected input node for FP logic conversion");
51104 case ISD::AND: FPOpcode = X86ISD::FAND; break;
51105 case ISD::OR: FPOpcode = X86ISD::FOR; break;
51106 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
51107 // clang-format on
51108 }
51109 return FPOpcode;
51110}
51111
51112/// If both input operands of a logic op are being cast from floating-point
51113/// types or FP compares, try to convert this into a floating-point logic node
51114/// to avoid unnecessary moves from SSE to integer registers.
51115static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT,
51116 SDValue N0, SDValue N1,
51117 SelectionDAG &DAG,
51118                                          TargetLowering::DAGCombinerInfo &DCI,
51119 const X86Subtarget &Subtarget) {
51120 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51121 "Unexpected bit opcode");
51122
51123 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
51124 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
51125 return SDValue();
51126
51127 SDValue N00 = N0.getOperand(0);
51128 SDValue N10 = N1.getOperand(0);
51129 EVT N00Type = N00.getValueType();
51130 EVT N10Type = N10.getValueType();
51131
51132 // Ensure that both types are the same and are legal scalar fp types.
51133 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
51134 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
51135 (Subtarget.hasFP16() && N00Type == MVT::f16)))
51136 return SDValue();
51137
51138 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
51139 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(Opc);
51140 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
51141 return DAG.getBitcast(VT, FPLogic);
51142 }
51143
51144 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
51145 !N1.hasOneUse())
51146 return SDValue();
51147
51148 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
51149 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
51150
51151 // The vector ISA for FP predicates is incomplete before AVX, so converting
51152 // COMIS* to CMPS* may not be a win before AVX.
51153 if (!Subtarget.hasAVX() &&
51154 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
51155 return SDValue();
51156
51157 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
51158 // and vector logic:
51159 // logic (setcc N00, N01), (setcc N10, N11) -->
51160 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
51161 unsigned NumElts = 128 / N00Type.getSizeInBits();
51162 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
51163 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
51164 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
51165 SDValue N01 = N0.getOperand(1);
51166 SDValue N11 = N1.getOperand(1);
51167 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
51168 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
51169 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
51170 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
51171 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
51172 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
51173 SDValue Logic = DAG.getNode(Opc, DL, BoolVecVT, Setcc0, Setcc1);
51174 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
51175}
51176
51177// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
51178// to reduce XMM->GPR traffic.
51179static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0,
51180 SDValue N1, SelectionDAG &DAG) {
51181 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51182 "Unexpected bit opcode");
51183
51184 // Both operands must be single use MOVMSK.
51185 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
51186 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
51187 return SDValue();
51188
51189 SDValue Vec0 = N0.getOperand(0);
51190 SDValue Vec1 = N1.getOperand(0);
51191 EVT VecVT0 = Vec0.getValueType();
51192 EVT VecVT1 = Vec1.getValueType();
51193
51194 // Both MOVMSK operands must be from vectors of the same size and same element
51195   // size, but it's OK for an fp/int difference.
51196 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
51197 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
51198 return SDValue();
51199
51200 unsigned VecOpc =
51201       VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
51202 SDValue Result =
51203 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
51204 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
51205}
51206
51207// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
51208// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
51209// handles in InstCombine.
51210static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT,
51211 SDValue N0, SDValue N1,
51212 SelectionDAG &DAG) {
51213 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51214 "Unexpected bit opcode");
51215
51216 // Both operands must be single use.
51217 if (!N0.hasOneUse() || !N1.hasOneUse())
51218 return SDValue();
51219
51220 // Search for matching shifts.
51221   SDValue BC0 = peekThroughOneUseBitcasts(N0);
51222   SDValue BC1 = peekThroughOneUseBitcasts(N1);
51223
51224 unsigned BCOpc = BC0.getOpcode();
51225 EVT BCVT = BC0.getValueType();
51226 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
51227 return SDValue();
51228
51229 switch (BCOpc) {
51230 case X86ISD::VSHLI:
51231 case X86ISD::VSRLI:
51232 case X86ISD::VSRAI: {
51233 if (BC0.getOperand(1) != BC1.getOperand(1))
51234 return SDValue();
51235 SDValue BitOp =
51236 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
51237 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
51238 return DAG.getBitcast(VT, Shift);
51239 }
51240 }
51241
51242 return SDValue();
51243}
51244
51245// Attempt to fold:
51246// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
51247 // TODO: Add PACKUS handling.
51248static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT,
51249 SDValue N0, SDValue N1, SelectionDAG &DAG) {
51250 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51251 "Unexpected bit opcode");
51252
51253 // Both operands must be single use.
51254 if (!N0.hasOneUse() || !N1.hasOneUse())
51255 return SDValue();
51256
51257 // Search for matching packs.
51258   N0 = peekThroughOneUseBitcasts(N0);
51259   N1 = peekThroughOneUseBitcasts(N1);
51260
51261 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
51262 return SDValue();
51263
51264 MVT DstVT = N0.getSimpleValueType();
51265 if (DstVT != N1.getSimpleValueType())
51266 return SDValue();
51267
51268 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
51269 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
51270
51271 // Limit to allsignbits packing.
51272 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
51273 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
51274 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
51275 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
51276 return SDValue();
51277
51278 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
51279 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
51280 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
51281}
51282
51283 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
51284 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
51285/// with a shift-right to eliminate loading the vector constant mask value.
51286 static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL,
51287 SelectionDAG &DAG,
51288 const X86Subtarget &Subtarget) {
51289 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
51290 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
51291 EVT VT = Op0.getValueType();
51292 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
51293 return SDValue();
51294
51295 // Try to convert an "is positive" signbit masking operation into arithmetic
51296 // shift and "andn". This saves a materialization of a -1 vector constant.
51297 // The "is negative" variant should be handled more generally because it only
51298 // requires "and" rather than "andn":
51299 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
51300 //
51301 // This is limited to the original type to avoid producing even more bitcasts.
51302 // If the bitcasts can't be eliminated, then it is unlikely that this fold
51303 // will be profitable.
51304 if (N->getValueType(0) == VT &&
51305 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
51306 SDValue X, Y;
51307 if (Op1.getOpcode() == X86ISD::PCMPGT &&
51308 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
51309 X = Op1.getOperand(0);
51310 Y = Op0;
51311 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
51312 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
51313 X = Op0.getOperand(0);
51314 Y = Op1;
51315 }
51316 if (X && Y) {
51317 SDValue Sra =
51318           getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
51319 VT.getScalarSizeInBits() - 1, DAG);
51320 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
51321 }
51322 }
51323
51324 APInt SplatVal;
51325 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
51326 return SDValue();
51327
51328 // Don't prevent creation of ANDN.
51329 if (isBitwiseNot(Op0))
51330 return SDValue();
51331
51332 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
51333 return SDValue();
51334
51335 unsigned EltBitWidth = VT.getScalarSizeInBits();
51336 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
51337 return SDValue();
51338
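  // e.g. with v4i32, an all-sign-bits Op0 and SplatVal == 1, the AND becomes a
  // logical right shift of Op0 by 31, avoiding a constant mask load.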
51339 unsigned ShiftVal = SplatVal.countr_one();
51340 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
51341 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
51342 return DAG.getBitcast(N->getValueType(0), Shift);
51343}
51344
51345// Get the index node from the lowered DAG of a GEP IR instruction with one
51346// indexing dimension.
51347 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
51348 if (Ld->isIndexed())
51349 return SDValue();
51350
51351 SDValue Base = Ld->getBasePtr();
51352 if (Base.getOpcode() != ISD::ADD)
51353 return SDValue();
51354
51355 SDValue ShiftedIndex = Base.getOperand(0);
51356 if (ShiftedIndex.getOpcode() != ISD::SHL)
51357 return SDValue();
51358
51359 return ShiftedIndex.getOperand(0);
51360}
51361
51362static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
51363 return Subtarget.hasBMI2() &&
51364 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
51365}
51366
51367/// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
51368/// This undoes the inverse fold performed in InstCombine
51369 static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL,
51370 SelectionDAG &DAG) {
51371 using namespace llvm::SDPatternMatch;
51372 MVT VT = N->getSimpleValueType(0);
51373 if (!DAG.getTargetLoweringInfo().hasAndNot(SDValue(N, 0)))
51374 return SDValue();
51375
51376 SDValue X, Y, Z;
51377 if (sd_match(N, m_And(m_Value(X),
51378 m_OneUse(m_Or(m_Value(Y), m_Not(m_Value(Z))))))) {
51379 // Don't fold if Y or Z are constants to prevent infinite loops.
51380     if (!DAG.isConstantIntBuildVectorOrConstantInt(Y) &&
51381         !DAG.isConstantIntBuildVectorOrConstantInt(Z))
51382 return DAG.getNode(
51383 ISD::AND, DL, VT, X,
51384 DAG.getNOT(
51385 DL, DAG.getNode(ISD::AND, DL, VT, DAG.getNOT(DL, Y, VT), Z), VT));
51386 }
51387
51388 return SDValue();
51389}
51390
51391 // This function recognizes cases where the X86 bzhi instruction can replace an
51392 // 'and-load' sequence.
51393 // When an integer value is loaded from an array of constants which is defined
51394 // as follows:
51395//
51396// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
51397//
51398 // and a bitwise and is then applied to the loaded value and another input,
51399 // this is equivalent to performing bzhi (zero high bits) on that input, using
51400 // the same index as the load.
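// e.g. (and (load arr[idx]), x), where arr[j] == (1 << j) - 1 for every j,
// computes the same value as (bzhi x, idx).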
51401 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
51402                                     const X86Subtarget &Subtarget) {
51403 MVT VT = Node->getSimpleValueType(0);
51404 SDLoc dl(Node);
51405
51406 // Check if subtarget has BZHI instruction for the node's type
51407 if (!hasBZHI(Subtarget, VT))
51408 return SDValue();
51409
51410 // Try matching the pattern for both operands.
51411 for (unsigned i = 0; i < 2; i++) {
51412 // continue if the operand is not a load instruction
51413 auto *Ld = dyn_cast<LoadSDNode>(Node->getOperand(i));
51414 if (!Ld)
51415 continue;
51416 const Value *MemOp = Ld->getMemOperand()->getValue();
51417 if (!MemOp)
51418 continue;
51419 // Get the Node which indexes into the array.
51420     SDValue Index = getIndexFromUnindexedLoad(Ld);
51421 if (!Index)
51422 continue;
51423
51424 if (auto *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
51425 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
51426 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
51427 Constant *Init = GV->getInitializer();
51428 Type *Ty = Init->getType();
51429           if (!Ty->isArrayTy() ||
51430 !Ty->getArrayElementType()->isIntegerTy() ||
51431 Ty->getArrayElementType()->getScalarSizeInBits() !=
51432 VT.getSizeInBits() ||
51433 Ty->getArrayNumElements() >
51434 Ty->getArrayElementType()->getScalarSizeInBits())
51435 continue;
51436
51437 // Check if the array's constant elements are suitable to our case.
51438 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
51439 bool ConstantsMatch = true;
51440 for (uint64_t j = 0; j < ArrayElementCount; j++) {
51441 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
51442 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
51443 ConstantsMatch = false;
51444 break;
51445 }
51446 }
51447 if (!ConstantsMatch)
51448 continue;
51449
51450           // Do the transformation (for a 32-bit type):
51451 // -> (and (load arr[idx]), inp)
51452 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
51453 // that will be replaced with one bzhi instruction.
51454 SDValue Inp = Node->getOperand(i == 0 ? 1 : 0);
51455 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
51456
51457 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
51458 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
51459 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
51460
51461 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
51462 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
51463 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
51464 }
51465 }
51466 }
51467 }
51468 return SDValue();
51469}
51470
51471 // Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef, ...))), C)
51472 // where C is a mask containing the same number of bits as the setcc and
51473 // where the setcc will freely zero the upper bits of the k-register. We can
51474 // replace the undef in the concat with 0s and remove the AND. This mainly
51475 // helps with v2i1/v4i1 setcc being cast to scalar.
51476 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
51477 const X86Subtarget &Subtarget) {
51478 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
51479
51480 EVT VT = N->getValueType(0);
51481
51482 // Make sure this is an AND with constant. We will check the value of the
51483 // constant later.
51484 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
51485 if (!C1)
51486 return SDValue();
51487
51488 // This is implied by the ConstantSDNode.
51489 assert(!VT.isVector() && "Expected scalar VT!");
51490
51491 SDValue Src = N->getOperand(0);
51492 if (!Src.hasOneUse())
51493 return SDValue();
51494
51495 // (Optionally) peek through any_extend().
51496 if (Src.getOpcode() == ISD::ANY_EXTEND) {
51497 if (!Src.getOperand(0).hasOneUse())
51498 return SDValue();
51499 Src = Src.getOperand(0);
51500 }
51501
51502 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
51503 return SDValue();
51504
51505 Src = Src.getOperand(0);
51506 EVT SrcVT = Src.getValueType();
51507
51508 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51509 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
51510 !TLI.isTypeLegal(SrcVT))
51511 return SDValue();
51512
51513 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
51514 return SDValue();
51515
51516   // We only care about the first subvector of the concat; we expect the
51517 // other subvectors to be ignored due to the AND if we make the change.
51518 SDValue SubVec = Src.getOperand(0);
51519 EVT SubVecVT = SubVec.getValueType();
51520
51521 // The RHS of the AND should be a mask with as many bits as SubVec.
51522 if (!TLI.isTypeLegal(SubVecVT) ||
51523 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
51524 return SDValue();
51525
51526   // The first subvector should be a setcc with a legal result type or an
51527   // AND containing at least one setcc with a legal result type.
51528 auto IsLegalSetCC = [&](SDValue V) {
51529 if (V.getOpcode() != ISD::SETCC)
51530 return false;
51531 EVT SetccVT = V.getOperand(0).getValueType();
51532 if (!TLI.isTypeLegal(SetccVT) ||
51533 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
51534 return false;
51535 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
51536 return false;
51537 return true;
51538 };
51539 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
51540 (IsLegalSetCC(SubVec.getOperand(0)) ||
51541 IsLegalSetCC(SubVec.getOperand(1))))))
51542 return SDValue();
51543
51544 // We passed all the checks. Rebuild the concat_vectors with zeroes
51545 // and cast it back to VT.
51546 SDLoc dl(N);
51547 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
51548 DAG.getConstant(0, dl, SubVecVT));
51549 Ops[0] = SubVec;
51550 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
51551 Ops);
51552 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
51553 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
51554}
51555
51556 static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
51557 SDValue OpMustEq, SDValue Op, unsigned Depth) {
51558 // We don't want to go crazy with the recursion here. This isn't a super
51559 // important optimization.
51560 static constexpr unsigned kMaxDepth = 2;
51561
51562 // Only do this re-ordering if op has one use.
51563 if (!Op.hasOneUse())
51564 return SDValue();
51565
51566 SDLoc DL(Op);
51567   // If we hit another associative op, recurse further.
51568 if (Op.getOpcode() == Opc) {
51569 // Done recursing.
51570 if (Depth++ >= kMaxDepth)
51571 return SDValue();
51572
51573 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51574 if (SDValue R =
51575 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
51576 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
51577 Op.getOperand(1 - OpIdx));
51578
51579 } else if (Op.getOpcode() == ISD::SUB) {
51580 if (Opc == ISD::AND) {
51581 // BLSI: (and x, (sub 0, x))
51582 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
51583 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51584 }
51585 // Opc must be ISD::AND or ISD::XOR
51586 // BLSR: (and x, (sub x, 1))
51587 // BLSMSK: (xor x, (sub x, 1))
51588 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51589 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51590
51591 } else if (Op.getOpcode() == ISD::ADD) {
51592 // Opc must be ISD::AND or ISD::XOR
51593 // BLSR: (and x, (add x, -1))
51594 // BLSMSK: (xor x, (add x, -1))
51595 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51596 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51597 }
51598 return SDValue();
51599}
51600
51601 static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
51602 const X86Subtarget &Subtarget) {
51603 EVT VT = N->getValueType(0);
51604 // Make sure this node is a candidate for BMI instructions.
51605 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
51606 (VT != MVT::i32 && VT != MVT::i64))
51607 return SDValue();
51608
51609 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
51610
51611 // Try and match LHS and RHS.
51612 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51613 if (SDValue OpMatch =
51614 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
51615 N->getOperand(1 - OpIdx), 0))
51616 return OpMatch;
51617 return SDValue();
51618}
51619
51620/// Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
51621 static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL,
51622 SelectionDAG &DAG,
51623 const X86Subtarget &Subtarget) {
51624 using namespace llvm::SDPatternMatch;
51625
51626 EVT VT = And->getValueType(0);
51627 // Make sure this node is a candidate for BMI instructions.
51628 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
51629 return SDValue();
51630
51631 SDValue X;
51632 SDValue Y;
51633   if (!sd_match(And,
51634                 m_And(m_OneUse(m_Xor(m_Value(X), m_Neg(m_Deferred(X)))),
51635 m_Value(Y))))
51636 return SDValue();
51637
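  // XOR(X, NEG(X)) is all-zeros at the lowest set bit of X and below and
  // all-ones above it, i.e. NOT(BLSMSK(X)), so the AND can be rewritten as
  // AND(Y, NOT(BLSMSK(X))).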
51638 SDValue BLSMSK =
51639 DAG.getNode(ISD::XOR, DL, VT, X,
51640 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getConstant(1, DL, VT)));
51641 SDValue AndN = DAG.getNode(ISD::AND, DL, VT, Y, DAG.getNOT(DL, BLSMSK, VT));
51642 return AndN;
51643}
51644
51645 static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag,
51646 SelectionDAG &DAG,
51647                                         TargetLowering::DAGCombinerInfo &DCI,
51648 const X86Subtarget &ST) {
51649 // cmp(setcc(cc, X), 0)
51650 // brcond ne
51651 // ->
51652 // X
51653 // brcond cc
51654
51655 // sub(setcc(cc, X), 1)
51656 // brcond ne
51657 // ->
51658 // X
51659 // brcond ~cc
51660 //
51661 // if only flag has users
51662
51663 SDValue SetCC = N->getOperand(0);
51664
51665 if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse())
51666 return SDValue();
51667
51668 // Check the only user of flag is `brcond ne`.
51669 SDNode *BrCond = *Flag->user_begin();
51670 if (BrCond->getOpcode() != X86ISD::BRCOND)
51671 return SDValue();
51672 unsigned CondNo = 2;
51673 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
51674       X86::COND_NE)
51675 return SDValue();
51676
51677 SDValue X = SetCC.getOperand(1);
51678   // sub has two results while X only has one. DAG combine assumes the value
51679   // types match.
51680 if (N->getOpcode() == X86ISD::SUB)
51681 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
51682
51683 SDValue CCN = SetCC.getOperand(0);
51684 X86::CondCode CC =
51685 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
51686   X86::CondCode OppositeCC = X86::GetOppositeBranchCondition(CC);
51687 // Update CC for the consumer of the flag.
51688 // The old CC is `ne`. Hence, when comparing the result with 0, we are
51689 // checking if the second condition evaluates to true. When comparing the
51690   // result with 1, we are checking if the second condition evaluates to false.
51691   SmallVector<SDValue> Ops(BrCond->op_values());
51692 if (isNullConstant(N->getOperand(1)))
51693 Ops[CondNo] = CCN;
51694 else if (isOneConstant(N->getOperand(1)))
51695 Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8);
51696 else
51697 llvm_unreachable("expect constant 0 or 1");
51698
51699 SDValue NewBrCond =
51700 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
51701 // Avoid self-assign error b/c CC1 can be `e/ne`.
51702 if (BrCond != NewBrCond.getNode())
51703 DCI.CombineTo(BrCond, NewBrCond);
51704 return X;
51705}
51706
51707 static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
51708                                         TargetLowering::DAGCombinerInfo &DCI,
51709 const X86Subtarget &ST) {
51710 // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y)))
51711 // ->
51712 // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0))
51713
51714 // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0)))
51715 // ->
51716 // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0))
51717 //
51718 // where cflags is determined by cc1.
51719
51720 if (!ST.hasCCMP())
51721 return SDValue();
51722
51723 SDValue SetCC0 = N->getOperand(0);
51724 SDValue SetCC1 = N->getOperand(1);
51725 if (SetCC0.getOpcode() != X86ISD::SETCC ||
51726 SetCC1.getOpcode() != X86ISD::SETCC)
51727 return SDValue();
51728
51729 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
51730 SDValue Op = V.getOperand(1);
51731 unsigned Opc = Op.getOpcode();
51732 if (Opc == X86ISD::SUB)
51733 return X86ISD::CCMP;
51734 if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1)))
51735 return X86ISD::CTEST;
51736 return 0U;
51737 };
51738
51739 unsigned NewOpc = 0;
51740
51741 // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP
51742 // appear on the right.
51743 if (!(NewOpc = GetCombineToOpc(SetCC1))) {
51744 std::swap(SetCC0, SetCC1);
51745 if (!(NewOpc = GetCombineToOpc(SetCC1)))
51746 return SDValue();
51747 }
51748
51749 X86::CondCode CC0 =
51750 static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
51751 // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP.
51752 if (CC0 == X86::COND_P || CC0 == X86::COND_NP)
51753 return SDValue();
51754
51755 bool IsOR = N->getOpcode() == ISD::OR;
51756
51757 // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC
51758   // evaluates to true. So we need to use the inverse of CC0 as SrcCC when the
51759   // logic operator is OR. Similarly for CC1.
51760 SDValue SrcCC =
51761       IsOR ? DAG.getTargetConstant(X86::GetOppositeBranchCondition(CC0),
51762 SDLoc(SetCC0.getOperand(0)), MVT::i8)
51763 : SetCC0.getOperand(0);
51764 SDValue CC1N = SetCC1.getOperand(0);
51765 X86::CondCode CC1 =
51766 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
51767   X86::CondCode OppositeCC1 = X86::GetOppositeBranchCondition(CC1);
51768 X86::CondCode CFlagsCC = IsOR ? CC1 : OppositeCC1;
51769 SDLoc DL(N);
51770 SDValue CFlags = DAG.getTargetConstant(
51771 X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8);
51772 SDValue Sub = SetCC1.getOperand(1);
51773
51774 // Replace any uses of the old flag produced by SUB/CMP with the new one
51775 // produced by CCMP/CTEST.
51776 SDValue CCMP = (NewOpc == X86ISD::CCMP)
51777 ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32,
51778 {Sub.getOperand(0), Sub.getOperand(1),
51779 CFlags, SrcCC, SetCC0.getOperand(1)})
51780 : DAG.getNode(X86ISD::CTEST, DL, MVT::i32,
51781 {Sub.getOperand(0), Sub.getOperand(0),
51782 CFlags, SrcCC, SetCC0.getOperand(1)});
51783
51784 return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP});
51785}
51786
51787 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
51788                           TargetLowering::DAGCombinerInfo &DCI,
51789 const X86Subtarget &Subtarget) {
51790 using namespace SDPatternMatch;
51791
51792 SDValue N0 = N->getOperand(0);
51793 SDValue N1 = N->getOperand(1);
51794 EVT VT = N->getValueType(0);
51795 SDLoc dl(N);
51796 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51797
51798 // If this is SSE1 only convert to FAND to avoid scalarization.
51799 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51800 return DAG.getBitcast(MVT::v4i32,
51801 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
51802 DAG.getBitcast(MVT::v4f32, N0),
51803 DAG.getBitcast(MVT::v4f32, N1)));
51804 }
51805
51806 // Use a 32-bit and+zext if upper bits known zero.
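  // e.g. (and i64 X, Y) with the upper 32 bits of either operand known zero
  // becomes (zext i64 (and i32 (trunc X), (trunc Y))).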
51807 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
51808 APInt HiMask = APInt::getHighBitsSet(64, 32);
51809 if (DAG.MaskedValueIsZero(N1, HiMask) ||
51810 DAG.MaskedValueIsZero(N0, HiMask)) {
51811 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
51812 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
51813 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
51814 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
51815 }
51816 }
51817
51818 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
51819 // TODO: Support multiple SrcOps.
51820 if (VT == MVT::i1) {
51821     SmallVector<SDValue, 2> SrcOps;
51822 SmallVector<APInt, 2> SrcPartials;
51823 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
51824 SrcOps.size() == 1) {
51825 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51826 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51827 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51828 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51829 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51830 if (Mask) {
51831 assert(SrcPartials[0].getBitWidth() == NumElts &&
51832 "Unexpected partial reduction mask");
51833 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51834 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51835 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
51836 }
51837 }
51838 }
51839
51840 // InstCombine converts:
51841 // `(-x << C0) & C1`
51842 // to
51843 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
51844 // This saves an IR instruction but on x86 the neg/shift version is preferable
51845 // so undo the transform.
51846
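  // e.g. with C0 == 2 and C1 == 0xFC, InstCombine produces (x * 252) & 0xFC,
  // which is rebuilt here as ((-x) << 2) & 0xFC.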
51847 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
51848 // TODO: We don't actually need a splat for this, we just need the checks to
51849 // hold for each element.
51850 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
51851 /*AllowTruncation*/ false);
51852 ConstantSDNode *N01C =
51853 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
51854 /*AllowTruncation*/ false);
51855 if (N1C && N01C) {
51856 const APInt &MulC = N01C->getAPIntValue();
51857 const APInt &AndC = N1C->getAPIntValue();
51858 APInt MulCLowBit = MulC & (-MulC);
51859 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
51860 (MulCLowBit + MulC).isPowerOf2()) {
51861 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
51862 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
51863 assert(MulCLowBitLog != -1 &&
51864 "Isolated lowbit is somehow not a power of 2!");
51865 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
51866 DAG.getConstant(MulCLowBitLog, dl, VT));
51867 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
51868 }
51869 }
51870 }
51871
51872 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
51873 return SetCC;
51874
51875 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
51876 return V;
51877
51878 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51879 return R;
51880
51881 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51882 return R;
51883
51884 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51885 return R;
51886
51887 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51888 DAG, DCI, Subtarget))
51889 return FPLogic;
51890
51891 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
51892 return R;
51893
51894 if (DCI.isBeforeLegalizeOps())
51895 return SDValue();
51896
51897 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51898 return R;
51899
51900   if (SDValue R = combineAndNotIntoANDNP(N, dl, DAG))
51901 return R;
51902
51903 if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget))
51904 return ShiftRight;
51905
51906 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
51907 return R;
51908
51909 if (SDValue R = combineAndNotOrIntoAndNotAnd(N, dl, DAG))
51910 return R;
51911
51912 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
51913   // iff c2 is an all-bits/no-bits mask per element - i.e. a select-with-zero mask.
51914 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
51915 if (VT.isVector() && getTargetConstantFromNode(N1)) {
51916 unsigned Opc0 = N0.getOpcode();
51917 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
51918         getTargetConstantFromNode(N0.getOperand(1)) &&
51919 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
51920 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
51921 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
51922 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
51923 }
51924 }
51925
51926   // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask
51927   // to make use of predicated selects.
51928 // AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0)
51929 if (DCI.isAfterLegalizeDAG() && VT.isVector()) {
51930 SDValue X, Y;
51931 EVT CondVT = VT.changeVectorElementType(MVT::i1);
51932 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
51933 (VT.is512BitVector() || Subtarget.hasVLX()) &&
51934 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
51935         sd_match(N, m_And(m_Value(X),
51936                           m_OneUse(m_SExt(m_AllOf(
51937 m_Value(Y), m_SpecificVT(CondVT),
51938 m_SetCC(m_Value(), m_Value(), m_Value()))))))) {
51939 return DAG.getSelect(dl, VT, Y, X,
51940 getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl));
51941 }
51942 }
51943
51944   // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant.
51945   // This avoids a slow variable shift (moving the shift amount to ECX etc.).
51946 if (isOneConstant(N1) && N0->hasOneUse()) {
51947 SDValue Src = N0;
51948 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
51949 Src.getOpcode() == ISD::TRUNCATE) &&
51950 Src.getOperand(0)->hasOneUse())
51951 Src = Src.getOperand(0);
51952 bool ContainsNOT = false;
51953 X86::CondCode X86CC = X86::COND_B;
51954 // Peek through AND(NOT(SRL(X,Y)),1).
51955 if (isBitwiseNot(Src)) {
51956 Src = Src.getOperand(0);
51957 X86CC = X86::COND_AE;
51958 ContainsNOT = true;
51959 }
51960 if (Src.getOpcode() == ISD::SRL &&
51961 !isa<ConstantSDNode>(Src.getOperand(1))) {
51962 SDValue BitNo = Src.getOperand(1);
51963 Src = Src.getOperand(0);
51964 // Peek through AND(SRL(NOT(X),Y),1).
51965 if (isBitwiseNot(Src)) {
51966 Src = Src.getOperand(0);
51967 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
51968 ContainsNOT = true;
51969 }
51970 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
51971 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
51972 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
51973 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
51974 }
51975 }
51976
51977 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51978 // Attempt to recursively combine a bitmask AND with shuffles.
51979 SDValue Op(N, 0);
51980 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51981 return Res;
51982
51983 // If either operand is a constant mask, then only the elements that aren't
51984 // zero are actually demanded by the other operand.
51985 auto GetDemandedMasks = [&](SDValue Op) {
51986 APInt UndefElts;
51987 SmallVector<APInt> EltBits;
51988 int NumElts = VT.getVectorNumElements();
51989 int EltSizeInBits = VT.getScalarSizeInBits();
51990 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
51991 APInt DemandedElts = APInt::getAllOnes(NumElts);
51992 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
51993 EltBits)) {
51994 DemandedBits.clearAllBits();
51995 DemandedElts.clearAllBits();
51996 for (int I = 0; I != NumElts; ++I) {
51997 if (UndefElts[I]) {
51998 // We can't assume an undef src element gives an undef dst - the
51999 // other src might be zero.
52000 DemandedBits.setAllBits();
52001 DemandedElts.setBit(I);
52002 } else if (!EltBits[I].isZero()) {
52003 DemandedBits |= EltBits[I];
52004 DemandedElts.setBit(I);
52005 }
52006 }
52007 }
52008 return std::make_pair(DemandedBits, DemandedElts);
52009 };
52010 APInt Bits0, Elts0;
52011 APInt Bits1, Elts1;
52012 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
52013 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
52014
52015 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
52016 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
52017 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
52018 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
52019 if (N->getOpcode() != ISD::DELETED_NODE)
52020 DCI.AddToWorklist(N);
52021 return SDValue(N, 0);
52022 }
52023
52024 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
52025 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
52026 if (NewN0 || NewN1)
52027 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
52028 NewN1 ? NewN1 : N1);
52029 }
52030
52031 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
52032 if ((VT.getScalarSizeInBits() % 8) == 0 &&
52033       N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
52034 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
52035 SDValue BitMask = N1;
52036 SDValue SrcVec = N0.getOperand(0);
52037 EVT SrcVecVT = SrcVec.getValueType();
52038
52039 // Check that the constant bitmask masks whole bytes.
52040 APInt UndefElts;
52041 SmallVector<APInt, 64> EltBits;
52042 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
52043 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
52044 llvm::all_of(EltBits, [](const APInt &M) {
52045 return M.isZero() || M.isAllOnes();
52046 })) {
52047 unsigned NumElts = SrcVecVT.getVectorNumElements();
52048 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
52049 unsigned Idx = N0.getConstantOperandVal(1);
52050
52051 // Create a root shuffle mask from the byte mask and the extracted index.
52052 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
52053 for (unsigned i = 0; i != Scale; ++i) {
52054 if (UndefElts[i])
52055 continue;
52056 int VecIdx = Scale * Idx + i;
52057 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
52058 }
52059
52060     if (SDValue Shuffle = combineX86ShufflesRecursively(
52061 {SrcVec}, 0, SrcVec.getOpcode(), SrcVec.getSimpleValueType(),
52062 ShuffleMask, {}, /*Depth=*/1, X86::MaxShuffleCombineDepth,
52063 /*AllowVariableCrossLaneMask=*/true,
52064 /*AllowVariablePerLaneMask=*/true,
52065 /*IsMaskedShuffle=*/false, DAG, SDLoc(SrcVec), Subtarget))
52066 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
52067 N0.getOperand(1));
52068 }
52069 }
52070
52071 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
52072 return R;
52073
52074 if (SDValue R = combineAndXorSubWithBMI(N, dl, DAG, Subtarget))
52075 return R;
52076
52077 return SDValue();
52078}
52079
52080// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
52081 static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL,
52082 SelectionDAG &DAG,
52083 const X86Subtarget &Subtarget) {
52084 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52085
52086 MVT VT = N->getSimpleValueType(0);
52087 unsigned EltSizeInBits = VT.getScalarSizeInBits();
52088 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
52089 return SDValue();
52090
52091 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
52092 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
52093 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
52094 return SDValue();
52095
52096 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
52097 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
52098 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
52099 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
52100 return SDValue();
52101
52102 // Attempt to extract constant byte masks.
52103 APInt UndefElts0, UndefElts1;
52104 SmallVector<APInt, 32> EltBits0, EltBits1;
52105 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
52106 /*AllowWholeUndefs*/ false,
52107 /*AllowPartialUndefs*/ false))
52108 return SDValue();
52109 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
52110 /*AllowWholeUndefs*/ false,
52111 /*AllowPartialUndefs*/ false))
52112 return SDValue();
52113
52114 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
52115 // TODO - add UNDEF elts support.
52116 if (UndefElts0[i] || UndefElts1[i])
52117 return SDValue();
52118 if (EltBits0[i] != ~EltBits1[i])
52119 return SDValue();
52120 }
52121
52122 if (useVPTERNLOG(Subtarget, VT)) {
52123 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
52124 // VPTERNLOG is only available as vXi32/64-bit types.
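    // The immediate is the truth table indexed by (A << 2) | (B << 1) | C;
    // evaluating A ? B : C for indices 7..0 gives 0b11001010 == 0xCA.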
52125 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
52126 MVT OpVT =
52127 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
52128 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
52129 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
52130 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
52131 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
52132 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
52133 DAG, Subtarget);
52134 return DAG.getBitcast(VT, Res);
52135 }
52136
52137 SDValue X = N->getOperand(0);
52138 SDValue Y =
52139 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
52140 DAG.getBitcast(VT, N1.getOperand(0)));
52141 return DAG.getNode(ISD::OR, DL, VT, X, Y);
52142}
52143
52144// Try to match OR(ANDNP(MASK,X),AND(MASK,Y)) logic pattern.
52145// TODO: Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
52146// Waiting for ANDNP combine allows other combines to happen that prevent
52147// matching.
52148static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
52149 using namespace SDPatternMatch;
52150 return sd_match(N, m_Or(m_BinOp(X86ISD::ANDNP, m_Value(Mask), m_Value(X)),
52151 m_And(m_Deferred(Mask), m_Value(Y))));
52152}
52153
52154// Try to fold:
52155// (or (and (m, y), (pandn m, x)))
52156// into:
52157// (vselect m, x, y)
52158// As a special case, try to fold:
52159// (or (and (m, (sub 0, x)), (pandn m, x)))
52160// into:
52161// (sub (xor X, M), M)
52162 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL,
52163 SelectionDAG &DAG,
52164 const X86Subtarget &Subtarget) {
52165 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52166
52167 EVT VT = N->getValueType(0);
52168 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
52169 (VT.is256BitVector() && Subtarget.hasInt256())))
52170 return SDValue();
52171
52172 SDValue X, Y, Mask;
52173 if (!matchLogicBlend(N, X, Y, Mask))
52174 return SDValue();
52175
52176 // Validate that X, Y, and Mask are bitcasts, and see through them.
52177 Mask = peekThroughBitcasts(Mask);
52178   X = peekThroughBitcasts(X);
52179   Y = peekThroughBitcasts(Y);
52180
52181 EVT MaskVT = Mask.getValueType();
52182 unsigned EltBits = MaskVT.getScalarSizeInBits();
52183
52184 // TODO: Attempt to handle floating point cases as well?
52185 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
52186 return SDValue();
52187
52188 // Attempt to combine to conditional negate: (sub (xor X, M), M)
52189 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
52190 DAG, Subtarget))
52191 return Res;
52192
52193 // PBLENDVB is only available on SSE 4.1.
52194 if (!Subtarget.hasSSE41())
52195 return SDValue();
52196
52197 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
52198 if (Subtarget.hasVLX())
52199 return SDValue();
52200
52201 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
52202
52203 X = DAG.getBitcast(BlendVT, X);
52204 Y = DAG.getBitcast(BlendVT, Y);
52205 Mask = DAG.getBitcast(BlendVT, Mask);
52206 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
52207 return DAG.getBitcast(VT, Mask);
52208}
52209
52210// Helper function for combineOrCmpEqZeroToCtlzSrl
52211// Transforms:
52212// seteq(cmp x, 0)
52213// into:
52214// srl(ctlz x), log2(bitsize(x))
52215// Input pattern is checked by caller.
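// With LZCNT semantics ctlz(x) == bitsize(x) only when x == 0 and is strictly
// smaller otherwise, so shifting right by log2(bitsize(x)) leaves exactly the
// "x == 0" bit.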
52216 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
52217 SDValue Cmp = Op.getOperand(1);
52218 EVT VT = Cmp.getOperand(0).getValueType();
52219 unsigned Log2b = Log2_32(VT.getSizeInBits());
52220 SDLoc dl(Op);
52221 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
52222 // The result of the shift is true or false, and on X86, the 32-bit
52223 // encoding of shr and lzcnt is more desirable.
52224 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
52225 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
52226 DAG.getConstant(Log2b, dl, MVT::i8));
52227 return Scc;
52228}
52229
52230// Try to transform:
52231// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
52232// into:
52233 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
52234 // Will also attempt to match more generic cases, e.g.:
52235// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
52236// Only applies if the target supports the FastLZCNT feature.
52237 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
52238 TargetLowering::DAGCombinerInfo &DCI,
52239 const X86Subtarget &Subtarget) {
52240 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
52241 return SDValue();
52242
52243 auto isORCandidate = [](SDValue N) {
52244 return (N->getOpcode() == ISD::OR && N->hasOneUse());
52245 };
52246
52247 // Check the zero extend is extending to 32-bit or more. The code generated by
52248 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
52249 // instructions to clear the upper bits.
52250 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
52251 !isORCandidate(N->getOperand(0)))
52252 return SDValue();
52253
52254 // Check the node matches: setcc(eq, cmp 0)
52255 auto isSetCCCandidate = [](SDValue N) {
52256 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
52257 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
52258 N->getOperand(1).getOpcode() == X86ISD::CMP &&
52259 isNullConstant(N->getOperand(1).getOperand(1)) &&
52260 N->getOperand(1).getValueType().bitsGE(MVT::i32);
52261 };
52262
52263 SDNode *OR = N->getOperand(0).getNode();
52264 SDValue LHS = OR->getOperand(0);
52265 SDValue RHS = OR->getOperand(1);
52266
52267 // Save nodes matching or(or, setcc(eq, cmp 0)).
52268 SmallVector<SDNode *, 2> ORNodes;
52269 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
52270 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
52271 ORNodes.push_back(OR);
52272 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
52273 LHS = OR->getOperand(0);
52274 RHS = OR->getOperand(1);
52275 }
52276
52277 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
52278 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
52279 !isORCandidate(SDValue(OR, 0)))
52280 return SDValue();
52281
52282 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
52283 // to
52284 // or(srl(ctlz),srl(ctlz)).
52285 // The dag combiner can then fold it into:
52286 // srl(or(ctlz, ctlz)).
52287 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
52288 SDValue Ret, NewRHS;
52289 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
52290 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
52291
52292 if (!Ret)
52293 return SDValue();
52294
52295 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
52296 while (!ORNodes.empty()) {
52297 OR = ORNodes.pop_back_val();
52298 LHS = OR->getOperand(0);
52299 RHS = OR->getOperand(1);
52300 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
52301 if (RHS->getOpcode() == ISD::OR)
52302 std::swap(LHS, RHS);
52303 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
52304 if (!NewRHS)
52305 return SDValue();
52306 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
52307 }
52308
52309 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
52310}
52311
52312/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52313/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52314/// with CMP+{ADC, SBB}.
52315/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
52316static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
52317 SDValue X, SDValue Y,
52318 SelectionDAG &DAG,
52319 bool ZeroSecondOpOnly = false) {
52320 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
52321 return SDValue();
52322
52323 // Look through a one-use zext.
52324 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
52325 Y = Y.getOperand(0);
52326
52327 X86::CondCode CC;
52328 SDValue EFLAGS;
52329 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
52330 CC = (X86::CondCode)Y.getConstantOperandVal(0);
52331 EFLAGS = Y.getOperand(1);
52332 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
52333 Y.hasOneUse()) {
52334 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
52335 }
52336
52337 if (!EFLAGS)
52338 return SDValue();
52339
52340 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52341 // the general case below.
52342 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
52343 if (ConstantX && !ZeroSecondOpOnly) {
52344 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
52345 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
52346 // This is a complicated way to get -1 or 0 from the carry flag:
52347 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52348 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52349 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52350 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52351 EFLAGS);
52352 }
52353
52354 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
52355 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
52356 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
52357 EFLAGS.getValueType().isInteger() &&
52358 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52359 // Swap the operands of a SUB, and we have the same pattern as above.
52360 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
52361 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
52362 SDValue NewSub = DAG.getNode(
52363 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52364 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52365 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
52366 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52367 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52368 NewEFLAGS);
52369 }
52370 }
52371 }
52372
52373 if (CC == X86::COND_B) {
52374 // X + SETB Z --> adc X, 0
52375 // X - SETB Z --> sbb X, 0
52376 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52377 DAG.getVTList(VT, MVT::i32), X,
52378 DAG.getConstant(0, DL, VT), EFLAGS);
52379 }
52380
52381 if (ZeroSecondOpOnly)
52382 return SDValue();
52383
52384 if (CC == X86::COND_A) {
52385 // Try to convert COND_A into COND_B in an attempt to facilitate
52386 // materializing "setb reg".
52387 //
52388 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
52389 // cannot take an immediate as its first operand.
52390 //
52391 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52392 EFLAGS.getValueType().isInteger() &&
52393 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52394 SDValue NewSub =
52395 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52396 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52397 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52398 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52399 DAG.getVTList(VT, MVT::i32), X,
52400 DAG.getConstant(0, DL, VT), NewEFLAGS);
52401 }
52402 }
52403
52404 if (CC == X86::COND_AE) {
52405 // X + SETAE --> sbb X, -1
52406 // X - SETAE --> adc X, -1
52407 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52408 DAG.getVTList(VT, MVT::i32), X,
52409 DAG.getAllOnesConstant(DL, VT), EFLAGS);
52410 }
52411
52412 if (CC == X86::COND_BE) {
52413 // X + SETBE --> sbb X, -1
52414 // X - SETBE --> adc X, -1
52415 // Try to convert COND_BE into COND_AE in an attempt to facilitate
52416 // materializing "setae reg".
52417 //
52418 // Do not flip "e <= c", where "c" is a constant, because the Cmp instruction
52419 // cannot take an immediate as its first operand.
52420 //
52421 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52422 EFLAGS.getValueType().isInteger() &&
52423 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52424 SDValue NewSub =
52425 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52426 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52427 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52428 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52429 DAG.getVTList(VT, MVT::i32), X,
52430 DAG.getAllOnesConstant(DL, VT), NewEFLAGS);
52431 }
52432 }
52433
52434 if (CC != X86::COND_E && CC != X86::COND_NE)
52435 return SDValue();
52436
52437 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
52438 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
52439 !EFLAGS.getOperand(0).getValueType().isInteger())
52440 return SDValue();
52441
52442 SDValue Z = EFLAGS.getOperand(0);
52443 EVT ZVT = Z.getValueType();
52444
52445 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52446 // the general case below.
52447 if (ConstantX) {
52448 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
52449 // fake operands:
52450 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
52451 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
52452 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
52453 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
52454 SDValue Zero = DAG.getConstant(0, DL, ZVT);
52455 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52456 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
52457 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52458 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52459 SDValue(Neg.getNode(), 1));
52460 }
52461
52462 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
52463 // with fake operands:
52464 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
52465 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
52466 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
52467 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
52468 SDValue One = DAG.getConstant(1, DL, ZVT);
52469 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52470 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52471 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52472 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52473 Cmp1.getValue(1));
52474 }
52475 }
52476
52477 // (cmp Z, 1) sets the carry flag if Z is 0.
52478 SDValue One = DAG.getConstant(1, DL, ZVT);
52479 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52480 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52481
52482 // Add the flags type for ADC/SBB nodes.
52483 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52484
52485 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
52486 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
52487 if (CC == X86::COND_NE)
52488 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
52489 DAG.getAllOnesConstant(DL, VT), Cmp1.getValue(1));
52490
52491 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
52492 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
52493 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
52494 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
52495}
52496
52497/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52498/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52499/// with CMP+{ADC, SBB}.
52500 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL,
52501 SelectionDAG &DAG) {
52502 bool IsSub = N->getOpcode() == ISD::SUB;
52503 SDValue X = N->getOperand(0);
52504 SDValue Y = N->getOperand(1);
52505 EVT VT = N->getValueType(0);
52506
52507 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
52508 return ADCOrSBB;
52509
52510 // Commute and try again (negate the result for subtracts).
52511 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
52512 if (IsSub)
52513 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
52514 return ADCOrSBB;
52515 }
52516
52517 return SDValue();
52518}
52519
52520static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT,
52521 SDValue N0, SDValue N1,
52522 SelectionDAG &DAG) {
52523 assert((Opc == ISD::XOR || Opc == ISD::OR) && "Unexpected opcode");
52524
52525 // Delegate to combineAddOrSubToADCOrSBB if we have:
52526 //
52527 // (xor/or (zero_extend (setcc)) imm)
52528 //
52529 // where imm is odd if and only if we have xor, in which case the XOR/OR are
52530 // equivalent to a SUB/ADD, respectively.
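// The zero-extended setcc is 0 or 1, so it can only affect bit 0 of imm:
// or-ing it into an even imm is the same as adding it, and xor-ing it into an
// odd imm is the same as subtracting it (the result is imm or imm - 1).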
52531 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
52532 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
52533 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
52534 bool IsSub = Opc == ISD::XOR;
52535 bool N1COdd = N1C->getZExtValue() & 1;
52536 if (IsSub ? N1COdd : !N1COdd)
52537 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
52538 return R;
52539 }
52540 }
52541
52542 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
52543 if (Opc == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
52544 N0.getOperand(0).getOpcode() == ISD::AND &&
52545 ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
52546 ISD::isBuildVectorAllOnes(N1.getNode()) &&
52547 isConstantPowerOf2(N0.getOperand(0).getOperand(1),
52548 VT.getScalarSizeInBits(), /*AllowUndefs=*/true)) {
52549 return DAG.getNode(X86ISD::PCMPEQ, DL, VT, N0.getOperand(0),
52550 N0.getOperand(0).getOperand(1));
52551 }
52552
52553 return SDValue();
52554}
52555 
52556 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
52557 TargetLowering::DAGCombinerInfo &DCI,
52558 const X86Subtarget &Subtarget) {
52559 SDValue N0 = N->getOperand(0);
52560 SDValue N1 = N->getOperand(1);
52561 EVT VT = N->getValueType(0);
52562 SDLoc dl(N);
52563 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52564
52565 // If this is SSE1 only convert to FOR to avoid scalarization.
52566 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52567 return DAG.getBitcast(MVT::v4i32,
52568 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
52569 DAG.getBitcast(MVT::v4f32, N0),
52570 DAG.getBitcast(MVT::v4f32, N1)));
52571 }
52572
52573 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
52574 // TODO: Support multiple SrcOps.
52575 if (VT == MVT::i1) {
52576 SmallVector<SDValue, 2> SrcOps;
52577 SmallVector<APInt, 2> SrcPartials;
52578 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
52579 SrcOps.size() == 1) {
52580 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
52581 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
52582 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
52583 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
52584 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
52585 if (Mask) {
52586 assert(SrcPartials[0].getBitWidth() == NumElts &&
52587 "Unexpected partial reduction mask");
52588 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
52589 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
52590 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
52591 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
52592 }
52593 }
52594 }
52595
52596 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
52597 return SetCC;
52598
52599 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
52600 return R;
52601
52602 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
52603 return R;
52604
52605 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
52606 return R;
52607
52608 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
52609 DAG, DCI, Subtarget))
52610 return FPLogic;
52611
52612 if (DCI.isBeforeLegalizeOps())
52613 return SDValue();
52614
52615 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
52616 return R;
52617
52618 if (SDValue R = canonicalizeBitSelect(N, dl, DAG, Subtarget))
52619 return R;
52620
52621 if (SDValue R = combineLogicBlendIntoPBLENDV(N, dl, DAG, Subtarget))
52622 return R;
52623
52624 // Combine `(x86isd::setcc_carry) | C` and `(0 - SetCC) | C`
52625 // into `(zext (not SetCC)) * (C + 1) - 1` if we can get a LEA out of it.
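// Both forms evaluate to C when the condition is false and to -1 (all-ones)
// when it is true, and so does (zext(!cond) * (C + 1)) - 1. For the values of
// C accepted below, C + 1 is 2, 3, 4, 5, 8 or 9, so the multiply can be
// formed with a single LEA.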
52626 if ((VT == MVT::i32 || VT == MVT::i64) && N0.hasOneUse()) {
52627 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
52628 uint64_t Val = CN->getZExtValue();
52629 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 ||
52630 Val == 8) {
52631 SDValue NotCond;
52632 if (N0.getOpcode() == X86ISD::SETCC_CARRY &&
52633 N0.getOperand(1).hasOneUse()) {
52634 X86::CondCode CC = (X86::CondCode)N0.getConstantOperandVal(0);
52635 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
52636 NotCond = getSETCC(NewCC, N0.getOperand(1), SDLoc(N0), DAG);
52637 } else if (N0.getOpcode() == ISD::SUB &&
52638 isNullConstant(N0.getOperand(0))) {
52639 SDValue Cond = N0.getOperand(1);
52640 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
52641 Cond = Cond.getOperand(0);
52642 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
52643 X86::CondCode OldCC = (X86::CondCode)Cond.getConstantOperandVal(0);
52644 X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
52645 NotCond = getSETCC(NewCC, Cond.getOperand(1), SDLoc(Cond), DAG);
52646 }
52647 }
52648
52649 if (NotCond) {
52650 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
52651 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
52652 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
52653 return R;
52654 }
52655 }
52656 }
52657 }
52658
52659 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
52660 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
52661 // iff the upper elements of the non-shifted arg are zero.
52662 // KUNPCK requires 16+ bool vector elements.
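// KSHIFTL by HalfElts places the low half of its operand into the upper half
// of the mask, so when the other operand's upper half is known zero the OR is
// exactly the concatenation of the two half-masks, which is what KUNPCK does.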
52663 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
52664 unsigned NumElts = VT.getVectorNumElements();
52665 unsigned HalfElts = NumElts / 2;
52666 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
52667 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
52668 N1.getConstantOperandAPInt(1) == HalfElts &&
52669 DAG.MaskedVectorIsZero(N0, UpperElts)) {
52670 return DAG.getNode(
52671 ISD::CONCAT_VECTORS, dl, VT,
52672 extractSubVector(N0, 0, DAG, dl, HalfElts),
52673 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
52674 }
52675 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
52676 N0.getConstantOperandAPInt(1) == HalfElts &&
52677 DAG.MaskedVectorIsZero(N1, UpperElts)) {
52678 return DAG.getNode(
52679 ISD::CONCAT_VECTORS, dl, VT,
52680 extractSubVector(N1, 0, DAG, dl, HalfElts),
52681 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
52682 }
52683 }
52684
52685 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52686 // Attempt to recursively combine an OR of shuffles.
52687 SDValue Op(N, 0);
52688 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52689 return Res;
52690
52691 // If either operand is a constant mask, then only the elements that aren't
52692 // allones are actually demanded by the other operand.
52693 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
52694 APInt UndefElts;
52695 SmallVector<APInt> EltBits;
52696 int NumElts = VT.getVectorNumElements();
52697 int EltSizeInBits = VT.getScalarSizeInBits();
52698 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
52699 return false;
52700
52701 APInt DemandedElts = APInt::getZero(NumElts);
52702 for (int I = 0; I != NumElts; ++I)
52703 if (!EltBits[I].isAllOnes())
52704 DemandedElts.setBit(I);
52705
52706 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
52707 };
52708 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
52709 if (N->getOpcode() != ISD::DELETED_NODE)
52710 DCI.AddToWorklist(N);
52711 return SDValue(N, 0);
52712 }
52713 }
52714
52715 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
52716 return R;
52717
52718 return SDValue();
52719}
52720
52721/// Try to turn tests against the signbit in the form of:
52722/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
52723/// into:
52724/// SETGT(X, -1)
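/// The SRL moves the sign bit of X into bit 0 and the XOR with 1 inverts it,
/// so the result is 1 exactly when X is non-negative, i.e. X > -1 (signed).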
52725 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL,
52726 SelectionDAG &DAG) {
52727 // This is only worth doing if the output type is i8 or i1.
52728 EVT ResultType = N->getValueType(0);
52729 if (ResultType != MVT::i8 && ResultType != MVT::i1)
52730 return SDValue();
52731
52732 SDValue N0 = N->getOperand(0);
52733 SDValue N1 = N->getOperand(1);
52734
52735 // We should be performing an xor against a truncated shift.
52736 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
52737 return SDValue();
52738
52739 // Make sure we are performing an xor against one.
52740 if (!isOneConstant(N1))
52741 return SDValue();
52742
52743 // SetCC on x86 zero extends so only act on this if it's a logical shift.
52744 SDValue Shift = N0.getOperand(0);
52745 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
52746 return SDValue();
52747
52748 // Make sure we are truncating from one of i16, i32 or i64.
52749 EVT ShiftTy = Shift.getValueType();
52750 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
52751 return SDValue();
52752
52753 // Make sure the shift amount extracts the sign bit.
52754 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
52755 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
52756 return SDValue();
52757
52758 // Create a greater-than comparison against -1.
52759 // N.B. Using SETGE against 0 works but we want a canonical looking
52760 // comparison; using SETGT matches up with what TranslateX86CC does.
52761 SDValue ShiftOp = Shift.getOperand(0);
52762 EVT ShiftOpTy = ShiftOp.getValueType();
52763 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52764 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
52765 *DAG.getContext(), ResultType);
52766 SDValue Cond =
52767 DAG.getSetCC(DL, SetCCResultType, ShiftOp,
52768 DAG.getAllOnesConstant(DL, ShiftOpTy), ISD::SETGT);
52769 if (SetCCResultType != ResultType)
52770 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
52771 return Cond;
52772}
52773
52774/// Turn vector tests of the signbit in the form of:
52775/// xor (sra X, elt_size(X)-1), -1
52776/// into:
52777/// pcmpgt X, -1
52778///
52779/// This should be called before type legalization because the pattern may not
52780/// persist after that.
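/// The arithmetic shift replicates the sign bit across each element and the
/// xor with all-ones inverts it, so an element becomes all-ones exactly when
/// it was non-negative, which is the same result pcmpgt against -1 produces.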
52781 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
52782 const X86Subtarget &Subtarget) {
52783 EVT VT = N->getValueType(0);
52784 if (!VT.isSimple())
52785 return SDValue();
52786
52787 switch (VT.getSimpleVT().SimpleTy) {
52788 // clang-format off
52789 default: return SDValue();
52790 case MVT::v16i8:
52791 case MVT::v8i16:
52792 case MVT::v4i32:
52793 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
52794 case MVT::v32i8:
52795 case MVT::v16i16:
52796 case MVT::v8i32:
52797 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
52798 // clang-format on
52799 }
52800
52801 // There must be a shift right algebraic before the xor, and the xor must be a
52802 // 'not' operation.
52803 SDValue Shift = N->getOperand(0);
52804 SDValue Ones = N->getOperand(1);
52805 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
52806 !ISD::isBuildVectorAllOnes(Ones.getNode()))
52807 return SDValue();
52808
52809 // The shift should be smearing the sign bit across each vector element.
52810 auto *ShiftAmt =
52811 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
52812 if (!ShiftAmt ||
52813 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
52814 return SDValue();
52815
52816 // Create a greater-than comparison against -1. We don't use the more obvious
52817 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
52818 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
52819}
52820
52821/// Detect patterns of truncation with unsigned saturation:
52822///
52823/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
52824/// Return the source value x to be truncated or SDValue() if the pattern was
52825/// not matched.
52826///
52827/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
52828/// where C1 >= 0 and C2 is unsigned max of destination type.
52829///
52830/// (truncate (smax (smin (x, C2), C1)) to dest_type)
52831/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
52832///
52833/// These two patterns are equivalent to:
52834/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
52835/// So return the smax(x, C1) value to be truncated or SDValue() if the
52836/// pattern was not matched.
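/// For example, for a vXi32 -> vXi8 truncation pattern 1 matches with
/// C2 == 255 (the unsigned max of the i8 destination) and the un-truncated
/// source x is returned.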
52837 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
52838 const SDLoc &DL) {
52839 using namespace llvm::SDPatternMatch;
52840 EVT InVT = In.getValueType();
52841
52842 // Saturation with truncation. We truncate from InVT to VT.
52843 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
52844 "Unexpected types for truncate operation");
52845
52846 APInt C1, C2;
52847 SDValue UMin, SMin, SMax;
52848
52849 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
52850 // the element size of the destination type.
52851 if (sd_match(In, m_UMin(m_Value(UMin), m_ConstInt(C2))) &&
52852 C2.isMask(VT.getScalarSizeInBits()))
52853 return UMin;
52854
52855 if (sd_match(In, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52856 sd_match(SMin, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52857 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
52858 return SMin;
52859
52860 if (sd_match(In, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52861 sd_match(SMax, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52862 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && C2.uge(C1))
52863 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
52864
52865 return SDValue();
52866}
52867
52868/// Detect patterns of truncation with signed saturation:
52869/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
52870/// signed_max_of_dest_type)) to dest_type)
52871/// or:
52872/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
52873/// signed_min_of_dest_type)) to dest_type).
52874/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
52875/// Return the source value to be truncated or SDValue() if the pattern was not
52876/// matched.
52877static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
52878 using namespace llvm::SDPatternMatch;
52879 unsigned NumDstBits = VT.getScalarSizeInBits();
52880 unsigned NumSrcBits = In.getScalarValueSizeInBits();
52881 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
52882
52883 APInt SignedMax, SignedMin;
52884 if (MatchPackUS) {
52885 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
52886 SignedMin = APInt::getZero(NumSrcBits);
52887 } else {
52888 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
52889 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
52890 }
52891
52892 SDValue SMin, SMax;
52893 if (sd_match(In, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))) &&
52894 sd_match(SMin, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))))
52895 return SMax;
52896
52897 if (sd_match(In, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))) &&
52898 sd_match(SMax, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))))
52899 return SMin;
52900
52901 return SDValue();
52902}
52903 
52904 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
52905 SelectionDAG &DAG,
52906 const X86Subtarget &Subtarget) {
52907 if (!Subtarget.hasSSE2() || !VT.isVector())
52908 return SDValue();
52909
52910 EVT SVT = VT.getVectorElementType();
52911 EVT InVT = In.getValueType();
52912 EVT InSVT = InVT.getVectorElementType();
52913
52914 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
52915 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
52916 // and concatenate at the same time. Then we can use a final vpmovuswb to
52917 // clip to 0-255.
52918 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
52919 InVT == MVT::v16i32 && VT == MVT::v16i8) {
52920 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52921 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
52922 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
52923 DL, DAG, Subtarget);
52924 assert(Mid && "Failed to pack!");
52925 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
52926 }
52927 }
52928
52929 // vXi32 truncate instructions are available with AVX512F.
52930 // vXi16 truncate instructions are only available with AVX512BW.
52931 // For 256-bit or smaller vectors, we require VLX.
52932 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
52933 // If the result type is 256 bits or larger and we have disabled 512-bit
52934 // registers, we should go ahead and use the pack instructions if possible.
52935 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
52936 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
52937 (InVT.getSizeInBits() > 128) &&
52938 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
52939 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
52940
52941 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
52942 isPowerOf2_32(VT.getVectorNumElements()) &&
52943 (SVT == MVT::i8 || SVT == MVT::i16) &&
52944 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
52945 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52946 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
52947 if (SVT == MVT::i8 && InSVT == MVT::i32) {
52948 EVT MidVT = VT.changeVectorElementType(MVT::i16);
52949 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
52950 DAG, Subtarget);
52951 assert(Mid && "Failed to pack!");
52952 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
52953 Subtarget);
52954 assert(V && "Failed to pack!");
52955 return V;
52956 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
52957 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
52958 Subtarget);
52959 }
52960 if (SDValue SSatVal = detectSSatPattern(In, VT))
52961 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
52962 Subtarget);
52963 }
52964
52965 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52966 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
52967 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
52968 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
52969 unsigned TruncOpc = 0;
52970 SDValue SatVal;
52971 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
52972 SatVal = SSatVal;
52973 TruncOpc = X86ISD::VTRUNCS;
52974 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
52975 SatVal = USatVal;
52976 TruncOpc = X86ISD::VTRUNCUS;
52977 }
52978 if (SatVal) {
52979 unsigned ResElts = VT.getVectorNumElements();
52980 // If the input type is less than 512 bits and we don't have VLX, we need
52981 // to widen to 512 bits.
52982 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
52983 unsigned NumConcats = 512 / InVT.getSizeInBits();
52984 ResElts *= NumConcats;
52985 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
52986 ConcatOps[0] = SatVal;
52987 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
52988 NumConcats * InVT.getVectorNumElements());
52989 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
52990 }
52991 // Widen the result if it's narrower than 128 bits.
52992 if (ResElts * SVT.getSizeInBits() < 128)
52993 ResElts = 128 / SVT.getSizeInBits();
52994 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
52995 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
52996 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
52997 DAG.getVectorIdxConstant(0, DL));
52998 }
52999 }
53000
53001 return SDValue();
53002}
53003 
53004 static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
53005 SelectionDAG &DAG,
53006 TargetLowering::DAGCombinerInfo &DCI,
53007 const X86Subtarget &Subtarget) {
53008 auto *Ld = cast<LoadSDNode>(N);
53009 EVT RegVT = Ld->getValueType(0);
53010 SDValue Ptr = Ld->getBasePtr();
53011 SDValue Chain = Ld->getChain();
53012 ISD::LoadExtType Ext = Ld->getExtensionType();
53013
53014 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
53015 return SDValue();
53016
53017 if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
53018 return SDValue();
53019 
53020 const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
53021 if (!LdC)
53022 return SDValue();
53023
53024 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
53025 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
53026 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
53027 if (Undefs[I])
53028 continue;
53029 if (UserUndefs[I] || Bits[I] != UserBits[I])
53030 return false;
53031 }
53032 return true;
53033 };
53034
53035 // Look through all other loads/broadcasts in the chain for another constant
53036 // pool entry.
53037 for (SDNode *User : Chain->users()) {
53038 auto *UserLd = dyn_cast<MemSDNode>(User);
53039 if (User != N && UserLd &&
53040 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
53041 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
53042 ISD::isNormalLoad(User)) &&
53043 UserLd->getChain() == Chain && User->hasAnyUseOfValue(0) &&
53044 User->getValueSizeInBits(0).getFixedValue() >
53045 RegVT.getFixedSizeInBits()) {
53046 EVT UserVT = User->getValueType(0);
53047 SDValue UserPtr = UserLd->getBasePtr();
53048 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
53049
53050 // See if we are loading a constant that matches in the lower
53051 // bits of a longer constant (but from a different constant pool ptr).
53052 if (UserC && UserPtr != Ptr) {
53053 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
53054 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
53055 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
53056 APInt Undefs, UserUndefs;
53057 SmallVector<APInt> Bits, UserBits;
53058 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
53059 UserVT.getScalarSizeInBits());
53060 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
53061 Bits) &&
53062 getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
53063 UserUndefs, UserBits)) {
53064 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
53066 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53067 RegVT.getSizeInBits());
53068 Extract = DAG.getBitcast(RegVT, Extract);
53069 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53070 }
53071 }
53072 }
53073 }
53074 }
53075 }
53076
53077 return SDValue();
53078}
53079 
53080 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
53081 TargetLowering::DAGCombinerInfo &DCI,
53082 const X86Subtarget &Subtarget) {
53083 auto *Ld = cast<LoadSDNode>(N);
53084 EVT RegVT = Ld->getValueType(0);
53085 EVT MemVT = Ld->getMemoryVT();
53086 SDLoc dl(Ld);
53087 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53088
53089 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
53090 // into two 16-byte operations. Also split non-temporal aligned loads on
53091 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
53092 ISD::LoadExtType Ext = Ld->getExtensionType();
53093 unsigned Fast;
53094 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
53095 Ext == ISD::NON_EXTLOAD &&
53096 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
53097 Ld->getAlign() >= Align(16)) ||
53098 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
53099 *Ld->getMemOperand(), &Fast) &&
53100 !Fast))) {
53101 unsigned NumElems = RegVT.getVectorNumElements();
53102 if (NumElems < 2)
53103 return SDValue();
53104
53105 unsigned HalfOffset = 16;
53106 SDValue Ptr1 = Ld->getBasePtr();
53107 SDValue Ptr2 =
53108 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
53109 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
53110 NumElems / 2);
53111 SDValue Load1 =
53112 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
53113 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53114 SDValue Load2 =
53115 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
53116 Ld->getPointerInfo().getWithOffset(HalfOffset),
53117 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53118 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
53119 Load1.getValue(1), Load2.getValue(1));
53120
53121 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
53122 return DCI.CombineTo(N, NewVec, TF, true);
53123 }
53124
53125 // Bool vector load - attempt to cast to an integer, as we have good
53126 // (vXiY *ext(vXi1 bitcast(iX))) handling.
53127 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
53128 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
53129 unsigned NumElts = RegVT.getVectorNumElements();
53130 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
53131 if (TLI.isTypeLegal(IntVT)) {
53132 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
53133 Ld->getPointerInfo(), Ld->getBaseAlign(),
53134 Ld->getMemOperand()->getFlags());
53135 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
53136 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
53137 }
53138 }
53139
53140 // If we also broadcast this vector to a wider type, then just extract the
53141 // lowest subvector.
53142 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
53143 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
53144 SDValue Ptr = Ld->getBasePtr();
53145 SDValue Chain = Ld->getChain();
53146 for (SDNode *User : Chain->users()) {
53147 auto *UserLd = dyn_cast<MemSDNode>(User);
53148 if (User != N && UserLd &&
53149 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
53150 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
53151 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
53152 User->hasAnyUseOfValue(0) &&
53153 User->getValueSizeInBits(0).getFixedValue() >
53154 RegVT.getFixedSizeInBits()) {
53156 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53157 RegVT.getSizeInBits());
53158 Extract = DAG.getBitcast(RegVT, Extract);
53159 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53160 }
53161 }
53162 }
53163
53164 if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
53165 return V;
53166
53167 // Cast ptr32 and ptr64 pointers to the default address space before a load.
53168 unsigned AddrSpace = Ld->getAddressSpace();
53169 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53170 AddrSpace == X86AS::PTR32_UPTR) {
53171 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53172 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
53173 SDValue Cast =
53174 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
53175 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
53176 Ld->getPointerInfo(), MemVT, Ld->getBaseAlign(),
53177 Ld->getMemOperand()->getFlags());
53178 }
53179 }
53180
53181 return SDValue();
53182}
53183
53184/// If V is a build vector of boolean constants and exactly one of those
53185/// constants is true, return the operand index of that true element.
53186/// Otherwise, return -1.
53187static int getOneTrueElt(SDValue V) {
53188 // This needs to be a build vector of booleans.
53189 // TODO: Checking for the i1 type matches the IR definition for the mask,
53190 // but the mask check could be loosened to i8 or other types. That might
53191 // also require checking more than 'allOnesValue'; eg, the x86 HW
53192 // instructions only require that the MSB is set for each mask element.
53193 // The ISD::MSTORE comments/definition do not specify how the mask operand
53194 // is formatted.
53195 auto *BV = dyn_cast<BuildVectorSDNode>(V);
53196 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
53197 return -1;
53198
53199 int TrueIndex = -1;
53200 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
53201 for (unsigned i = 0; i < NumElts; ++i) {
53202 const SDValue &Op = BV->getOperand(i);
53203 if (Op.isUndef())
53204 continue;
53205 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
53206 if (!ConstNode)
53207 return -1;
53208 if (ConstNode->getAPIntValue().countr_one() >= 1) {
53209 // If we already found a one, this is too many.
53210 if (TrueIndex >= 0)
53211 return -1;
53212 TrueIndex = i;
53213 }
53214 }
53215 return TrueIndex;
53216}
53217
53218/// Given a masked memory load/store operation, return true if it has one mask
53219/// bit set. If it has one mask bit set, then also return the memory address of
53220/// the scalar element to load/store, the vector index to insert/extract that
53221/// scalar element, and the alignment for the scalar memory access.
53222 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
53223 SelectionDAG &DAG, SDValue &Addr,
53224 SDValue &Index, Align &Alignment,
53225 unsigned &Offset) {
53226 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
53227 if (TrueMaskElt < 0)
53228 return false;
53229
53230 // Get the address of the one scalar element that is specified by the mask
53231 // using the appropriate offset from the base pointer.
53232 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
53233 Offset = 0;
53234 Addr = MaskedOp->getBasePtr();
53235 if (TrueMaskElt != 0) {
53236 Offset = TrueMaskElt * EltVT.getStoreSize();
53237 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
53238 SDLoc(MaskedOp));
53239 }
53240
53241 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
53242 Alignment = commonAlignment(MaskedOp->getBaseAlign(), EltVT.getStoreSize());
53243 return true;
53244}
53245
53246/// If exactly one element of the mask is set for a non-extending masked load,
53247/// it is a scalar load and vector insert.
53248/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53249/// mask have already been optimized in IR, so we don't bother with those here.
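/// E.g. a masked load whose mask is <0,0,1,0> becomes a scalar load of
/// element 2 that is then inserted into the pass-through vector at index 2.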
53250static SDValue
53251 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53252 TargetLowering::DAGCombinerInfo &DCI,
53253 const X86Subtarget &Subtarget) {
53254 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53255 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53256 // However, some target hooks may need to be added to know when the transform
53257 // is profitable. Endianness would also have to be considered.
53258
53259 SDValue Addr, VecIndex;
53260 Align Alignment;
53261 unsigned Offset;
53262 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
53263 return SDValue();
53264
53265 // Load the one scalar element that is specified by the mask using the
53266 // appropriate offset from the base pointer.
53267 SDLoc DL(ML);
53268 EVT VT = ML->getValueType(0);
53269 EVT EltVT = VT.getVectorElementType();
53270
53271 EVT CastVT = VT;
53272 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53273 EltVT = MVT::f64;
53274 CastVT = VT.changeVectorElementType(EltVT);
53275 }
53276
53277 SDValue Load =
53278 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
53279 ML->getPointerInfo().getWithOffset(Offset),
53280 Alignment, ML->getMemOperand()->getFlags());
53281
53282 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
53283
53284 // Insert the loaded element into the appropriate place in the vector.
53285 SDValue Insert =
53286 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
53287 Insert = DAG.getBitcast(VT, Insert);
53288 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
53289}
53290
53291 static SDValue
53292 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53293 TargetLowering::DAGCombinerInfo &DCI) {
53294 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53295 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
53296 return SDValue();
53297
53298 SDLoc DL(ML);
53299 EVT VT = ML->getValueType(0);
53300
53301 // If we are loading the first and last elements of a vector, it is safe and
53302 // always faster to load the whole vector. Replace the masked load with a
53303 // vector load and select.
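// This is safe because an x86 vector is at most 64 bytes and so spans at
// most two pages; demanding both the first and last elements means the wide
// load only touches pages that the masked load already accesses.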
53304 unsigned NumElts = VT.getVectorNumElements();
53305 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
53306 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
53307 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
53308 if (LoadFirstElt && LoadLastElt) {
53309 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
53310 ML->getMemOperand());
53311 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
53312 ML->getPassThru());
53313 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
53314 }
53315
53316 // Convert a masked load with a constant mask into a masked load and a select.
53317 // This allows the select operation to use a faster kind of select instruction
53318 // (for example, vblendvps -> vblendps).
53319
53320 // Don't try this if the pass-through operand is already undefined. That would
53321 // cause an infinite loop because that's what we're about to create.
53322 if (ML->getPassThru().isUndef())
53323 return SDValue();
53324
53325 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
53326 return SDValue();
53327
53328 // The new masked load has an undef pass-through operand. The select uses the
53329 // original pass-through operand.
53330 SDValue NewML = DAG.getMaskedLoad(
53331 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
53332 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
53333 ML->getAddressingMode(), ML->getExtensionType());
53334 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
53335 ML->getPassThru());
53336
53337 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
53338}
53339 
53340 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
53341 TargetLowering::DAGCombinerInfo &DCI,
53342 const X86Subtarget &Subtarget) {
53343 auto *Mld = cast<MaskedLoadSDNode>(N);
53344
53345 // TODO: Expanding load with constant mask may be optimized as well.
53346 if (Mld->isExpandingLoad())
53347 return SDValue();
53348
53349 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
53350 if (SDValue ScalarLoad =
53351 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
53352 return ScalarLoad;
53353
53354 // TODO: Do some AVX512 subsets benefit from this transform?
53355 if (!Subtarget.hasAVX512())
53356 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
53357 return Blend;
53358 }
53359
53360 // If the mask value has been legalized to a non-boolean vector, try to
53361 // simplify ops leading up to it. We only demand the MSB of each lane.
53362 SDValue Mask = Mld->getMask();
53363 if (Mask.getScalarValueSizeInBits() != 1) {
53364 EVT VT = Mld->getValueType(0);
53365 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53366 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53367 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53368 if (N->getOpcode() != ISD::DELETED_NODE)
53369 DCI.AddToWorklist(N);
53370 return SDValue(N, 0);
53371 }
53372 if (SDValue NewMask =
53373 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53374 return DAG.getMaskedLoad(
53375 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
53376 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
53377 Mld->getAddressingMode(), Mld->getExtensionType());
53378 }
53379
53380 return SDValue();
53381}
53382
53383/// If exactly one element of the mask is set for a non-truncating masked store,
53384/// it is a vector extract and scalar store.
53385/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53386/// mask have already been optimized in IR, so we don't bother with those here.
53387 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
53388 SelectionDAG &DAG,
53389 const X86Subtarget &Subtarget) {
53390 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53391 // However, some target hooks may need to be added to know when the transform
53392 // is profitable. Endianness would also have to be considered.
53393
53394 SDValue Addr, VecIndex;
53395 Align Alignment;
53396 unsigned Offset;
53397 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
53398 return SDValue();
53399
53400 // Extract the one scalar element that is actually being stored.
53401 SDLoc DL(MS);
53402 SDValue Value = MS->getValue();
53403 EVT VT = Value.getValueType();
53404 EVT EltVT = VT.getVectorElementType();
53405 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53406 EltVT = MVT::f64;
53407 EVT CastVT = VT.changeVectorElementType(EltVT);
53408 Value = DAG.getBitcast(CastVT, Value);
53409 }
53410 SDValue Extract =
53411 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
53412
53413 // Store that element at the appropriate offset from the base pointer.
53414 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
53415 MS->getPointerInfo().getWithOffset(Offset),
53416 Alignment, MS->getMemOperand()->getFlags());
53417}
53418 
53419 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
53420 TargetLowering::DAGCombinerInfo &DCI,
53421 const X86Subtarget &Subtarget) {
53422 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
53423 if (Mst->isCompressingStore())
53424 return SDValue();
53425
53426 EVT VT = Mst->getValue().getValueType();
53427 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53428
53429 if (Mst->isTruncatingStore())
53430 return SDValue();
53431
53432 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
53433 return ScalarStore;
53434
53435 // If the mask value has been legalized to a non-boolean vector, try to
53436 // simplify ops leading up to it. We only demand the MSB of each lane.
53437 SDValue Mask = Mst->getMask();
53438 if (Mask.getScalarValueSizeInBits() != 1) {
53439 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53440 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53441 if (N->getOpcode() != ISD::DELETED_NODE)
53442 DCI.AddToWorklist(N);
53443 return SDValue(N, 0);
53444 }
53445 if (SDValue NewMask =
53446 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53447 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
53448 Mst->getBasePtr(), Mst->getOffset(), NewMask,
53449 Mst->getMemoryVT(), Mst->getMemOperand(),
53450 Mst->getAddressingMode());
53451 }
53452
53453 SDValue Value = Mst->getValue();
53454 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
53455 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
53456 Mst->getMemoryVT())) {
53457 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
53458 Mst->getBasePtr(), Mst->getOffset(), Mask,
53459 Mst->getMemoryVT(), Mst->getMemOperand(),
53460 Mst->getAddressingMode(), true);
53461 }
53462
53463 return SDValue();
53464}
53465 
53466 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
53467 TargetLowering::DAGCombinerInfo &DCI,
53468 const X86Subtarget &Subtarget) {
53469 StoreSDNode *St = cast<StoreSDNode>(N);
53470 EVT StVT = St->getMemoryVT();
53471 SDLoc dl(St);
53472 SDValue StoredVal = St->getValue();
53473 EVT VT = StoredVal.getValueType();
53474 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53475
53476 // Convert a store of vXi1 into a store of iX and a bitcast.
53477 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
53478 VT.getVectorElementType() == MVT::i1) {
53479 
53480 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
53481 StoredVal = DAG.getBitcast(NewVT, StoredVal);
53482
53483 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53484 St->getPointerInfo(), St->getBaseAlign(),
53485 St->getMemOperand()->getFlags());
53486 }
53487
53488 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
53489 // This will avoid a copy to k-register.
53490 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
53491 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
53492 StoredVal.getOperand(0).getValueType() == MVT::i8) {
53493 SDValue Val = StoredVal.getOperand(0);
53494 // We must store zeros to the unused bits.
53495 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
53496 return DAG.getStore(St->getChain(), dl, Val, St->getBasePtr(),
53497 St->getPointerInfo(), St->getBaseAlign(),
53498 St->getMemOperand()->getFlags());
53499 }
53500
53501 // Widen v2i1/v4i1 stores to v8i1.
53502 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
53503 Subtarget.hasAVX512()) {
53504 unsigned NumConcats = 8 / VT.getVectorNumElements();
53505 // We must store zeros to the unused bits.
53506 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
53507 Ops[0] = StoredVal;
53508 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
53509 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53510 St->getPointerInfo(), St->getBaseAlign(),
53511 St->getMemOperand()->getFlags());
53512 }
53513
53514 // Turn vXi1 stores of constants into a scalar store.
53515 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
53516 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
53517 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
53518 // If it's a v64i1 store without 64-bit support, we need two stores.
53519 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
53520 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
53521 StoredVal->ops().slice(0, 32));
53522 Lo = combinevXi1ConstantToInteger(Lo, DAG);
53523 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
53524 StoredVal->ops().slice(32, 32));
53525 Hi = combinevXi1ConstantToInteger(Hi, DAG);
53526
53527 SDValue Ptr0 = St->getBasePtr();
53528 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
53529
53530 SDValue Ch0 =
53531 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
53532 St->getBaseAlign(), St->getMemOperand()->getFlags());
53533 SDValue Ch1 = DAG.getStore(
53534 St->getChain(), dl, Hi, Ptr1, St->getPointerInfo().getWithOffset(4),
53535 St->getBaseAlign(), St->getMemOperand()->getFlags());
53536 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
53537 }
53538
53539 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
53540 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53541 St->getPointerInfo(), St->getBaseAlign(),
53542 St->getMemOperand()->getFlags());
53543 }
53544
53545 // Convert scalar fabs/fneg load-store to integer equivalents.
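// FNEG only flips the sign bit and FABS only clears it, so when the operand
// comes straight from memory and goes straight back to memory we can apply
// the XOR/AND of the sign mask to the integer bits and stay out of the FP
// domain entirely.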
53546 if ((VT == MVT::f16 || VT == MVT::bf16 || VT == MVT::f32 || VT == MVT::f64) &&
53547 (StoredVal.getOpcode() == ISD::FABS ||
53548 StoredVal.getOpcode() == ISD::FNEG) &&
53549 ISD::isNormalLoad(StoredVal.getOperand(0).getNode()) &&
53550 StoredVal.hasOneUse() && StoredVal.getOperand(0).hasOneUse()) {
53551 MVT IntVT = VT.getSimpleVT().changeTypeToInteger();
53552 if (TLI.isTypeLegal(IntVT)) {
53553 APInt SignMask = APInt::getSignMask(VT.getScalarSizeInBits());
53554 unsigned SignOp = ISD::XOR;
53555 if (StoredVal.getOpcode() == ISD::FABS) {
53556 SignMask = ~SignMask;
53557 SignOp = ISD::AND;
53558 }
53559 SDValue LogicOp = DAG.getNode(
53560 SignOp, dl, IntVT, DAG.getBitcast(IntVT, StoredVal.getOperand(0)),
53561 DAG.getConstant(SignMask, dl, IntVT));
53562 return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
53563 St->getPointerInfo(), St->getBaseAlign(),
53564 St->getMemOperand()->getFlags());
53565 }
53566 }
53567
53568 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
53569 // Sandy Bridge, perform two 16-byte stores.
53570 unsigned Fast;
53571 if (VT.is256BitVector() && StVT == VT &&
53572 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
53573 *St->getMemOperand(), &Fast) &&
53574 !Fast) {
53575 unsigned NumElems = VT.getVectorNumElements();
53576 if (NumElems < 2)
53577 return SDValue();
53578
53579 return splitVectorStore(St, DAG);
53580 }
53581
53582 // Split under-aligned vector non-temporal stores.
53583 if (St->isNonTemporal() && StVT == VT &&
53584 St->getAlign().value() < VT.getStoreSize()) {
53585 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
53586 // vectors or the legalizer can scalarize it to use MOVNTI.
53587 if (VT.is256BitVector() || VT.is512BitVector()) {
53588 unsigned NumElems = VT.getVectorNumElements();
53589 if (NumElems < 2)
53590 return SDValue();
53591 return splitVectorStore(St, DAG);
53592 }
53593
53594 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
53595 // to use MOVNTI.
53596 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
53597 MVT NTVT = Subtarget.hasSSE4A()
53598 ? MVT::v2f64
53599 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
53600 return scalarizeVectorStore(St, NTVT, DAG);
53601 }
53602 }
53603
53604 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
53605 // supported but AVX512F is, by extending to v16i32 and truncating.
53606 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
53607 St->getValue().getOpcode() == ISD::TRUNCATE &&
53608 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
53609 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
53610 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
53611 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
53612 St->getValue().getOperand(0));
53613 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
53614 MVT::v16i8, St->getMemOperand());
53615 }
53616
53617 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
53618 if (!St->isTruncatingStore() &&
53619 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
53620 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
53621 StoredVal.hasOneUse() &&
53622 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
53623 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
53624 return EmitTruncSStore(IsSigned, St->getChain(),
53625 dl, StoredVal.getOperand(0), St->getBasePtr(),
53626 VT, St->getMemOperand(), DAG);
53627 }
53628
53629 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
53630 if (!St->isTruncatingStore()) {
53631 auto IsExtractedElement = [](SDValue V) {
53632 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
53633 V = V.getOperand(0);
53634 unsigned Opc = V.getOpcode();
53635 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
53636 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
53637 V.getOperand(0).hasOneUse())
53638 return V.getOperand(0);
53639 return SDValue();
53640 };
53641 if (SDValue Extract = IsExtractedElement(StoredVal)) {
53642 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
53643 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
53644 SDValue Src = Trunc.getOperand(0);
53645 MVT DstVT = Trunc.getSimpleValueType();
53646 MVT SrcVT = Src.getSimpleValueType();
53647 unsigned NumSrcElts = SrcVT.getVectorNumElements();
53648 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
53649 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
53650 if (NumTruncBits == VT.getSizeInBits() &&
53651 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
53652 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
53653 TruncVT, St->getMemOperand());
53654 }
53655 }
53656 }
53657 }
53658
53659 // Optimize trunc store (of multiple scalars) to shuffle and store.
53660 // First, pack all of the elements in one place. Next, store to memory
53661 // in fewer chunks.
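// e.g. a v8i32->v8i16 truncating store whose value is already clamped to the
// signed or unsigned i16 range is emitted as a saturating truncating store.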
53662 if (St->isTruncatingStore() && VT.isVector()) {
53663 if (TLI.isTruncStoreLegal(VT, StVT)) {
53664 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
53665 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
53666 dl, Val, St->getBasePtr(),
53667 St->getMemoryVT(), St->getMemOperand(), DAG);
53668 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
53669 DAG, dl))
53670 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
53671 dl, Val, St->getBasePtr(),
53672 St->getMemoryVT(), St->getMemOperand(), DAG);
53673 }
53674
53675 return SDValue();
53676 }
53677
53678 // Cast ptr32 and ptr64 pointers to the default address space before a store.
53679 unsigned AddrSpace = St->getAddressSpace();
53680 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53681 AddrSpace == X86AS::PTR32_UPTR) {
53682 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53683 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
53684 SDValue Cast =
53685 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
53686 return DAG.getTruncStore(
53687 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
53688 St->getBaseAlign(), St->getMemOperand()->getFlags(), St->getAAInfo());
53689 }
53690 }
53691
53692 // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
53693 // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
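// This only fires when the target has CF support and both the load and the
// store are simple (non-volatile, non-atomic) accesses of the same address.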
53694 if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
53695 Subtarget.hasCF() && St->isSimple()) {
53696 SDValue Cmov;
53697 if (StoredVal.getOpcode() == X86ISD::CMOV)
53698 Cmov = StoredVal;
53699 else if (StoredVal.getOpcode() == ISD::TRUNCATE &&
53700 StoredVal.getOperand(0).getOpcode() == X86ISD::CMOV)
53701 Cmov = StoredVal.getOperand(0);
53702 else
53703 return SDValue();
53704
53705 auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
53706 if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
53707 return SDValue();
53708
53709 bool InvertCC = false;
53710 SDValue V = SDValue(Ld, 0);
53711 if (V == Cmov.getOperand(1))
53712 InvertCC = true;
53713 else if (V != Cmov.getOperand(0))
53714 return SDValue();
53715
53716 SDVTList Tys = DAG.getVTList(MVT::Other);
53717 SDValue CC = Cmov.getOperand(2);
53718 SDValue Src = DAG.getAnyExtOrTrunc(Cmov.getOperand(!InvertCC), dl, VT);
53719 if (InvertCC)
53720 CC = DAG.getTargetConstant(
53721 X86::GetOppositeBranchCondition(
53722 (X86::CondCode)Cmov.getConstantOperandVal(2)),
53723 dl, MVT::i8);
53724 SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
53725 Cmov.getOperand(3)};
53726 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, dl, Tys, Ops, VT,
53727 St->getMemOperand());
53728 }
53729
53730 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
53731 // the FP state in cases where an emms may be missing.
53732 // A preferable solution to the general problem is to figure out the right
53733 // places to insert EMMS. This qualifies as a quick hack.
53734
53735 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
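// e.g. on 32-bit x86 with SSE2, an i64 load feeding an i64 store becomes one
// f64 (movq-style) load/store pair instead of two 32-bit GPR load/store pairs.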
53736 if (VT.getSizeInBits() != 64)
53737 return SDValue();
53738
53739 const Function &F = DAG.getMachineFunction().getFunction();
53740 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
53741 bool F64IsLegal =
53742 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
53743
53744 if (!F64IsLegal || Subtarget.is64Bit())
53745 return SDValue();
53746
53747 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
53748 cast<LoadSDNode>(St->getValue())->isSimple() &&
53749 St->getChain().hasOneUse() && St->isSimple()) {
53750 auto *Ld = cast<LoadSDNode>(St->getValue());
53751
53752 if (!ISD::isNormalLoad(Ld))
53753 return SDValue();
53754
53755 // Avoid the transformation if there are multiple uses of the loaded value.
53756 if (!Ld->hasNUsesOfValue(1, 0))
53757 return SDValue();
53758
53759 SDLoc LdDL(Ld);
53760 SDLoc StDL(N);
53761
53762 // Remove any range metadata as we're converting to f64 load/store.
53763 Ld->getMemOperand()->clearRanges();
53764
53765 // Lower to a single movq load/store pair.
53766 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
53767 Ld->getBasePtr(), Ld->getMemOperand());
53768
53769 // Make sure new load is placed in same chain order.
53770 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
53771 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
53772 St->getMemOperand());
53773 }
53774
53775 // This is similar to the above case, but here we handle a scalar 64-bit
53776 // integer store that is extracted from a vector on a 32-bit target.
53777 // If we have SSE2, then we can treat it like a floating-point double
53778 // to get past legalization. The execution dependencies fixup pass will
53779 // choose the optimal machine instruction for the store if this really is
53780 // an integer or v2f32 rather than an f64.
53781 if (VT == MVT::i64 &&
53782 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
53783 SDValue OldExtract = St->getOperand(1);
53784 SDValue ExtOp0 = OldExtract.getOperand(0);
53785 unsigned VecSize = ExtOp0.getValueSizeInBits();
53786 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
53787 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
53788 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
53789 BitCast, OldExtract.getOperand(1));
53790 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
53791 St->getPointerInfo(), St->getBaseAlign(),
53792 St->getMemOperand()->getFlags());
53793 }
53794
53795 return SDValue();
53796}
53797
53798 static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
53799 TargetLowering::DAGCombinerInfo &DCI,
53800 const X86Subtarget &Subtarget) {
53801 auto *St = cast<MemIntrinsicSDNode>(N);
53802
53803 SDValue StoredVal = N->getOperand(1);
53804 MVT VT = StoredVal.getSimpleValueType();
53805 EVT MemVT = St->getMemoryVT();
53806
53807 // Figure out which elements we demand.
53808 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
53809 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
53810
53811 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53812 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
53813 if (N->getOpcode() != ISD::DELETED_NODE)
53814 DCI.AddToWorklist(N);
53815 return SDValue(N, 0);
53816 }
53817
53818 return SDValue();
53819}
53820
53821/// Return 'true' if this vector operation is "horizontal"
53822/// and return the operands for the horizontal operation in LHS and RHS. A
53823/// horizontal operation performs the binary operation on successive elements
53824/// of its first operand, then on successive elements of its second operand,
53825/// returning the resulting values in a vector. For example, if
53826/// A = < float a0, float a1, float a2, float a3 >
53827/// and
53828/// B = < float b0, float b1, float b2, float b3 >
53829/// then the result of doing a horizontal operation on A and B is
53830/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
53831/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
53832/// A horizontal-op B, for some already available A and B, and if so then LHS is
53833/// set to A, RHS to B, and the routine returns 'true'.
53834static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
53835 SelectionDAG &DAG, const X86Subtarget &Subtarget,
53836 bool IsCommutative,
53837 SmallVectorImpl<int> &PostShuffleMask,
53838 bool ForceHorizOp) {
53839 // If either operand is undef, bail out. The binop should be simplified.
53840 if (LHS.isUndef() || RHS.isUndef())
53841 return false;
53842
53843 // Look for the following pattern:
53844 // A = < float a0, float a1, float a2, float a3 >
53845 // B = < float b0, float b1, float b2, float b3 >
53846 // and
53847 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
53848 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
53849 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
53850 // which is A horizontal-op B.
53851
53852 MVT VT = LHS.getSimpleValueType();
53853 assert((VT.is128BitVector() || VT.is256BitVector()) &&
53854 "Unsupported vector type for horizontal add/sub");
53855 unsigned NumElts = VT.getVectorNumElements();
53856
53857 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
53858 SmallVectorImpl<int> &ShuffleMask) {
53859 bool UseSubVector = false;
53860 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53861 Op.getOperand(0).getValueType().is256BitVector() &&
53862 llvm::isNullConstant(Op.getOperand(1))) {
53863 Op = Op.getOperand(0);
53864 UseSubVector = true;
53865 }
53866 SDValue BC = peekThroughBitcasts(Op);
53867 SmallVector<int, 16> SrcMask, ScaledMask;
53868 SmallVector<SDValue, 2> SrcOps;
53869 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
53870 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
53871 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
53872 })) {
53873 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
53874 if (!UseSubVector && SrcOps.size() <= 2 &&
53875 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
53876 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
53877 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
53878 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
53879 }
53880 if (UseSubVector && SrcOps.size() == 1 &&
53881 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
53882 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
53883 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
53884 ShuffleMask.assign(Mask.begin(), Mask.end());
53885 }
53886 }
53887 };
53888
53889 // View LHS in the form
53890 // LHS = VECTOR_SHUFFLE A, B, LMask
53891 // If LHS is not a shuffle, then pretend it is the identity shuffle:
53892 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
53893 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
53894 SDValue A, B;
53895 SmallVector<int, 16> LMask;
53896 GetShuffle(LHS, A, B, LMask);
53897
53898 // Likewise, view RHS in the form
53899 // RHS = VECTOR_SHUFFLE C, D, RMask
53900 SDValue C, D;
53901 SmallVector<int, 16> RMask;
53902 GetShuffle(RHS, C, D, RMask);
53903
53904 // At least one of the operands should be a vector shuffle.
53905 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
53906 if (NumShuffles == 0)
53907 return false;
53908
53909 if (LMask.empty()) {
53910 A = LHS;
53911 for (unsigned i = 0; i != NumElts; ++i)
53912 LMask.push_back(i);
53913 }
53914
53915 if (RMask.empty()) {
53916 C = RHS;
53917 for (unsigned i = 0; i != NumElts; ++i)
53918 RMask.push_back(i);
53919 }
53920
53921 // If we have a unary mask, ensure the other op is set to null.
53922 if (isUndefOrInRange(LMask, 0, NumElts))
53923 B = SDValue();
53924 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
53925 A = SDValue();
53926
53927 if (isUndefOrInRange(RMask, 0, NumElts))
53928 D = SDValue();
53929 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
53930 C = SDValue();
53931
53932 // If A and B occur in reverse order in RHS, then canonicalize by commuting
53933 // RHS operands and shuffle mask.
53934 if (A != C) {
53935 std::swap(C, D);
53936 ShuffleVectorSDNode::commuteMask(RMask);
53937 }
53938 // Check that the shuffles are both shuffling the same vectors.
53939 if (!(A == C && B == D))
53940 return false;
53941
53942 PostShuffleMask.clear();
53943 PostShuffleMask.append(NumElts, SM_SentinelUndef);
53944
53945 // LHS and RHS are now:
53946 // LHS = shuffle A, B, LMask
53947 // RHS = shuffle A, B, RMask
53948 // Check that the masks correspond to performing a horizontal operation.
53949 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
53950 // so we just repeat the inner loop if this is a 256-bit op.
53951 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
53952 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
53953 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
53954 assert((NumEltsPer128BitChunk % 2 == 0) &&
53955 "Vector type should have an even number of elements in each lane");
53956 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
53957 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
53958 // Ignore undefined components.
53959 int LIdx = LMask[i + j], RIdx = RMask[i + j];
53960 if (LIdx < 0 || RIdx < 0 ||
53961 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
53962 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
53963 continue;
53964
53965 // Check that successive odd/even elements are being operated on. If not,
53966 // this is not a horizontal operation.
53967 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
53968 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
53969 return false;
53970
53971 // Compute the post-shuffle mask index based on where the element
53972 // is stored in the HOP result, and where it needs to be moved to.
53973 int Base = LIdx & ~1u;
53974 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
53975 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
53976
53977 // The low half of the 128-bit result must choose from A.
53978 // The high half of the 128-bit result must choose from B,
53979 // unless B is undef. In that case, we are always choosing from A.
53980 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
53981 Index += NumEltsPer64BitChunk;
53982 PostShuffleMask[i + j] = Index;
53983 }
53984 }
53985
53986 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
53987 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
53988
53989 bool IsIdentityPostShuffle =
53990 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
53991 if (IsIdentityPostShuffle)
53992 PostShuffleMask.clear();
53993
53994 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
53995 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
53996 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
53997 return false;
53998
53999 // If the source nodes are already used in HorizOps then always accept this.
54000 // Shuffle folding should merge these back together.
54001 auto FoundHorizUser = [&](SDNode *User) {
54002 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
54003 };
54004 ForceHorizOp =
54005 ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) &&
54006 llvm::any_of(NewRHS->users(), FoundHorizUser));
54007
54008 // Assume a SingleSource HOP if we only shuffle one input and don't need to
54009 // shuffle the result.
54010 if (!ForceHorizOp &&
54011 !shouldUseHorizontalOp(NewLHS == NewRHS &&
54012 (NumShuffles < 2 || !IsIdentityPostShuffle),
54013 DAG, Subtarget))
54014 return false;
54015
54016 LHS = DAG.getBitcast(VT, NewLHS);
54017 RHS = DAG.getBitcast(VT, NewRHS);
54018 return true;
54019}
54020
54021// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
54022 static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
54023 const X86Subtarget &Subtarget) {
54024 EVT VT = N->getValueType(0);
54025 unsigned Opcode = N->getOpcode();
54026 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
54027 SmallVector<int, 8> PostShuffleMask;
54028
54029 auto MergableHorizOp = [N](unsigned HorizOpcode) {
54030 return N->hasOneUse() &&
54031 N->user_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
54032 (N->user_begin()->getOperand(0).getOpcode() == HorizOpcode ||
54033 N->user_begin()->getOperand(1).getOpcode() == HorizOpcode);
54034 };
54035
54036 switch (Opcode) {
54037 case ISD::FADD:
54038 case ISD::FSUB:
54039 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
54040 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
54041 SDValue LHS = N->getOperand(0);
54042 SDValue RHS = N->getOperand(1);
54043 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
54044 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54045 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54046 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
54047 if (!PostShuffleMask.empty())
54048 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54049 DAG.getUNDEF(VT), PostShuffleMask);
54050 return HorizBinOp;
54051 }
54052 }
54053 break;
54054 case ISD::ADD:
54055 case ISD::SUB:
54056 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
54057 VT == MVT::v16i16 || VT == MVT::v8i32)) {
54058 SDValue LHS = N->getOperand(0);
54059 SDValue RHS = N->getOperand(1);
54060 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
54061 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54062 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54063 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
54064 ArrayRef<SDValue> Ops) {
54065 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
54066 };
54067 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
54068 {LHS, RHS}, HOpBuilder);
54069 if (!PostShuffleMask.empty())
54070 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54071 DAG.getUNDEF(VT), PostShuffleMask);
54072 return HorizBinOp;
54073 }
54074 }
54075 break;
54076 }
54077
54078 return SDValue();
54079}
54080
54081// Try to combine the following nodes
54082// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
54083// <i32 -2147483648[float -0.000000e+00]> 0
54084// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
54085// <(load 4 from constant-pool)> t0, t29
54086// [t30: v16i32 = bitcast t27]
54087// t6: v16i32 = xor t7, t27[t30]
54088// t11: v16f32 = bitcast t6
54089// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
54090// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
54091// t22: v16f32 = bitcast t7
54092// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
54093// t24: v32f16 = bitcast t23
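// i.e. when one multiplicand is conjugated by an explicit XOR that flips only
// the sign bits of the imaginary halves, drop the XOR and switch between the
// VFMULC and VFCMULC forms.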
54094 static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
54095 const X86Subtarget &Subtarget) {
54096 EVT VT = N->getValueType(0);
54097 SDValue LHS = N->getOperand(0);
54098 SDValue RHS = N->getOperand(1);
54099 int CombineOpcode =
54100 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
54101 auto combineConjugation = [&](SDValue &r) {
54102 if (LHS->getOpcode() == ISD::BITCAST) {
54103 SDValue XOR = LHS.getOperand(0);
54104 if (XOR->getOpcode() == ISD::XOR) {
54105 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
54106 if (XORRHS.isConstant()) {
54107 APInt ConjugationInt32 = APInt(32, 0x80000000);
54108 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL);
54109 if ((XORRHS.getBitWidth() == 32 &&
54110 XORRHS.getConstant() == ConjugationInt32) ||
54111 (XORRHS.getBitWidth() == 64 &&
54112 XORRHS.getConstant() == ConjugationInt64)) {
54113 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
54114 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
54115 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
54116 r = DAG.getBitcast(VT, FCMulC);
54117 return true;
54118 }
54119 }
54120 }
54121 }
54122 return false;
54123 };
54124 SDValue Res;
54125 if (combineConjugation(Res))
54126 return Res;
54127 std::swap(LHS, RHS);
54128 if (combineConjugation(Res))
54129 return Res;
54130 return Res;
54131}
54132
54133// Try to combine the following nodes:
54134// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
54135 static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
54136 const X86Subtarget &Subtarget) {
54137 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
54138 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
54139 Flags.hasAllowContract();
54140 };
54141
54142 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
54143 return DAG.getTarget().Options.NoSignedZerosFPMath ||
54144 Flags.hasNoSignedZeros();
54145 };
54146 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
54147 APInt AI = APInt(32, 0x80008000);
54148 KnownBits Bits = DAG.computeKnownBits(Op);
54149 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
54150 Bits.getConstant() == AI;
54151 };
54152
54153 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
54154 !AllowContract(N->getFlags()))
54155 return SDValue();
54156
54157 EVT VT = N->getValueType(0);
54158 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
54159 return SDValue();
54160
54161 SDValue LHS = N->getOperand(0);
54162 SDValue RHS = N->getOperand(1);
54163 bool IsConj;
54164 SDValue FAddOp1, MulOp0, MulOp1;
54165 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
54166 &IsVectorAllNegativeZero,
54167 &HasNoSignedZero](SDValue N) -> bool {
54168 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
54169 return false;
54170 SDValue Op0 = N.getOperand(0);
54171 unsigned Opcode = Op0.getOpcode();
54172 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
54173 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
54174 MulOp0 = Op0.getOperand(0);
54175 MulOp1 = Op0.getOperand(1);
54176 IsConj = Opcode == X86ISD::VFCMULC;
54177 return true;
54178 }
54179 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
54180 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
54181 HasNoSignedZero(Op0->getFlags())) ||
54182 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
54183 MulOp0 = Op0.getOperand(0);
54184 MulOp1 = Op0.getOperand(1);
54185 IsConj = Opcode == X86ISD::VFCMADDC;
54186 return true;
54187 }
54188 }
54189 return false;
54190 };
54191
54192 if (GetCFmulFrom(LHS))
54193 FAddOp1 = RHS;
54194 else if (GetCFmulFrom(RHS))
54195 FAddOp1 = LHS;
54196 else
54197 return SDValue();
54198
54199 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
54200 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
54201 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
54202 // FIXME: How do we handle when fast math flags of FADD are different from
54203 // CFMUL's?
54204 SDValue CFmul =
54205 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
54206 return DAG.getBitcast(VT, CFmul);
54207}
54208
54209/// Do target-specific dag combines on floating-point adds/subs.
54210 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
54211 const X86Subtarget &Subtarget) {
54212 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
54213 return HOp;
54214
54215 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
54216 return COp;
54217
54218 return SDValue();
54219}
54220
54221 static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
54222 const X86Subtarget &Subtarget) {
54223 EVT VT = N->getValueType(0);
54224 SDValue Src = N->getOperand(0);
54225 EVT SrcVT = Src.getValueType();
54226 SDLoc DL(N);
54227
54228 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54229
54230 // Let legalize expand this if it isn't a legal type yet.
54231 if (!TLI.isTypeLegal(VT))
54232 return SDValue();
54233
54234 if ((SrcVT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) ||
54235 (SrcVT.getScalarType() == MVT::f32 && !Subtarget.hasDQI()))
54236 return SDValue();
54237
54238 if (SrcVT == MVT::v2f16) {
54239 SrcVT = MVT::v4f16;
54240 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54241 DAG.getUNDEF(MVT::v2f16));
54242 }
54243
54244 if (SrcVT == MVT::v4f16) {
54245 SrcVT = MVT::v8f16;
54246 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54247 DAG.getUNDEF(MVT::v4f16));
54248 } else if (SrcVT == MVT::v2f32) {
54249 SrcVT = MVT::v4f32;
54250 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54251 DAG.getUNDEF(MVT::v2f32));
54252 } else {
54253 return SDValue();
54254 }
54255
54256 return DAG.getNode(X86ISD::CVTP2SI, DL, VT, Src);
54257}
54258
54259// Attempt to fold some (truncate (srl (add/or/xor X, C1), C2)) patterns to
54260// (add/or/xor (truncate (srl X, C2)), C1'). C1' will be smaller than C1 so we
54261// are able to avoid generating code with MOVABS and large constants in certain
54262// cases.
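// e.g. (i32 (trunc (srl (or X:i64, 0x1234567800000000), 40)))
// -> (or (i32 (trunc (srl X, 40))), 0x123456)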
54263 static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
54264 const SDLoc &DL) {
54265 assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode");
54266 std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N);
54267 if (!ValidSrlConst)
54268 return SDValue();
54269 unsigned SrlConstVal = *ValidSrlConst;
54270
54271 SDValue Op = N.getOperand(0);
54272 unsigned Opcode = Op.getOpcode();
54273 assert(VT == MVT::i32 && Op.getValueType() == MVT::i64 &&
54274 "Illegal truncation types");
54275
54276 if ((Opcode != ISD::ADD && Opcode != ISD::OR && Opcode != ISD::XOR) ||
54277 !isa<ConstantSDNode>(Op.getOperand(1)))
54278 return SDValue();
54279 const APInt &OpConst = Op.getConstantOperandAPInt(1);
54280
54281 if (SrlConstVal <= 32 ||
54282 (Opcode == ISD::ADD && OpConst.countr_zero() < SrlConstVal))
54283 return SDValue();
54284
54285 SDValue OpLhsSrl =
54286 DAG.getNode(ISD::SRL, DL, MVT::i64, Op.getOperand(0), N.getOperand(1));
54287 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, OpLhsSrl);
54288
54289 APInt NewOpConstVal = OpConst.lshr(SrlConstVal).trunc(VT.getSizeInBits());
54290 SDValue NewOpConst = DAG.getConstant(NewOpConstVal, DL, VT);
54291 SDValue NewOpNode = DAG.getNode(Opcode, DL, VT, Trunc, NewOpConst);
54292
54293 if (Opcode == ISD::ADD) {
54294 EVT CleanUpVT = EVT::getIntegerVT(*DAG.getContext(), 64 - SrlConstVal);
54295 return DAG.getZeroExtendInReg(NewOpNode, DL, CleanUpVT);
54296 }
54297 return NewOpNode;
54298}
54299
54300/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
54301/// the codegen.
54302/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
54303/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
54304/// anything that is guaranteed to be transformed by DAGCombiner.
54305 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
54306 const X86Subtarget &Subtarget,
54307 const SDLoc &DL) {
54308 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
54309 SDValue Src = N->getOperand(0);
54310 unsigned SrcOpcode = Src.getOpcode();
54311 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54312
54313 EVT VT = N->getValueType(0);
54314 EVT SrcVT = Src.getValueType();
54315
54316 auto IsFreeTruncation = [VT](SDValue Op) {
54317 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
54318
54319 // See if this has been extended from a smaller/equal size to
54320 // the truncation size, allowing a truncation to combine with the extend.
54321 unsigned Opcode = Op.getOpcode();
54322 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
54323 Opcode == ISD::ZERO_EXTEND) &&
54324 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
54325 return true;
54326
54327 // See if this is a single use constant which can be constant folded.
54328 // NOTE: We don't peek through bitcasts here because there is currently
54329 // no support for constant folding truncate+bitcast+vector_of_constants. So
54330 // we'll just end up with a truncate on both operands which will
54331 // get turned back into (truncate (binop)) causing an infinite loop.
54332 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54333 };
54334
54335 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
54336 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
54337 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
54338 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
54339 };
54340
54341 // Don't combine if the operation has other uses.
54342 if (!Src.hasOneUse())
54343 return SDValue();
54344
54345 if (VT == MVT::i32 && SrcVT == MVT::i64 && SrcOpcode == ISD::SRL)
54346 return combinei64TruncSrlConstant(Src, VT, DAG, DL);
54347
54348 if (!VT.isVector())
54349 return SDValue();
54350
54351 // In most cases it's only worth pre-truncating if we're only facing the cost
54352 // of one truncation.
54353 // i.e. if one of the inputs will constant fold or the input is repeated.
54354 switch (SrcOpcode) {
54355 case ISD::MUL:
54356 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
54357 // better to truncate if we have the chance.
54358 if (SrcVT.getScalarType() == MVT::i64 &&
54359 TLI.isOperationLegal(SrcOpcode, VT) &&
54360 !TLI.isOperationLegal(SrcOpcode, SrcVT))
54361 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
54362 [[fallthrough]];
54363 case ISD::AND:
54364 case ISD::XOR:
54365 case ISD::OR:
54366 case ISD::ADD:
54367 case ISD::SUB: {
54368 SDValue Op0 = Src.getOperand(0);
54369 SDValue Op1 = Src.getOperand(1);
54370 if (TLI.isOperationLegal(SrcOpcode, VT) &&
54371 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
54372 return TruncateArithmetic(Op0, Op1);
54373 break;
54374 }
54375 }
54376
54377 return SDValue();
54378}
54379
54380// Try to form a MULHU or MULHS node by looking for
54381// (trunc (srl (mul ext, ext), >= 16))
54382// TODO: This is X86 specific because we want to be able to handle wide types
54383// before type legalization. But we can only do it if the vector will be
54384// legalized via widening/splitting. Type legalization can't handle promotion
54385// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
54386// combiner.
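// e.g. (v8i16 (trunc (srl (mul (zext v8i16 X to v8i32),
// (zext v8i16 Y to v8i32)), 16))) -> (mulhu X, Y)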
54387static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
54388 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
54389 using namespace llvm::SDPatternMatch;
54390
54391 if (!Subtarget.hasSSE2())
54392 return SDValue();
54393
54394 // Only handle vXi16 types that are at least 128-bits unless they will be
54395 // widened.
54396 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
54397 return SDValue();
54398
54399 // Input type should be at least vXi32.
54400 EVT InVT = Src.getValueType();
54401 if (InVT.getVectorElementType().getSizeInBits() < 32)
54402 return SDValue();
54403
54404 // First instruction should be a right shift by 16 of a multiply.
54405 SDValue LHS, RHS;
54406 APInt ShiftAmt;
54407 if (!sd_match(Src,
54408 m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_ConstInt(ShiftAmt))))
54409 return SDValue();
54410
54411 if (ShiftAmt.ult(16) || ShiftAmt.uge(InVT.getScalarSizeInBits()))
54412 return SDValue();
54413
54414 uint64_t AdditionalShift = ShiftAmt.getZExtValue() - 16;
54415
54416 // Count leading sign/zero bits on both inputs - if there are enough then
54417 // truncation back to vXi16 will be cheap - either as a pack/shuffle
54418 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
54419 // truncations may actually be free by peeking through to the ext source.
54420 auto IsSext = [&DAG](SDValue V) {
54421 return DAG.ComputeMaxSignificantBits(V) <= 16;
54422 };
54423 auto IsZext = [&DAG](SDValue V) {
54424 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
54425 };
54426
54427 bool IsSigned = IsSext(LHS) && IsSext(RHS);
54428 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
54429 if (!IsSigned && !IsUnsigned)
54430 return SDValue();
54431
54432 // Check if both inputs are extensions, which will be removed by truncation.
54433 auto isOpTruncateFree = [](SDValue Op) {
54434 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
54435 Op.getOpcode() == ISD::ZERO_EXTEND)
54436 return Op.getOperand(0).getScalarValueSizeInBits() <= 16;
54437 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54438 };
54439 bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS);
54440
54441 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
54442 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
54443 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
54444 // will have to split anyway.
54445 unsigned InSizeInBits = InVT.getSizeInBits();
54446 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
54447 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
54448 (InSizeInBits % 16) == 0) {
54449 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54450 InVT.getSizeInBits() / 16);
54451 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
54452 DAG.getBitcast(BCVT, RHS));
54453 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
54454 return DAG.getNode(ISD::SRL, DL, VT, Res,
54455 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54456 }
54457
54458 // Truncate back to source type.
54459 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
54460 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
54461
54462 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
54463 SDValue Res = DAG.getNode(Opc, DL, VT, LHS, RHS);
54464 return DAG.getNode(ISD::SRL, DL, VT, Res,
54465 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54466}
54467
54468// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
54469// from one vector with signed bytes from another vector, adds together
54470// adjacent pairs of 16-bit products, and saturates the result before
54471// truncating to 16-bits.
54472//
54473// Which looks something like this:
54474// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
54475// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
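// For a v8i16 result this means lane i becomes
// sat16(zext(A[2*i]) * sext(B[2*i]) + zext(A[2*i+1]) * sext(B[2*i+1])),
// which is exactly what PMADDUBSW computes per pair of bytes.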
54476 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
54477 const X86Subtarget &Subtarget,
54478 const SDLoc &DL) {
54479 if (!VT.isVector() || !Subtarget.hasSSSE3())
54480 return SDValue();
54481
54482 unsigned NumElems = VT.getVectorNumElements();
54483 EVT ScalarVT = VT.getVectorElementType();
54484 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
54485 return SDValue();
54486
54487 SDValue SSatVal = detectSSatPattern(In, VT);
54488 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
54489 return SDValue();
54490
54491 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
54492 // of multiplies from even/odd elements.
54493 SDValue N0 = SSatVal.getOperand(0);
54494 SDValue N1 = SSatVal.getOperand(1);
54495
54496 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54497 return SDValue();
54498
54499 SDValue N00 = N0.getOperand(0);
54500 SDValue N01 = N0.getOperand(1);
54501 SDValue N10 = N1.getOperand(0);
54502 SDValue N11 = N1.getOperand(1);
54503
54504 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
54505 // Canonicalize zero_extend to LHS.
54506 if (N01.getOpcode() == ISD::ZERO_EXTEND)
54507 std::swap(N00, N01);
54508 if (N11.getOpcode() == ISD::ZERO_EXTEND)
54509 std::swap(N10, N11);
54510
54511 // Ensure we have a zero_extend and a sign_extend.
54512 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
54513 N01.getOpcode() != ISD::SIGN_EXTEND ||
54514 N10.getOpcode() != ISD::ZERO_EXTEND ||
54515 N11.getOpcode() != ISD::SIGN_EXTEND)
54516 return SDValue();
54517
54518 // Peek through the extends.
54519 N00 = N00.getOperand(0);
54520 N01 = N01.getOperand(0);
54521 N10 = N10.getOperand(0);
54522 N11 = N11.getOperand(0);
54523
54524 // Ensure the extend is from vXi8.
54525 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
54526 N01.getValueType().getVectorElementType() != MVT::i8 ||
54527 N10.getValueType().getVectorElementType() != MVT::i8 ||
54528 N11.getValueType().getVectorElementType() != MVT::i8)
54529 return SDValue();
54530
54531 // All inputs should be build_vectors.
54532 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54533 N01.getOpcode() != ISD::BUILD_VECTOR ||
54534 N10.getOpcode() != ISD::BUILD_VECTOR ||
54535 N11.getOpcode() != ISD::BUILD_VECTOR)
54536 return SDValue();
54537
54538 // N00/N10 are zero extended. N01/N11 are sign extended.
54539
54540 // For each element, we need to ensure we have an odd element from one vector
54541 // multiplied by the odd element of another vector and the even element from
54542 // one of the same vectors being multiplied by the even element from the
54543 // other vector. So we need to make sure for each element i, this operator
54544 // is being performed:
54545 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54546 SDValue ZExtIn, SExtIn;
54547 for (unsigned i = 0; i != NumElems; ++i) {
54548 SDValue N00Elt = N00.getOperand(i);
54549 SDValue N01Elt = N01.getOperand(i);
54550 SDValue N10Elt = N10.getOperand(i);
54551 SDValue N11Elt = N11.getOperand(i);
54552 // TODO: Be more tolerant to undefs.
54553 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54554 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54555 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54556 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54557 return SDValue();
54558 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
54559 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
54560 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
54561 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
54562 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
54563 return SDValue();
54564 unsigned IdxN00 = ConstN00Elt->getZExtValue();
54565 unsigned IdxN01 = ConstN01Elt->getZExtValue();
54566 unsigned IdxN10 = ConstN10Elt->getZExtValue();
54567 unsigned IdxN11 = ConstN11Elt->getZExtValue();
54568 // Add is commutative so indices can be reordered.
54569 if (IdxN00 > IdxN10) {
54570 std::swap(IdxN00, IdxN10);
54571 std::swap(IdxN01, IdxN11);
54572 }
54573 // N0 indices must be the even element. N1 indices must be the next odd element.
54574 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
54575 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
54576 return SDValue();
54577 SDValue N00In = N00Elt.getOperand(0);
54578 SDValue N01In = N01Elt.getOperand(0);
54579 SDValue N10In = N10Elt.getOperand(0);
54580 SDValue N11In = N11Elt.getOperand(0);
54581 // First time we find an input capture it.
54582 if (!ZExtIn) {
54583 ZExtIn = N00In;
54584 SExtIn = N01In;
54585 }
54586 if (ZExtIn != N00In || SExtIn != N01In ||
54587 ZExtIn != N10In || SExtIn != N11In)
54588 return SDValue();
54589 }
54590
54591 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
54592 EVT ExtVT = Ext.getValueType();
54593 if (ExtVT.getVectorNumElements() != NumElems * 2) {
54594 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
54595 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
54596 DAG.getVectorIdxConstant(0, DL));
54597 }
54598 };
54599 ExtractVec(ZExtIn);
54600 ExtractVec(SExtIn);
54601
54602 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54603 ArrayRef<SDValue> Ops) {
54604 // Shrink by adding truncate nodes and let DAGCombine fold with the
54605 // sources.
54606 EVT InVT = Ops[0].getValueType();
54607 assert(InVT.getScalarType() == MVT::i8 &&
54608 "Unexpected scalar element type");
54609 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54610 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54611 InVT.getVectorNumElements() / 2);
54612 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
54613 };
54614 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
54615 PMADDBuilder);
54616}
54617
54618 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
54619 const X86Subtarget &Subtarget) {
54620 EVT VT = N->getValueType(0);
54621 SDValue Src = N->getOperand(0);
54622 SDLoc DL(N);
54623
54624 // Attempt to pre-truncate inputs to arithmetic ops instead.
54625 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
54626 return V;
54627
54628 // Try to detect PMADD
54629 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
54630 return PMAdd;
54631
54632 // Try to combine truncation with signed/unsigned saturation.
54633 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
54634 return Val;
54635
54636 // Try to combine PMULHUW/PMULHW for vXi16.
54637 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
54638 return V;
54639
54640 // The bitcast source is a direct mmx result.
54641 // Detect bitcasts between i32 to x86mmx
54642 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
54643 SDValue BCSrc = Src.getOperand(0);
54644 if (BCSrc.getValueType() == MVT::x86mmx)
54645 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
54646 }
54647
54648 return SDValue();
54649}
54650
54651 static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
54652 TargetLowering::DAGCombinerInfo &DCI) {
54653 EVT VT = N->getValueType(0);
54654 SDValue In = N->getOperand(0);
54655 SDLoc DL(N);
54656
54657 if (SDValue SSatVal = detectSSatPattern(In, VT))
54658 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
54659 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
54660 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
54661
54662 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54663 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
54664 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54665 return SDValue(N, 0);
54666
54667 return SDValue();
54668}
54669
54670/// Returns the negated value if the node \p N flips sign of FP value.
54671///
54672/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
54673/// or FSUB(0, x)
54674/// AVX512F does not have FXOR, so FNEG is lowered as
54675/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
54676 /// In this case we go through all bitcasts.
54677/// This also recognizes splat of a negated value and returns the splat of that
54678/// value.
54679static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
54680 if (N->getOpcode() == ISD::FNEG)
54681 return N->getOperand(0);
54682
54683 // Don't recurse exponentially.
54684 if (Depth > SelectionDAG::MaxRecursionDepth)
54685 return SDValue();
54686
54687 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
54688
54690 EVT VT = Op->getValueType(0);
54691
54692 // Make sure the element size doesn't change.
54693 if (VT.getScalarSizeInBits() != ScalarSize)
54694 return SDValue();
54695
54696 unsigned Opc = Op.getOpcode();
54697 switch (Opc) {
54698 case ISD::VECTOR_SHUFFLE: {
54699 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
54700 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
54701 if (!Op.getOperand(1).isUndef())
54702 return SDValue();
54703 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
54704 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
54705 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
54706 cast<ShuffleVectorSDNode>(Op)->getMask());
54707 break;
54708 }
54709 case ISD::INSERT_VECTOR_ELT: {
54710 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
54711 // -V, INDEX).
54712 SDValue InsVector = Op.getOperand(0);
54713 SDValue InsVal = Op.getOperand(1);
54714 if (!InsVector.isUndef())
54715 return SDValue();
54716 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
54717 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
54718 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
54719 NegInsVal, Op.getOperand(2));
54720 break;
54721 }
54722 case ISD::FSUB:
54723 case ISD::XOR:
54724 case X86ISD::FXOR: {
54725 SDValue Op1 = Op.getOperand(1);
54726 SDValue Op0 = Op.getOperand(0);
54727
54728 // For XOR and FXOR, we want to check if constant
54729 // bits of Op1 are sign bit masks. For FSUB, we
54730 // have to check if constant bits of Op0 are sign
54731 // bit masks and hence we swap the operands.
54732 if (Opc == ISD::FSUB)
54733 std::swap(Op0, Op1);
54734
54735 APInt UndefElts;
54736 SmallVector<APInt, 16> EltBits;
54737 // Extract constant bits and see if they are all
54738 // sign bit masks. Ignore the undef elements.
54739 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
54740 /* AllowWholeUndefs */ true,
54741 /* AllowPartialUndefs */ false)) {
54742 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
54743 if (!UndefElts[I] && !EltBits[I].isSignMask())
54744 return SDValue();
54745
54746 // Only allow bitcast from correctly-sized constant.
54747 Op0 = peekThroughBitcasts(Op0);
54748 if (Op0.getScalarValueSizeInBits() == ScalarSize)
54749 return Op0;
54750 }
54751 break;
54752 } // case
54753 } // switch
54754
54755 return SDValue();
54756}
54757
54758static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
54759 bool NegRes) {
54760 if (NegMul) {
54761 switch (Opcode) {
54762 // clang-format off
54763 default: llvm_unreachable("Unexpected opcode");
54764 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
54765 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
54766 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
54767 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
54768 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
54769 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
54770 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
54771 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
54772 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
54773 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
54774 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
54775 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
54776 // clang-format on
54777 }
54778 }
54779
54780 if (NegAcc) {
54781 switch (Opcode) {
54782 // clang-format off
54783 default: llvm_unreachable("Unexpected opcode");
54784 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
54785 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
54786 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54787 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
54788 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
54789 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54790 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
54791 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
54792 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54793 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
54794 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
54795 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54796 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
54797 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
54798 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
54799 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
54800 // clang-format on
54801 }
54802 }
54803
54804 if (NegRes) {
54805 switch (Opcode) {
54806 // For accuracy reasons, we never combine fneg and fma under strict FP.
54807 // clang-format off
54808 default: llvm_unreachable("Unexpected opcode");
54809 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
54810 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54811 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
54812 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54813 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
54814 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54815 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
54816 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54817 // clang-format on
54818 }
54819 }
54820
54821 return Opcode;
54822}
54823
54824/// Do target-specific dag combines on floating point negations.
54825 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
54826 TargetLowering::DAGCombinerInfo &DCI,
54827 const X86Subtarget &Subtarget) {
54828 EVT OrigVT = N->getValueType(0);
54829 SDValue Arg = isFNEG(DAG, N);
54830 if (!Arg)
54831 return SDValue();
54832
54833 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54834 EVT VT = Arg.getValueType();
54835 EVT SVT = VT.getScalarType();
54836 SDLoc DL(N);
54837
54838 // Let legalize expand this if it isn't a legal type yet.
54839 if (!TLI.isTypeLegal(VT))
54840 return SDValue();
54841
54842 // If we're negating a FMUL node on a target with FMA, then we can avoid the
54843 // use of a constant by performing (-0 - A*B) instead.
54844 // FIXME: Check rounding control flags as well once it becomes available.
54845 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
54846 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
54847 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
54848 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
54849 Arg.getOperand(1), Zero);
54850 return DAG.getBitcast(OrigVT, NewNode);
54851 }
54852
54853 bool CodeSize = DAG.shouldOptForSize();
54854 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54855 if (SDValue NegArg =
54856 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
54857 return DAG.getBitcast(OrigVT, NegArg);
54858
54859 return SDValue();
54860}
54861
54862 SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
54863 bool LegalOperations,
54864 bool ForCodeSize,
54865 NegatibleCost &Cost,
54866 unsigned Depth) const {
54867 // fneg patterns are removable even if they have multiple uses.
54868 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
54869 Cost = NegatibleCost::Cheaper;
54870 return DAG.getBitcast(Op.getValueType(), Arg);
54871 }
54872
54873 EVT VT = Op.getValueType();
54874 EVT SVT = VT.getScalarType();
54875 unsigned Opc = Op.getOpcode();
54876 SDNodeFlags Flags = Op.getNode()->getFlags();
54877 switch (Opc) {
54878 case ISD::FMA:
54879 case X86ISD::FMSUB:
54880 case X86ISD::FNMADD:
54881 case X86ISD::FNMSUB:
54882 case X86ISD::FMADD_RND:
54883 case X86ISD::FMSUB_RND:
54884 case X86ISD::FNMADD_RND:
54885 case X86ISD::FNMSUB_RND: {
54886 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
54887 !(SVT == MVT::f32 || SVT == MVT::f64) ||
54888 !isOperationLegal(ISD::FMA, VT))
54889 break;
54890
54891 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
54892 // if it may have signed zeros.
54893 if (!Flags.hasNoSignedZeros())
54894 break;
54895
54896 // Because getCheaperNegatedExpression can delete nodes we need a handle to
54897 // keep temporary nodes alive.
54898 std::list<HandleSDNode> Handles;
54899
54900 // This is always negatible for free but we might be able to remove some
54901 // extra operand negations as well.
54902 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
54903 for (int i = 0; i != 3; ++i) {
54904 NewOps[i] = getCheaperNegatedExpression(
54905 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
54906 if (!!NewOps[i])
54907 Handles.emplace_back(NewOps[i]);
54908 }
54909
54910 bool NegA = !!NewOps[0];
54911 bool NegB = !!NewOps[1];
54912 bool NegC = !!NewOps[2];
54913 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
54914
54915 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
54916 : NegatibleCost::Neutral;
54917
54918 // Fill in the non-negated ops with the original values.
54919 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
54920 if (!NewOps[i])
54921 NewOps[i] = Op.getOperand(i);
54922 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
54923 }
54924 case X86ISD::FRCP:
54925 if (SDValue NegOp0 =
54926 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
54927 ForCodeSize, Cost, Depth + 1))
54928 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
54929 break;
54930 }
54931
54932 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
54933 ForCodeSize, Cost, Depth);
54934}
54935
54936 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
54937 const X86Subtarget &Subtarget) {
54938 MVT VT = N->getSimpleValueType(0);
54939 // If we have integer vector types available, use the integer opcodes.
54940 if (!VT.isVector() || !Subtarget.hasSSE2())
54941 return SDValue();
54942
54943 SDLoc dl(N);
54944 MVT IntVT = VT.changeVectorElementTypeToInteger();
54945 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
54946 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
54947 unsigned IntOpcode;
54948 switch (N->getOpcode()) {
54949 // clang-format off
54950 default: llvm_unreachable("Unexpected FP logic op");
54951 case X86ISD::FOR: IntOpcode = ISD::OR; break;
54952 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
54953 case X86ISD::FAND: IntOpcode = ISD::AND; break;
54954 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
54955 // clang-format on
54956 }
54957 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
54958 return DAG.getBitcast(VT, IntOp);
54959}
54960
54961/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
54962 static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) {
54963 if (N->getOpcode() != ISD::XOR)
54964 return SDValue();
54965
54966 SDValue LHS = N->getOperand(0);
54967 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
54968 return SDValue();
54969
54970 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
54971 X86::CondCode(LHS->getConstantOperandVal(0)));
54972 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
54973}
54974
54975 static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
54976 const X86Subtarget &Subtarget) {
54977 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
54978 "Invalid opcode for combining with CTLZ");
54979 if (Subtarget.hasFastLZCNT())
54980 return SDValue();
54981
54982 EVT VT = N->getValueType(0);
54983 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
54984 (VT != MVT::i64 || !Subtarget.is64Bit()))
54985 return SDValue();
54986
54987 SDValue N0 = N->getOperand(0);
54988 SDValue N1 = N->getOperand(1);
54989
54990 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
54991 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
54992 return SDValue();
54993
54994 SDValue OpCTLZ;
54995 SDValue OpSizeTM1;
54996
54997 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
54998 OpCTLZ = N1;
54999 OpSizeTM1 = N0;
55000 } else if (N->getOpcode() == ISD::SUB) {
55001 return SDValue();
55002 } else {
55003 OpCTLZ = N0;
55004 OpSizeTM1 = N1;
55005 }
55006
55007 if (!OpCTLZ.hasOneUse())
55008 return SDValue();
55009 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
55010 if (!C)
55011 return SDValue();
55012
55013 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
55014 return SDValue();
55015 EVT OpVT = VT;
55016 SDValue Op = OpCTLZ.getOperand(0);
55017 if (VT == MVT::i8) {
55018 // Zero extend to i32 since there is not an i8 bsr.
55019 OpVT = MVT::i32;
55020 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
55021 }
55022
55023 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
55024 Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
55025 if (VT == MVT::i8)
55026 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
55027
55028 return Op;
55029}
55030
55031 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
55032 TargetLowering::DAGCombinerInfo &DCI,
55033 const X86Subtarget &Subtarget) {
55034 SDValue N0 = N->getOperand(0);
55035 SDValue N1 = N->getOperand(1);
55036 EVT VT = N->getValueType(0);
55037 SDLoc DL(N);
55038
55039 // If this is SSE1 only convert to FXOR to avoid scalarization.
55040 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
55041 return DAG.getBitcast(MVT::v4i32,
55042 DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
55043 DAG.getBitcast(MVT::v4f32, N0),
55044 DAG.getBitcast(MVT::v4f32, N1)));
55045 }
55046
55047 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
55048 return Cmp;
55049
55050 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), DL, N0, N1, DAG))
55051 return R;
55052
55053 if (SDValue R = combineBitOpWithShift(N->getOpcode(), DL, VT, N0, N1, DAG))
55054 return R;
55055
55056 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), DL, VT, N0, N1, DAG))
55057 return R;
55058
55059 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), DL, VT, N0, N1,
55060 DAG, DCI, Subtarget))
55061 return FPLogic;
55062
55063 if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
55064 return R;
55065
55066 if (DCI.isBeforeLegalizeOps())
55067 return SDValue();
55068
55069 if (SDValue SetCC = foldXor1SetCC(N, DL, DAG))
55070 return SetCC;
55071
55072 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), DL, VT, N0, N1, DAG))
55073 return R;
55074
55075 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DL, DAG))
55076 return RV;
55077
55078 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
55079 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55080 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
55081 N0.getOperand(0).getValueType().isVector() &&
55082 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55083 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
55084 return DAG.getBitcast(
55085 VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
55086 }
55087
55088 // Handle AVX512 mask widening.
55089 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
55090 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
55091 VT.getVectorElementType() == MVT::i1 &&
55092 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
55093 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
55094 return DAG.getNode(
55095 ISD::INSERT_SUBVECTOR, DL, VT, N0.getOperand(0),
55096 DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
55097 N0.getOperand(2));
55098 }
55099
55100 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
55101 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
55102 // TODO: Under what circumstances could this be performed in DAGCombine?
55103 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
55104 N0.getOperand(0).getOpcode() == N->getOpcode()) {
55105 SDValue TruncExtSrc = N0.getOperand(0);
55106 auto *N1C = dyn_cast<ConstantSDNode>(N1);
55107 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
55108 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
55109 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
55110 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
55111 return DAG.getNode(ISD::XOR, DL, VT, LHS,
55112 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
55113 }
55114 }
55115
55116 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
55117 return R;
55118
55119 return combineFneg(N, DAG, DCI, Subtarget);
55120}
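// One of the folds above merges constants through a zext/truncate:
//   xor(zext(xor(x, c1)), c2) -> xor(zext(x), xor(zext(c1), c2))
// A small standalone check of the underlying scalar identity (values are made
// up for illustration; this is a sketch, not part of the combine):
//
//   #include <cassert>
//   #include <cstdint>
//   int main() {
//     const uint8_t C1 = 0x5a;
//     const uint32_t C2 = 0x1234;
//     for (unsigned v = 0; v != 256; ++v) {
//       uint8_t x = uint8_t(v);
//       uint32_t lhs = uint32_t(uint8_t(x ^ C1)) ^ C2;    // xor, zext, xor
//       uint32_t rhs = uint32_t(x) ^ (uint32_t(C1) ^ C2); // zext, merged xor
//       assert(lhs == rhs);
//     }
//   }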
55121
55124 const X86Subtarget &Subtarget) {
55125 SDValue N0 = N->getOperand(0);
55126 EVT VT = N->getValueType(0);
55127
55128 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
55129 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
55130 SDValue Src = N0.getOperand(0);
55131 EVT SrcVT = Src.getValueType();
55132 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
55133 (DCI.isBeforeLegalize() ||
55134 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
55135 Subtarget.hasSSSE3()) {
55136 unsigned NumElts = SrcVT.getVectorNumElements();
55137 SmallVector<int, 32> ReverseMask(NumElts);
55138 for (unsigned I = 0; I != NumElts; ++I)
55139 ReverseMask[I] = (NumElts - 1) - I;
55140 SDValue Rev =
55141 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
55142 return DAG.getBitcast(VT, Rev);
55143 }
55144 }
55145
55146 return SDValue();
55147}
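// A short worked illustration of the combine above, with vector element 0
// sitting in the least significant bit of the integer (a sketch of the
// intended semantics, not additional lowering code):
//   (v4i1 <a,b,c,d>)  --bitcast-->  i4 with bits  d c b a   (bit3..bit0)
//   bitreverse(i4)                  i4 with bits  a b c d
// which is exactly bitcast(v4i1 <d,c,b,a>), i.e. the element-reversed shuffle
// built from ReverseMask.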
55148
55149// Various combines to try to convert to avgceilu.
55152 const X86Subtarget &Subtarget) {
55153 unsigned Opcode = N->getOpcode();
55154 SDValue N0 = N->getOperand(0);
55155 SDValue N1 = N->getOperand(1);
55156 EVT VT = N->getValueType(0);
55157 EVT SVT = VT.getScalarType();
55158 SDLoc DL(N);
55159
55160 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
55161 // Only useful on vXi8 which doesn't have good SRA handling.
55162 if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
55163 APInt SignBit = APInt::getSignMask(8);
55164 SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
55165 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
55166 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
55167 return DAG.getNode(ISD::XOR, DL, VT,
55168 DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
55169 }
55170
55171 return SDValue();
55172}
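// The sign-flip trick above reduces a signed ceiling average to the unsigned
// one that vXi8 supports natively (PAVGB). A standalone sketch of the i8
// identity, exhaustively checked (illustrative only):
//
//   #include <cassert>
//   #include <cstdint>
//   int main() {
//     for (int a = -128; a <= 127; ++a)
//       for (int b = -128; b <= 127; ++b) {
//         int Signed = (a + b + 1) >> 1;                          // avgceils
//         uint8_t Ua = uint8_t(a) ^ 0x80, Ub = uint8_t(b) ^ 0x80; // flipsign
//         uint8_t Uavg = uint8_t((unsigned(Ua) + Ub + 1) >> 1);   // avgceilu
//         assert(uint8_t(Uavg ^ 0x80) == uint8_t(Signed));        // flip back
//       }
//   }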
55173
55176 const X86Subtarget &Subtarget) {
55177 EVT VT = N->getValueType(0);
55178 unsigned NumBits = VT.getSizeInBits();
55179
55180 // TODO - Constant Folding.
55181
55182 // Simplify the inputs.
55183 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55184 APInt DemandedMask(APInt::getAllOnes(NumBits));
55185 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55186 return SDValue(N, 0);
55187
55188 return SDValue();
55189}
55190
55191static bool isNullFPScalarOrVectorConst(SDValue V) {
55192 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
55193}
55194
55195/// If a value is a scalar FP zero or a vector FP zero (potentially including
55196/// undefined elements), return a zero constant that may be used to fold away
55197/// that value. In the case of a vector, the returned constant will not contain
55198/// undefined elements even if the input parameter does. This makes it suitable
55199/// to be used as a replacement operand with operations (eg, bitwise-and) where
55200/// an undef should not propagate.
55201static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
55202 const X86Subtarget &Subtarget) {
55203 if (!isNullFPScalarOrVectorConst(V))
55204 return SDValue();
55205
55206 if (V.getValueType().isVector())
55207 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
55208
55209 return V;
55210}
55211
55212static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
55213 const X86Subtarget &Subtarget) {
55214 SDValue N0 = N->getOperand(0);
55215 SDValue N1 = N->getOperand(1);
55216 EVT VT = N->getValueType(0);
55217 SDLoc DL(N);
55218
55219 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
55220 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
55221 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
55222 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
55223 return SDValue();
55224
55225 auto isAllOnesConstantFP = [](SDValue V) {
55226 if (V.getSimpleValueType().isVector())
55227 return ISD::isBuildVectorAllOnes(V.getNode());
55228 auto *C = dyn_cast<ConstantFPSDNode>(V);
55229 return C && C->getConstantFPValue()->isAllOnesValue();
55230 };
55231
55232 // fand (fxor X, -1), Y --> fandn X, Y
55233 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
55234 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
55235
55236 // fand X, (fxor Y, -1) --> fandn Y, X
55237 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
55238 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
55239
55240 return SDValue();
55241}
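// FANDN is a bitwise (~LHS & RHS) on the FP bit patterns, so the two rewrites
// above are just restatements of ~X & Y. A tiny standalone check on arbitrary
// bit patterns (a sketch; the values are made up):
//
//   #include <cassert>
//   #include <cstdint>
//   int main() {
//     uint32_t X = 0x3f800000u, Y = 0xc0490fdbu; // bit patterns of 1.0f, -pi
//     assert(((X ^ 0xffffffffu) & Y) == (~X & Y)); // fand(fxor(X,-1),Y) == fandn(X,Y)
//   }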
55242
55243/// Do target-specific dag combines on X86ISD::FAND nodes.
55245 const X86Subtarget &Subtarget) {
55246 // FAND(0.0, x) -> 0.0
55247 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
55248 return V;
55249
55250 // FAND(x, 0.0) -> 0.0
55251 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55252 return V;
55253
55254 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
55255 return V;
55256
55257 return lowerX86FPLogicOp(N, DAG, Subtarget);
55258}
55259
55260/// Do target-specific dag combines on X86ISD::FANDN nodes.
55262 const X86Subtarget &Subtarget) {
55263 // FANDN(0.0, x) -> x
55264 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55265 return N->getOperand(1);
55266
55267 // FANDN(x, 0.0) -> 0.0
55268 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55269 return V;
55270
55271 return lowerX86FPLogicOp(N, DAG, Subtarget);
55272}
55273
55274/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
55277 const X86Subtarget &Subtarget) {
55278 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
55279
55280 // F[X]OR(0.0, x) -> x
55281 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55282 return N->getOperand(1);
55283
55284 // F[X]OR(x, 0.0) -> x
55285 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
55286 return N->getOperand(0);
55287
55288 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
55289 return NewVal;
55290
55291 return lowerX86FPLogicOp(N, DAG, Subtarget);
55292}
55293
55294/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
55296 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
55297
55298 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
55299 if (!DAG.getTarget().Options.NoNaNsFPMath ||
55300 !DAG.getTarget().Options.NoSignedZerosFPMath)
55301 return SDValue();
55302
55303 // If we run in unsafe-math mode, then convert the FMIN and FMAX nodes
55304 // into FMINC and FMAXC, which are commutative operations.
55305 unsigned NewOp = 0;
55306 switch (N->getOpcode()) {
55307 default: llvm_unreachable("unknown opcode");
55308 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
55309 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
55310 }
55311
55312 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
55313 N->getOperand(0), N->getOperand(1));
55314}
55315
55317 const X86Subtarget &Subtarget) {
55318 EVT VT = N->getValueType(0);
55319 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
55320 return SDValue();
55321
55322 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55323
55324 auto IsMinMaxLegal = [&](EVT VT) {
55325 if (!TLI.isTypeLegal(VT))
55326 return false;
55327 return VT.getScalarType() != MVT::f16 ||
55328 (Subtarget.hasFP16() && (VT == MVT::v32f16 || Subtarget.hasVLX()));
55329 };
55330
55331 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
55332 (Subtarget.hasSSE2() && VT == MVT::f64) ||
55333 (Subtarget.hasFP16() && VT == MVT::f16) ||
55334 (VT.isVector() && IsMinMaxLegal(VT))))
55335 return SDValue();
55336
55337 SDValue Op0 = N->getOperand(0);
55338 SDValue Op1 = N->getOperand(1);
55339 SDLoc DL(N);
55340 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
55341
55342 // If we don't have to respect NaN inputs, this is a direct translation to x86
55343 // min/max instructions.
55344 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
55345 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55346
55347 // If one of the operands is known non-NaN use the native min/max instructions
55348 // with the non-NaN input as second operand.
55349 if (DAG.isKnownNeverNaN(Op1))
55350 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55351 if (DAG.isKnownNeverNaN(Op0))
55352 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
55353
55354 // If we have to respect NaN inputs, this takes at least 3 instructions.
55355 // Favor a library call when operating on a scalar and minimizing code size.
55356 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
55357 return SDValue();
55358
55359 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
55360 VT);
55361
55362 // There are 4 possibilities involving NaN inputs, and these are the required
55363 // outputs:
55364 // Op1
55365 // Num NaN
55366 // ----------------
55367 // Num | Max | Op0 |
55368 // Op0 ----------------
55369 // NaN | Op1 | NaN |
55370 // ----------------
55371 //
55372 // The SSE FP max/min instructions were not designed for this case, but rather
55373 // to implement:
55374 // Min = Op1 < Op0 ? Op1 : Op0
55375 // Max = Op1 > Op0 ? Op1 : Op0
55376 //
55377 // So they always return Op0 if either input is a NaN. However, we can still
55378 // use those instructions for fmaxnum by selecting away a NaN input.
55379
55380 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
55381 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
55382 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
55383
55384 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
55385 // are NaN, the NaN value of Op1 is the result.
55386 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
55387}
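// Roughly the shape this lowers to for a scalar fmaxnum when NaNs must be
// respected (a sketch; the exact instructions and operand order depend on the
// subtarget and register allocation):
//   max = MAXSS(Op1, Op0)       // SSE max: returns Op0 if either input is NaN
//   nan = CMPUNORDSS(Op0, Op0)  // all-ones mask iff Op0 is NaN
//   res = nan ? Op1 : max       // BLENDV, or ANDPS/ANDNPS/ORPS pre-SSE4.1
// matching the select built from IsOp0Nan above.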
55388
55391 EVT VT = N->getValueType(0);
55392 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55393
55394 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
55395 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
55396 return SDValue(N, 0);
55397
55398 // Convert a full vector load into vzload when not all bits are needed.
55399 SDValue In = N->getOperand(0);
55400 MVT InVT = In.getSimpleValueType();
55401 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55402 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55403 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55404 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
55405 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55406 MVT MemVT = MVT::getIntegerVT(NumBits);
55407 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55408 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55409 SDLoc dl(N);
55410 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
55411 DAG.getBitcast(InVT, VZLoad));
55412 DCI.CombineTo(N, Convert);
55413 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55414 DCI.recursivelyDeleteUnusedNodes(LN);
55415 return SDValue(N, 0);
55416 }
55417 }
55418
55419 return SDValue();
55420}
55421
55425 bool IsStrict = TSI.isTargetStrictFPOpcode(N->getOpcode());
55426 EVT VT = N->getValueType(0);
55427
55428 // Convert a full vector load into vzload when not all bits are needed.
55429 SDValue In = N->getOperand(IsStrict ? 1 : 0);
55430 MVT InVT = In.getSimpleValueType();
55431 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55432 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55433 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55434 LoadSDNode *LN = cast<LoadSDNode>(In);
55435 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55436 MVT MemVT = MVT::getFloatingPointVT(NumBits);
55437 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55438 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55439 SDLoc dl(N);
55440 if (IsStrict) {
55441 SDValue Convert =
55442 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
55443 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
55444 DCI.CombineTo(N, Convert, Convert.getValue(1));
55445 } else {
55446 SDValue Convert =
55447 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
55448 DCI.CombineTo(N, Convert);
55449 }
55450 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55451 DCI.recursivelyDeleteUnusedNodes(LN);
55452 return SDValue(N, 0);
55453 }
55454 }
55455
55456 return SDValue();
55457}
55458
55459/// Do target-specific dag combines on X86ISD::ANDNP nodes.
55462 const X86Subtarget &Subtarget) {
55463 SDValue N0 = N->getOperand(0);
55464 SDValue N1 = N->getOperand(1);
55465 MVT VT = N->getSimpleValueType(0);
55466 int NumElts = VT.getVectorNumElements();
55467 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55468 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55469 SDLoc DL(N);
55470
55471 // ANDNP(undef, x) -> 0
55472 // ANDNP(x, undef) -> 0
55473 if (N0.isUndef() || N1.isUndef())
55474 return DAG.getConstant(0, DL, VT);
55475
55476 // ANDNP(0, x) -> x
55477 if (ISD::isBuildVectorAllZeros(N0.getNode()))
55478 return N1;
55479
55480 // ANDNP(x, 0) -> 0
55481 if (ISD::isBuildVectorAllZeros(N1.getNode()))
55482 return DAG.getConstant(0, DL, VT);
55483
55484 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
55485 if (ISD::isBuildVectorAllOnes(N1.getNode()))
55486 return DAG.getNOT(DL, N0, VT);
55487
55488 // Turn ANDNP back to AND if input is inverted.
55489 if (SDValue Not = IsNOT(N0, DAG))
55490 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
55491
55492 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask.
55493 // to make use of predicated selects.
55494 // ANDN(SEXT(SETCC()),X) -> SELECT(NOT(SETCC()),X,0)
55495 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::SIGN_EXTEND) {
55496 SDValue Src = N0.getOperand(0);
55497 EVT SrcVT = Src.getValueType();
55498 if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
55499 (VT.is512BitVector() || Subtarget.hasVLX()) &&
55500 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
55501 TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
55502 return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
55503 getZeroVector(VT, Subtarget, DAG, DL));
55504 }
55505
55506 // Constant Folding
55507 APInt Undefs0, Undefs1;
55508 SmallVector<APInt> EltBits0, EltBits1;
55509 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
55510 /*AllowWholeUndefs*/ true,
55511 /*AllowPartialUndefs*/ true)) {
55512 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
55513 /*AllowWholeUndefs*/ true,
55514 /*AllowPartialUndefs*/ true)) {
55515 SmallVector<APInt> ResultBits;
55516 for (int I = 0; I != NumElts; ++I)
55517 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
55518 return getConstVector(ResultBits, VT, DAG, DL);
55519 }
55520
55521 // Constant fold NOT(N0) to allow us to use AND.
55522 // Ensure this is only performed if we can confirm that the bitcasted source
55523 // has oneuse to prevent an infinite loop with canonicalizeBitSelect.
55524 if (N0->hasOneUse()) {
55525 SDValue BC0 = peekThroughOneUseBitcasts(N0);
55526 if (BC0.getOpcode() != ISD::BITCAST) {
55527 for (APInt &Elt : EltBits0)
55528 Elt = ~Elt;
55529 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
55530 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
55531 }
55532 }
55533 }
55534
55535 // Attempt to recursively combine a bitmask ANDNP with shuffles.
55536 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
55537 SDValue Op(N, 0);
55538 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
55539 return Res;
55540
55541 // If either operand is a constant mask, then only the elements that aren't
55542 // zero are actually demanded by the other operand.
55543 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
55544 APInt UndefElts;
55545 SmallVector<APInt> EltBits;
55546 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
55547 APInt DemandedElts = APInt::getAllOnes(NumElts);
55548 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
55549 EltBits)) {
55550 DemandedBits.clearAllBits();
55551 DemandedElts.clearAllBits();
55552 for (int I = 0; I != NumElts; ++I) {
55553 if (UndefElts[I]) {
55554 // We can't assume an undef src element gives an undef dst - the
55555 // other src might be zero.
55556 DemandedBits.setAllBits();
55557 DemandedElts.setBit(I);
55558 } else if ((Invert && !EltBits[I].isAllOnes()) ||
55559 (!Invert && !EltBits[I].isZero())) {
55560 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
55561 DemandedElts.setBit(I);
55562 }
55563 }
55564 }
55565 return std::make_pair(DemandedBits, DemandedElts);
55566 };
55567 APInt Bits0, Elts0;
55568 APInt Bits1, Elts1;
55569 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
55570 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
55571
55572 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
55573 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
55574 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
55575 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
55576 if (N->getOpcode() != ISD::DELETED_NODE)
55577 DCI.AddToWorklist(N);
55578 return SDValue(N, 0);
55579 }
55580 }
55581
55582 // Folds for better commutativity:
55583 if (N1->hasOneUse()) {
55584 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
55585 if (SDValue Not = IsNOT(N1, DAG))
55586 return DAG.getNOT(
55587 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
55588
55589 // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
55590 // Zero out elements by setting the PSHUFB mask value to 0xFF.
55591 if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
55592 SDValue BC1 = peekThroughOneUseBitcasts(N1);
55593 if (BC1.getOpcode() == X86ISD::PSHUFB) {
55594 EVT ShufVT = BC1.getValueType();
55595 SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
55596 DAG.getBitcast(ShufVT, N0));
55597 SDValue NewShuf =
55598 DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
55599 return DAG.getBitcast(VT, NewShuf);
55600 }
55601 }
55602 }
55603
55604 return SDValue();
55605}
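// X86ISD::ANDNP computes (~LHS & RHS) per element, which is what the constant
// folding loop above implements bit-by-bit. A tiny standalone sanity check of
// that semantic on made-up 32-bit lanes (illustrative sketch only):
//
//   #include <cassert>
//   #include <cstdint>
//   int main() {
//     uint32_t A = 0x0f0f0f0fu, B = 0x12345678u;
//     assert(((~A) & B) == 0x10305070u); // ResultBits[I] = ~EltBits0[I] & EltBits1[I]
//   }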
55606
55609 SDValue N1 = N->getOperand(1);
55610
55611 // BT ignores high bits in the bit index operand.
55612 unsigned BitWidth = N1.getValueSizeInBits();
55613 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
55614 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
55615 if (N->getOpcode() != ISD::DELETED_NODE)
55616 DCI.AddToWorklist(N);
55617 return SDValue(N, 0);
55618 }
55619
55620 return SDValue();
55621}
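// Because BT interprets the bit index modulo the operand width, only the low
// Log2(BitWidth) bits of N1 are demanded here: e.g. for a 32-bit BT only bits
// [4:0] of the index matter, so SimplifyDemandedBits can strip an explicit
// (and idx, 31) or similar masking left over from the middle end. (A sketch of
// the effect; the exact simplifications depend on the index expression.)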
55622
55625 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
55626 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
55627
55628 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
55629 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55630 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
55631 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
55632 if (N->getOpcode() != ISD::DELETED_NODE)
55633 DCI.AddToWorklist(N);
55634 return SDValue(N, 0);
55635 }
55636
55637 // Convert a full vector load into vzload when not all bits are needed.
55638 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
55639 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
55640 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
55641 SDLoc dl(N);
55642 if (IsStrict) {
55643 SDValue Convert = DAG.getNode(
55644 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
55645 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
55646 DCI.CombineTo(N, Convert, Convert.getValue(1));
55647 } else {
55648 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
55649 DAG.getBitcast(MVT::v8i16, VZLoad));
55650 DCI.CombineTo(N, Convert);
55651 }
55652
55653 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55654 DCI.recursivelyDeleteUnusedNodes(LN);
55655 return SDValue(N, 0);
55656 }
55657 }
55658 }
55659
55660 return SDValue();
55661}
55662
55663// Try to combine sext_in_reg of a cmov of constants by extending the constants.
55664static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
55665 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55666
55667 EVT DstVT = N->getValueType(0);
55668
55669 SDValue N0 = N->getOperand(0);
55670 SDValue N1 = N->getOperand(1);
55671 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55672
55673 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
55674 return SDValue();
55675
55676 // Look through single use any_extends / truncs.
55677 SDValue IntermediateBitwidthOp;
55678 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
55679 N0.hasOneUse()) {
55680 IntermediateBitwidthOp = N0;
55681 N0 = N0.getOperand(0);
55682 }
55683
55684 // See if we have a single use cmov.
55685 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
55686 return SDValue();
55687
55688 SDValue CMovOp0 = N0.getOperand(0);
55689 SDValue CMovOp1 = N0.getOperand(1);
55690
55691 // Make sure both operands are constants.
55692 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55693 !isa<ConstantSDNode>(CMovOp1.getNode()))
55694 return SDValue();
55695
55696 SDLoc DL(N);
55697
55698 // If we looked through an any_extend/trunc above, apply that same op to the constants.
55699 if (IntermediateBitwidthOp) {
55700 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
55701 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
55702 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
55703 }
55704
55705 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
55706 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
55707
55708 EVT CMovVT = DstVT;
55709 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
55710 if (DstVT == MVT::i16) {
55711 CMovVT = MVT::i32;
55712 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
55713 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
55714 }
55715
55716 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
55717 N0.getOperand(2), N0.getOperand(3));
55718
55719 if (CMovVT != DstVT)
55720 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
55721
55722 return CMov;
55723}
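// A worked instance of the combine above (made-up constants, DstVT == i32):
//   sext_in_reg(cmov(0x00000080, 0x0000007f, cc, flags), i8)
// becomes
//   cmov(0xffffff80, 0x0000007f, cc, flags)
// i.e. the in-register sign extension is folded into the constant operands,
// which are free to extend; the i16 case additionally round-trips through i32
// as shown above. (Sketch only; actual constants come from the matched CMOV.)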
55724
55726 const X86Subtarget &Subtarget) {
55727 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55728
55729 if (SDValue V = combineSextInRegCmov(N, DAG))
55730 return V;
55731
55732 EVT VT = N->getValueType(0);
55733 SDValue N0 = N->getOperand(0);
55734 SDValue N1 = N->getOperand(1);
55735 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55736 SDLoc dl(N);
55737
55738 // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
55739 // AVX2 since there is no sign-extended shift right operation on a vector
55740 // with 64-bit elements.
55741 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
55742 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
55743 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
55744 N0.getOpcode() == ISD::SIGN_EXTEND)) {
55745 SDValue N00 = N0.getOperand(0);
55746
55747 // EXTLOAD has a better solution on AVX2,
55748 // it may be replaced with X86ISD::VSEXT node.
55749 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
55750 if (!ISD::isNormalLoad(N00.getNode()))
55751 return SDValue();
55752
55753 // Attempt to promote any comparison mask ops before moving the
55754 // SIGN_EXTEND_INREG in the way.
55755 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
55756 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
55757
55758 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
55759 SDValue Tmp =
55760 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
55761 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
55762 }
55763 }
55764 return SDValue();
55765}
55766
55767/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
55768/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
55769/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
55770/// opportunities to combine math ops, use an LEA, or use a complex addressing
55771/// mode. This can eliminate extend, add, and shift instructions.
55772static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
55773 const X86Subtarget &Subtarget) {
55774 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
55775 Ext->getOpcode() != ISD::ZERO_EXTEND)
55776 return SDValue();
55777
55778 // TODO: This should be valid for other integer types.
55779 EVT VT = Ext->getValueType(0);
55780 if (VT != MVT::i64)
55781 return SDValue();
55782
55783 SDValue Add = Ext->getOperand(0);
55784 if (Add.getOpcode() != ISD::ADD)
55785 return SDValue();
55786
55787 SDValue AddOp0 = Add.getOperand(0);
55788 SDValue AddOp1 = Add.getOperand(1);
55789 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
55790 bool NSW = Add->getFlags().hasNoSignedWrap();
55791 bool NUW = Add->getFlags().hasNoUnsignedWrap();
55792 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
55793 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
55794
55795 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
55796 // into the 'zext'
55797 if ((Sext && !NSW) || (!Sext && !NUW))
55798 return SDValue();
55799
55800 // Having a constant operand to the 'add' ensures that we are not increasing
55801 // the instruction count because the constant is extended for free below.
55802 // A constant operand can also become the displacement field of an LEA.
55803 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
55804 if (!AddOp1C)
55805 return SDValue();
55806
55807 // Don't make the 'add' bigger if there's no hope of combining it with some
55808 // other 'add' or 'shl' instruction.
55809 // TODO: It may be profitable to generate simpler LEA instructions in place
55810 // of single 'add' instructions, but the cost model for selecting an LEA
55811 // currently has a high threshold.
55812 bool HasLEAPotential = false;
55813 for (auto *User : Ext->users()) {
55814 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
55815 HasLEAPotential = true;
55816 break;
55817 }
55818 }
55819 if (!HasLEAPotential)
55820 return SDValue();
55821
55822 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
55823 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
55824 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
55825 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
55826
55827 // The wider add is guaranteed to not wrap because both operands are
55828 // sign-extended.
55829 SDNodeFlags Flags;
55830 Flags.setNoSignedWrap(NSW);
55831 Flags.setNoUnsignedWrap(NUW);
55832 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
55833}
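// A sketch of the kind of source pattern this helps (hypothetical function,
// not taken from a test):
//
//   // int64_t f(int64_t *p, int32_t i) { return p[i + 4]; }
//
// Promoting the sext past the nsw add turns sext(add nsw (i, 4)) into
// add(sext(i), 4), so the +4 can become the displacement of the load's
// addressing mode (or of an LEA) instead of a separate 32-bit add feeding the
// extension.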
55834
55835// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
55836// operands and the result of CMOV is not used anywhere else - promote CMOV
55837// itself instead of promoting its result. This could be beneficial, because:
55838// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
55839// (or more) pseudo-CMOVs only when they go one-after-another and
55840// getting rid of result extension code after CMOV will help that.
55841// 2) Promotion of constant CMOV arguments is free, hence the
55842// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
55843// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
55844// promotion is also good in terms of code-size.
55845// (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
55846// promotion).
55847static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
55848 SDValue CMovN = Extend->getOperand(0);
55849 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
55850 return SDValue();
55851
55852 EVT TargetVT = Extend->getValueType(0);
55853 unsigned ExtendOpcode = Extend->getOpcode();
55854 SDLoc DL(Extend);
55855
55856 EVT VT = CMovN.getValueType();
55857 SDValue CMovOp0 = CMovN.getOperand(0);
55858 SDValue CMovOp1 = CMovN.getOperand(1);
55859
55860 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55861 !isa<ConstantSDNode>(CMovOp1.getNode()))
55862 return SDValue();
55863
55864 // Only extend to i32 or i64.
55865 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
55866 return SDValue();
55867
55868 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
55869 // are free.
55870 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
55871 return SDValue();
55872
55873 // If this a zero extend to i64, we should only extend to i32 and use a free
55874 // zero extend to finish.
55875 EVT ExtendVT = TargetVT;
55876 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
55877 ExtendVT = MVT::i32;
55878
55879 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
55880 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
55881
55882 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
55883 CMovN.getOperand(2), CMovN.getOperand(3));
55884
55885 // Finish extending if needed.
55886 if (ExtendVT != TargetVT)
55887 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
55888
55889 return Res;
55890}
55891
55892// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
55893// result type.
55894static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
55895 const X86Subtarget &Subtarget) {
55896 SDValue N0 = N->getOperand(0);
55897 EVT VT = N->getValueType(0);
55898 SDLoc dl(N);
55899
55900 // Only do this combine with AVX512 for vector extends.
55901 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
55902 return SDValue();
55903
55904 // Only combine legal element types.
55905 EVT SVT = VT.getVectorElementType();
55906 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
55907 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
55908 return SDValue();
55909
55910 // We don't have a CMPP instruction for vXf16.
55911 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
55912 return SDValue();
55913 // We can only do this if the vector size is 256 bits or less.
55914 unsigned Size = VT.getSizeInBits();
55915 if (Size > 256 && Subtarget.useAVX512Regs())
55916 return SDValue();
55917
55918 EVT N00VT = N0.getOperand(0).getValueType();
55919
55920 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
55921 // those are the only integer compares we have.
55922 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
55923 if (N00VT.isInteger() && ISD::isUnsignedIntSetCC(CC))
55924 return SDValue();
55925
55926 // Only do this combine if the extension will be fully consumed by the setcc.
55927 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
55928 if (Size != MatchingVecType.getSizeInBits())
55929 return SDValue();
55930
55931 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
55932
55933 if (N->getOpcode() == ISD::ZERO_EXTEND)
55934 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
55935
55936 return Res;
55937}
55938
55941 const X86Subtarget &Subtarget) {
55942 SDValue N0 = N->getOperand(0);
55943 EVT VT = N->getValueType(0);
55944 SDLoc DL(N);
55945
55946 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55947 if (!DCI.isBeforeLegalizeOps() &&
55948 N0.getOpcode() == X86ISD::SETCC_CARRY) {
55949 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
55950 N0->getOperand(1));
55951 bool ReplaceOtherUses = !N0.hasOneUse();
55952 DCI.CombineTo(N, Setcc);
55953 // Replace other uses with a truncate of the widened setcc_carry.
55954 if (ReplaceOtherUses) {
55955 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
55956 N0.getValueType(), Setcc);
55957 DCI.CombineTo(N0.getNode(), Trunc);
55958 }
55959
55960 return SDValue(N, 0);
55961 }
55962
55963 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
55964 return NewCMov;
55965
55966 if (!DCI.isBeforeLegalizeOps())
55967 return SDValue();
55968
55969 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
55970 return V;
55971
55972 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
55973 DAG, DCI, Subtarget))
55974 return V;
55975
55976 if (VT.isVector()) {
55977 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
55978 return R;
55979
55980 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
55981 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
55982 }
55983
55984 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
55985 return NewAdd;
55986
55987 return SDValue();
55988}
55989
55990// Inverting a constant vector is profitable if it can be eliminated and the
55991// inverted vector is already present in DAG. Otherwise, it will be loaded
55992// anyway.
55993//
55994// We determine which of the values can be completely eliminated and invert it.
55995// If both are eliminable, select a vector with the first negative element.
55996static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG) {
55997 assert(ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()) &&
55998 "ConstantFP build vector expected");
55999 // Check if we can eliminate V. We assume that if a value is only used in
56000 // FMAs, we can eliminate it, since this function is invoked for each FMA
56001 // with this vector.
56002 auto IsNotFMA = [](SDNode *User) {
56003 return User->getOpcode() != ISD::FMA &&
56004 User->getOpcode() != ISD::STRICT_FMA;
56005 };
56006 if (llvm::any_of(V->users(), IsNotFMA))
56007 return SDValue();
56008
56010 EVT VT = V.getValueType();
56011 EVT EltVT = VT.getVectorElementType();
56012 for (const SDValue &Op : V->op_values()) {
56013 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
56014 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
56015 } else {
56016 assert(Op.isUndef());
56017 Ops.push_back(DAG.getUNDEF(EltVT));
56018 }
56019 }
56020
56022 if (!NV)
56023 return SDValue();
56024
56025 // If an inverted version cannot be eliminated, choose it instead of the
56026 // original version.
56027 if (llvm::any_of(NV->users(), IsNotFMA))
56028 return SDValue(NV, 0);
56029
56030 // If the inverted version also can be eliminated, we have to consistently
56031 // prefer one of the values. We prefer the constant whose first non-undef
56032 // element is negative.
56033 // N.B. We need to skip undefs that may precede a value.
56034 for (const SDValue &Op : V->op_values()) {
56035 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
56036 if (Cst->isNegative())
56037 return SDValue();
56038 break;
56039 }
56040 }
56041 return SDValue(NV, 0);
56042}
56043
56046 const X86Subtarget &Subtarget) {
56047 SDLoc dl(N);
56048 EVT VT = N->getValueType(0);
56050 bool IsStrict = N->isTargetOpcode()
56051 ? TSI.isTargetStrictFPOpcode(N->getOpcode())
56052 : N->isStrictFPOpcode();
56053
56054 // Let legalize expand this if it isn't a legal type yet.
56055 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56056 if (!TLI.isTypeLegal(VT))
56057 return SDValue();
56058
56059 SDValue A = N->getOperand(IsStrict ? 1 : 0);
56060 SDValue B = N->getOperand(IsStrict ? 2 : 1);
56061 SDValue C = N->getOperand(IsStrict ? 3 : 2);
56062
56063 // If the operation allows fast-math and the target does not support FMA,
56064 // split this into mul+add to avoid libcall(s).
56065 SDNodeFlags Flags = N->getFlags();
56066 if (!IsStrict && Flags.hasAllowReassociation() &&
56067 TLI.isOperationExpand(ISD::FMA, VT)) {
56068 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
56069 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
56070 }
56071
56072 EVT ScalarVT = VT.getScalarType();
56073 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
56074 !Subtarget.hasAnyFMA()) &&
56075 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()) &&
56076 !(ScalarVT == MVT::bf16 && Subtarget.hasAVX10_2()))
56077 return SDValue();
56078
56079 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
56080 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56081 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56082 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
56083 CodeSize)) {
56084 V = NegV;
56085 return true;
56086 }
56087 // Look through extract_vector_elts. If it comes from an FNEG, create a
56088 // new extract from the FNEG input.
56089 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56090 isNullConstant(V.getOperand(1))) {
56091 SDValue Vec = V.getOperand(0);
56092 if (SDValue NegV = TLI.getCheaperNegatedExpression(
56093 Vec, DAG, LegalOperations, CodeSize)) {
56094 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
56095 NegV, V.getOperand(1));
56096 return true;
56097 }
56098 }
56099 // Lookup if there is an inverted version of constant vector V in DAG.
56100 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
56101 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
56102 V = NegV;
56103 return true;
56104 }
56105 }
56106 return false;
56107 };
56108
56109 // Do not convert the passthru input of scalar intrinsics.
56110 // FIXME: We could allow negations of the lower element only.
56111 bool NegA = invertIfNegative(A);
56112 // Create a dummy use for A so that in the process of negating B or C
56113 // recursively, it is not deleted.
56114 HandleSDNode NegAHandle(A);
56115 bool NegB = invertIfNegative(B);
56116 // Similar to A, get a handle on B.
56117 HandleSDNode NegBHandle(B);
56118 bool NegC = invertIfNegative(C);
56119
56120 if (!NegA && !NegB && !NegC)
56121 return SDValue();
56122
56123 unsigned NewOpcode =
56124 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
56125
56126 // Propagate fast-math-flags to new FMA node.
56127 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
56128 if (IsStrict) {
56129 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
56130 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
56131 {N->getOperand(0), A, B, C});
56132 } else {
56133 if (N->getNumOperands() == 4)
56134 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
56135 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
56136 }
56137}
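// The negation folding above effectively selects among the FMA flavours, e.g.
// (a sketch of the intent, with negateFMAOpcode keyed on NegA != NegB and NegC):
//   fma(fneg(a), b, c)       -> fnmadd(a, b, c)   // -(a*b) + c
//   fma(a, b, fneg(c))       -> fmsub(a, b, c)    //  (a*b) - c
//   fma(fneg(a), b, fneg(c)) -> fnmsub(a, b, c)   // -(a*b) - c
// so the explicit FNEG nodes (or negated constant vectors) disappear into the
// opcode.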
56138
56139// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
56140// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
56143 SDLoc dl(N);
56144 EVT VT = N->getValueType(0);
56145 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56146 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56147 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56148
56149 SDValue N2 = N->getOperand(2);
56150
56151 SDValue NegN2 =
56152 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
56153 if (!NegN2)
56154 return SDValue();
56155 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
56156
56157 if (N->getNumOperands() == 4)
56158 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56159 NegN2, N->getOperand(3));
56160 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56161 NegN2);
56162}
56163
56164// Try to widen the build vector and bitcast it to the type of zext.
56165// This is a special case for the 128-bit vector types. The intention is to
56166// remove the zext and replace it with a bitcast to the wider type. While
56167// lowering, the bitcast is removed and the extra work due to the zext is
56168// avoided. For example:
56169// (v4i16 zext (v4i8 build_vector (x, y, z, w))) ->
56170// (v4i16 bitcast (v8i8 build_vector (x, 0, y, 0, z, 0, w, 0)))
56171static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
56172
56173 if (Extend->getOpcode() != ISD::ZERO_EXTEND)
56174 return SDValue();
56175
56176 EVT ExtendVT = Extend->getValueType(0);
56177
56178 SDValue BV = Extend->getOperand(0);
56179 if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
56180 return SDValue();
56181
56182 if (any_of(BV->op_values(), [](SDValue Op) { return Op.isUndef(); })) {
56183 // If the build vector has undef elements, we cannot widen it.
56184 // The widening would create a vector with more undef elements, which
56185 // is not valid.
56186 return SDValue();
56187 }
56188
56189 if (!all_of(BV->op_values(),
56190 [](SDValue Op) { return Op.getOpcode() == ISD::LOAD; })) {
56191 // If the build vector has any element other than ISD::LOAD, we cannot
56192 // widen it.
56193 return SDValue();
56194 }
56195
56196 SDLoc dl(BV);
56197 EVT VT = BV.getValueType();
56198 EVT EltVT = BV.getOperand(0).getValueType();
56199 unsigned NumElts = VT.getVectorNumElements();
56200
56201 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56202
56203 if (TLI.getTypeAction(*DAG.getContext(), VT) !=
56204 TargetLowering::TypeWidenVector)
56205 return SDValue();
56206
56207 EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
56208 unsigned WidenNumElts = WidenVT.getVectorNumElements();
56209
56210 SmallVector<SDValue, 16> NewOps(BV->op_begin(), BV->op_end());
56211 assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
56212 // Fill the new elements with Zero.
56213 NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
56214 // Compute the step to place the elements in the right place and control the
56215 // iteration.
56216 unsigned step = WidenNumElts / NumElts;
56217 if (WidenVT.is128BitVector()) {
56218 if (step > 1 && Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
56219 for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
56220 i--, j -= step) {
56221 SDValue temp = NewOps[i];
56222 NewOps[i] = NewOps[j];
56223 NewOps[j] = temp;
56224 }
56225 // Create new build vector with WidenVT and NewOps
56226 SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
56227 // Replace the old build vector with the new one. Bitcast the
56228 // new build vector to the type of the zext.
56229 SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
56230 DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
56231 return NewBV;
56232 }
56233 }
56234 return SDValue();
56235}
56236
56239 const X86Subtarget &Subtarget) {
56240 SDLoc dl(N);
56241 SDValue N0 = N->getOperand(0);
56242 EVT VT = N->getValueType(0);
56243
56244 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
56245 // FIXME: Is this needed? We don't seem to have any tests for it.
56246 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
56247 N0.getOpcode() == X86ISD::SETCC_CARRY) {
56248 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
56249 N0->getOperand(1));
56250 bool ReplaceOtherUses = !N0.hasOneUse();
56251 DCI.CombineTo(N, Setcc);
56252 // Replace other uses with a truncate of the widened setcc_carry.
56253 if (ReplaceOtherUses) {
56254 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
56255 N0.getValueType(), Setcc);
56256 DCI.CombineTo(N0.getNode(), Trunc);
56257 }
56258
56259 return SDValue(N, 0);
56260 }
56261
56262 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
56263 return NewCMov;
56264
56265 if (DCI.isBeforeLegalizeOps())
56266 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
56267 return V;
56268
56269 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
56270 DAG, DCI, Subtarget))
56271 return V;
56272
56273 if (VT.isVector())
56274 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
56275 return R;
56276
56277 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
56278 return NewAdd;
56279
56280 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
56281 return R;
56282
56283 // TODO: Combine with any target/faux shuffle.
56284 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
56286 SDValue N00 = N0.getOperand(0);
56287 SDValue N01 = N0.getOperand(1);
56288 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
56289 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
56290 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
56291 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
56292 return concatSubVectors(N00, N01, DAG, dl);
56293 }
56294 }
56295
56296 if (SDValue V = widenBuildVec(N, DAG))
56297 return V;
56298
56299 return SDValue();
56300}
56301
56302/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
56303/// pre-promote its result type since vXi1 vectors don't get promoted
56304/// during type legalization.
56305static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
56306 SDValue RHS, ISD::CondCode CC,
56307 const SDLoc &DL, SelectionDAG &DAG,
56308 const X86Subtarget &Subtarget) {
56309 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
56310 VT.getVectorElementType() == MVT::i1 &&
56311 (OpVT.getVectorElementType() == MVT::i8 ||
56312 OpVT.getVectorElementType() == MVT::i16)) {
56313 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
56314 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
56315 }
56316 return SDValue();
56317}
56318
56319// The pattern (setcc (and (broadcast x), (2^n, 2^{n+1}, ...)), (0, 0, ...),
56320// eq/ne) is generated when using an integer as a mask. Instead of generating a
56321// broadcast + vptest, we can directly move the integer to a mask register.
56322static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
56323 const SDLoc &DL, SelectionDAG &DAG,
56324 const X86Subtarget &Subtarget) {
56325 if (CC != ISD::SETNE && CC != ISD::SETEQ)
56326 return SDValue();
56327
56328 if (!Subtarget.hasAVX512())
56329 return SDValue();
56330
56331 if (Op0.getOpcode() != ISD::AND)
56332 return SDValue();
56333
56334 SDValue Broadcast = Op0.getOperand(0);
56335 if (Broadcast.getOpcode() != X86ISD::VBROADCAST &&
56336 Broadcast.getOpcode() != X86ISD::VBROADCAST_LOAD)
56337 return SDValue();
56338
56339 SDValue Load = Op0.getOperand(1);
56340 EVT LoadVT = Load.getSimpleValueType();
56341
56342 APInt UndefElts;
56343 SmallVector<APInt, 32> EltBits;
56344 if (!getTargetConstantBitsFromNode(Load, LoadVT.getScalarSizeInBits(),
56345 UndefElts, EltBits,
56346 /*AllowWholeUndefs*/ true,
56347 /*AllowPartialUndefs*/ false) ||
56348 UndefElts[0] || !EltBits[0].isPowerOf2() || UndefElts.getBitWidth() > 16)
56349 return SDValue();
56350
56351 // Check if the constant pool contains only powers of 2 starting from some
56352 // 2^N. The table may also contain undefs because of widening of vector
56353 // operands.
56354 unsigned N = EltBits[0].logBase2();
56355 unsigned Len = UndefElts.getBitWidth();
56356 for (unsigned I = 1; I != Len; ++I) {
56357 if (UndefElts[I]) {
56358 if (!UndefElts.extractBits(Len - (I + 1), I + 1).isAllOnes())
56359 return SDValue();
56360 break;
56361 }
56362
56363 if (EltBits[I].getBitWidth() <= N + I || !EltBits[I].isOneBitSet(N + I))
56364 return SDValue();
56365 }
56366
56367 MVT BroadcastOpVT = Broadcast.getSimpleValueType().getVectorElementType();
56368 SDValue BroadcastOp;
56369 if (Broadcast.getOpcode() != X86ISD::VBROADCAST) {
56370 BroadcastOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, BroadcastOpVT,
56371 Broadcast, DAG.getVectorIdxConstant(0, DL));
56372 } else {
56373 BroadcastOp = Broadcast.getOperand(0);
56374 if (BroadcastOp.getValueType().isVector())
56375 return SDValue();
56376 }
56377
56378 SDValue Masked = BroadcastOp;
56379 if (N != 0) {
56380 unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits();
56381 unsigned NumDefinedElts = UndefElts.countTrailingZeros();
56382
56383 if (NumDefinedElts > BroadcastOpBitWidth)
56384 return SDValue();
56385
56386 APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts);
56387 SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp,
56388 DAG.getConstant(N, DL, BroadcastOpVT));
56389 Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue,
56390 DAG.getConstant(Mask, DL, BroadcastOpVT));
56391 }
56392 // We can't extract more than 16 bits using this pattern, because 2^{17} will
56393 // not fit in an i16 and a vXi32 where X > 16 is more than 512 bits.
56394 SDValue Trunc = DAG.getAnyExtOrTrunc(Masked, DL, MVT::i16);
56395 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, MVT::v16i1, Trunc);
56396
56397 if (CC == ISD::SETEQ)
56398 Bitcast = DAG.getNOT(DL, Bitcast, MVT::v16i1);
56399
56400 if (VT != MVT::v16i1)
56401 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Bitcast,
56402 DAG.getVectorIdxConstant(0, DL));
56403
56404 return Bitcast;
56405}
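// Illustrative shape of the pattern handled above (a sketch, with N == 0):
//   %b   = vbroadcast i16 %x to v8i16
//   %and = and %b, <1, 2, 4, 8, 16, 32, 64, 128>
//   %m   = setcc ne %and, zeroinitializer        ; v8i1
// Each lane tests one bit of %x, so the whole thing is just %x moved into a
// mask register: truncate %x to i16, bitcast to v16i1, and extract the low
// v8i1 (with a NOT for the SETEQ form, and a shift/mask first when the table
// starts at 2^N with N != 0).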
56406
56409 const X86Subtarget &Subtarget) {
56410 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
56411 const SDValue LHS = N->getOperand(0);
56412 const SDValue RHS = N->getOperand(1);
56413 EVT VT = N->getValueType(0);
56414 EVT OpVT = LHS.getValueType();
56415 SDLoc DL(N);
56416
56417 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56418 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
56419 Subtarget))
56420 return V;
56421 }
56422
56423 if (VT == MVT::i1) {
56424 X86::CondCode X86CC;
56425 if (SDValue V =
56426 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
56427 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
56428 }
56429
56430 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56431 if (OpVT.isScalarInteger()) {
56432 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
56433 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
56434 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
56435 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
56436 if (N0.getOperand(0) == N1)
56437 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56438 N0.getOperand(1));
56439 if (N0.getOperand(1) == N1)
56440 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56441 N0.getOperand(0));
56442 }
56443 return SDValue();
56444 };
56445 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
56446 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56447 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
56448 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56449
56450 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
56451 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
56452 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
56453 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
56454 if (N0.getOperand(0) == N1)
56455 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56456 DAG.getNOT(DL, N0.getOperand(1), OpVT));
56457 if (N0.getOperand(1) == N1)
56458 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56459 DAG.getNOT(DL, N0.getOperand(0), OpVT));
56460 }
56461 return SDValue();
56462 };
56463 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
56464 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56465 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
56466 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56467
56468 // cmpeq(trunc(x),C) --> cmpeq(x,C)
56469 // cmpne(trunc(x),C) --> cmpne(x,C)
56470 // iff x upper bits are zero.
56471 if (LHS.getOpcode() == ISD::TRUNCATE &&
56472 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
56474 EVT SrcVT = LHS.getOperand(0).getValueType();
56476 OpVT.getScalarSizeInBits());
56477 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56478 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
56479 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
56480 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
56481 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
56482 }
56483
56484 // With C as a power of 2 and C != 0 and C != INT_MIN:
56485 // icmp eq Abs(X) C ->
56486 // (icmp eq A, C) | (icmp eq A, -C)
56487 // icmp ne Abs(X) C ->
56488 // (icmp ne A, C) & (icmp ne A, -C)
56489 // Both of these patterns can be better optimized in
56490 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
56491 // integers which is checked above.
56492 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
56493 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
56494 const APInt &CInt = C->getAPIntValue();
56495 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
56496 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
56497 SDValue BaseOp = LHS.getOperand(0);
56498 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
56499 SDValue SETCC1 = DAG.getSetCC(
56500 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
56501 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
56502 SETCC0, SETCC1);
56503 }
56504 }
56505 }
56506 }
56507 }
56508
56509 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
56510 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
56511 // Using temporaries to avoid messing up operand ordering for later
56512 // transformations if this doesn't work.
56513 SDValue Op0 = LHS;
56514 SDValue Op1 = RHS;
56515 ISD::CondCode TmpCC = CC;
56516 // Put build_vector on the right.
56517 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
56518 std::swap(Op0, Op1);
56519 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
56520 }
56521
56522 bool IsSEXT0 =
56523 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
56524 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
56525 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
56526
56527 if (IsSEXT0 && IsVZero1) {
56528 assert(VT == Op0.getOperand(0).getValueType() &&
56529 "Unexpected operand type");
56530 if (TmpCC == ISD::SETGT)
56531 return DAG.getConstant(0, DL, VT);
56532 if (TmpCC == ISD::SETLE)
56533 return DAG.getConstant(1, DL, VT);
56534 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
56535 return DAG.getNOT(DL, Op0.getOperand(0), VT);
56536
56537 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
56538 "Unexpected condition code!");
56539 return Op0.getOperand(0);
56540 }
56541
56542 if (IsVZero1)
56543 if (SDValue V =
56544 combineAVX512SetCCToKMOV(VT, Op0, TmpCC, DL, DAG, Subtarget))
56545 return V;
56546 }
56547
56548 // Try to make an unsigned vector comparison signed. On pre-AVX512 targets
56549 // there are only signed comparisons (`PCMPGT`), and on AVX512 it's often
56550 // better to use `PCMPGT` if the result is meant to stay in a vector (and if
56551 // it's going to a mask, there are signed AVX512 comparisons).
56552 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
56553 bool CanMakeSigned = false;
56554 if (ISD::isUnsignedIntSetCC(CC)) {
56555 KnownBits CmpKnown =
56556 DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
56557 // If we know LHS/RHS share the same sign bit at each element we can
56558 // make this signed.
56559 // NOTE: `computeKnownBits` on a vector type aggregates common bits
56560 // across all lanes. So a pattern where the sign varies from lane to
56561 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
56562 // missed. We could get around this by demanding each lane
56563 // independently, but this isn't the most important optimization and
56564 // that may eat into compile time.
56565 CanMakeSigned =
56566 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
56567 }
56568 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
56569 SDValue LHSOut = LHS;
56570 SDValue RHSOut = RHS;
56571 ISD::CondCode NewCC = CC;
56572 switch (CC) {
56573 case ISD::SETGE:
56574 case ISD::SETUGE:
56575 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
56576 /*NSW*/ true))
56577 LHSOut = NewLHS;
56578 else if (SDValue NewRHS = incDecVectorConstant(
56579 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
56580 RHSOut = NewRHS;
56581 else
56582 break;
56583
56584 [[fallthrough]];
56585 case ISD::SETUGT:
56586 NewCC = ISD::SETGT;
56587 break;
56588
56589 case ISD::SETLE:
56590 case ISD::SETULE:
56591 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
56592 /*NSW*/ true))
56593 LHSOut = NewLHS;
56594 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
56595 /*NSW*/ true))
56596 RHSOut = NewRHS;
56597 else
56598 break;
56599
56600 [[fallthrough]];
56601 case ISD::SETULT:
56602 // Will be swapped to SETGT in LowerVSETCC*.
56603 NewCC = ISD::SETLT;
56604 break;
56605 default:
56606 break;
56607 }
56608 if (NewCC != CC) {
56609 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
56610 NewCC, DL, DAG, Subtarget))
56611 return R;
56612 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
56613 }
56614 }
56615 }
56616
56617 if (SDValue R =
56618 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
56619 return R;
56620
56621 // In the middle end transforms:
56622 // `(or (icmp eq X, C), (icmp eq X, C+1))`
56623 // -> `(icmp ult (add x, -C), 2)`
56624 // Likewise inverted cases with `ugt`.
56625 //
56626 // Since x86, pre-AVX512, doesn't have unsigned vector compares, this results
56627 // in worse codegen. So, undo the middle-end transform and go back to the
56628 // `(or (icmp eq), (icmp eq))` form.
56629 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
56630 // the xmm approach.
56631 //
56632 // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
56633 // ne))` as it doesn't end up reducing the instruction count.
56634 // TODO: We might want to do this for avx512 as well if we `sext` the result.
56635 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
56636 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
56637 !Subtarget.hasAVX512() &&
56638 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
56639 Subtarget.hasAVX2()) &&
56640 LHS.hasOneUse()) {
56641
56642 APInt CmpC;
56643 SDValue AddC = LHS.getOperand(1);
56644 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
56646 // See which form we have depending on the constant/condition.
56647 SDValue C0 = SDValue();
56648 SDValue C1 = SDValue();
56649
56650 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
56651 // we will end up generating an additional constant. Keeping it in the
56652 // current form has a slight latency cost, but it is probably worth it to
56653 // save a constant.
56656 // Pass
56657 }
56658 // Normal Cases
56659 else if ((CC == ISD::SETULT && CmpC == 2) ||
56660 (CC == ISD::SETULE && CmpC == 1)) {
56661 // These will constant fold.
56662 C0 = DAG.getNegative(AddC, DL, OpVT);
56663 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
56664 DAG.getAllOnesConstant(DL, OpVT));
56665 }
56666 // Inverted Cases
56667 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
56668 (CC == ISD::SETUGE && (-CmpC) == 2)) {
56669 // These will constant fold.
56670 C0 = DAG.getNOT(DL, AddC, OpVT);
56671 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
56672 DAG.getAllOnesConstant(DL, OpVT));
56673 }
56674 if (C0 && C1) {
56675 SDValue NewLHS =
56676 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
56677 SDValue NewRHS =
56678 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
56679 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
56680 }
56681 }
56682 }
56683
56684 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
56685 // to avoid scalarization via legalization because v4i32 is not a legal type.
56686 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
56687 LHS.getValueType() == MVT::v4f32)
56688 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
56689
56690 // X pred 0.0 --> X pred -X
56691 // If the negation of X already exists, use it in the comparison. This removes
56692 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
56693 // instructions in patterns with a 'select' node.
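//
// For example (an illustrative sketch), if (fneg X) already exists in the DAG:
//   (select (setcc X, 0.0, olt), (fneg X), X)
// can become
//   (select (setcc X, (fneg X), olt), (fneg X), X)
// which can then match the SSE MIN/MAX select patterns without materializing
// a 0.0 constant.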
56695 SDVTList FNegVT = DAG.getVTList(OpVT);
56696 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
56697 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
56698 }
56699
56700 return SDValue();
56701}
56702
56705 const X86Subtarget &Subtarget) {
56706 SDValue Src = N->getOperand(0);
56707 MVT SrcVT = Src.getSimpleValueType();
56708 MVT VT = N->getSimpleValueType(0);
56709 unsigned NumBits = VT.getScalarSizeInBits();
56710 unsigned NumElts = SrcVT.getVectorNumElements();
56711 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
56712 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
56713
56714 // Perform constant folding.
56715 APInt UndefElts;
56716 SmallVector<APInt, 32> EltBits;
56717 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
56718 /*AllowWholeUndefs*/ true,
56719 /*AllowPartialUndefs*/ true)) {
56720 APInt Imm(32, 0);
56721 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
56722 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56723 Imm.setBit(Idx);
56724
56725 return DAG.getConstant(Imm, SDLoc(N), VT);
56726 }
56727
56728 // Look through int->fp bitcasts that don't change the element width.
56729 unsigned EltWidth = SrcVT.getScalarSizeInBits();
56730 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
56731 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
56732 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
56733
56734 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
56735 // with scalar comparisons.
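//
// For example (an illustrative sketch), with a v4i32 source NumElts == 4, so
//   movmsk(not(X)) == xor(movmsk(X), 0xF)
// and the XOR can then combine with later scalar comparisons of the mask.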
56736 if (SDValue NotSrc = IsNOT(Src, DAG)) {
56737 SDLoc DL(N);
56738 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56739 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
56740 return DAG.getNode(ISD::XOR, DL, VT,
56741 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
56742 DAG.getConstant(NotMask, DL, VT));
56743 }
56744
56745 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
56746 // results with scalar comparisons.
56747 if (Src.getOpcode() == X86ISD::PCMPGT &&
56748 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
56749 SDLoc DL(N);
56750 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56751 return DAG.getNode(ISD::XOR, DL, VT,
56752 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
56753 DAG.getConstant(NotMask, DL, VT));
56754 }
56755
56756 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
56757 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
56758 // iff pow2splat(c1).
56759 // Use KnownBits to determine if only a single bit is non-zero
56760 // in each element (pow2 or zero), and shift that bit to the msb.
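//
// For example (an illustrative sketch with an assumed splat constant): for
// vXi8 elements and c1 == splat(0x10), the known-bits query gives
// ShiftAmt == 3, so bit 4 is shifted up into the sign bit that MOVMSK reads
// (with the result inverted for the compare-with-zero form).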
56761 if (Src.getOpcode() == X86ISD::PCMPEQ) {
56762 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
56763 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
56764 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
56765 if (KnownLHS.countMaxPopulation() == 1 &&
56766 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
56767 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
56768 SDLoc DL(N);
56769 MVT ShiftVT = SrcVT;
56770 SDValue ShiftLHS = Src.getOperand(0);
56771 SDValue ShiftRHS = Src.getOperand(1);
56772 if (ShiftVT.getScalarType() == MVT::i8) {
56773 // vXi8 shifts - we only care about the signbit so can use PSLLW.
56774 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
56775 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
56776 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
56777 }
56778 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56779 ShiftLHS, ShiftAmt, DAG);
56780 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56781 ShiftRHS, ShiftAmt, DAG);
56782 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
56783 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
56784 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
56785 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
56786 }
56787 }
56788
56789 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
56790 if (N->isOnlyUserOf(Src.getNode())) {
56792 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
56793 APInt UndefElts;
56794 SmallVector<APInt, 32> EltBits;
56795 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
56796 UndefElts, EltBits)) {
56797 APInt Mask = APInt::getZero(NumBits);
56798 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
56799 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56800 Mask.setBit(Idx);
56801 }
56802 SDLoc DL(N);
56803 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
56804 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
56805 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
56806 DAG.getConstant(Mask, DL, VT));
56807 }
56808 }
56809 }
56810
56811 // Simplify the inputs.
56812 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56813 APInt DemandedMask(APInt::getAllOnes(NumBits));
56814 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56815 return SDValue(N, 0);
56816
56817 return SDValue();
56818}
56819
56822 const X86Subtarget &Subtarget) {
56823 MVT VT = N->getSimpleValueType(0);
56824 unsigned NumBits = VT.getScalarSizeInBits();
56825
56826 // Simplify the inputs.
56827 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56828 APInt DemandedMask(APInt::getAllOnes(NumBits));
56829 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56830 return SDValue(N, 0);
56831
56832 return SDValue();
56833}
56834
56838 SDValue Mask = MemOp->getMask();
56839
56840 // With vector masks we only demand the upper bit of the mask.
56841 if (Mask.getScalarValueSizeInBits() != 1) {
56842 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56843 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56844 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56845 if (N->getOpcode() != ISD::DELETED_NODE)
56846 DCI.AddToWorklist(N);
56847 return SDValue(N, 0);
56848 }
56849 }
56850
56851 return SDValue();
56852}
56853
56855 SDValue Index, SDValue Base, SDValue Scale,
56856 SelectionDAG &DAG) {
56857 SDLoc DL(GorS);
56858
56859 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
56860 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
56861 Gather->getMask(), Base, Index, Scale } ;
56862 return DAG.getMaskedGather(Gather->getVTList(),
56863 Gather->getMemoryVT(), DL, Ops,
56864 Gather->getMemOperand(),
56865 Gather->getIndexType(),
56866 Gather->getExtensionType());
56867 }
56868 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
56869 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
56870 Scatter->getMask(), Base, Index, Scale };
56871 return DAG.getMaskedScatter(Scatter->getVTList(),
56872 Scatter->getMemoryVT(), DL,
56873 Ops, Scatter->getMemOperand(),
56874 Scatter->getIndexType(),
56875 Scatter->isTruncatingStore());
56876}
56877
56880 SDLoc DL(N);
56881 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
56882 SDValue Index = GorS->getIndex();
56883 SDValue Base = GorS->getBasePtr();
56884 SDValue Scale = GorS->getScale();
56885 EVT IndexVT = Index.getValueType();
56886 EVT IndexSVT = IndexVT.getVectorElementType();
56887 unsigned IndexWidth = Index.getScalarValueSizeInBits();
56888 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56889 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
56890
56891 if (DCI.isBeforeLegalize()) {
56892 // Attempt to move shifted index into the address scale, allows further
56893 // index truncation below.
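    //
    // For example (an illustrative sketch), a gather addressing
    //   Base + (shl Idx, 2) * 2
    // can be rewritten as
    //   Base + (shl Idx, 1) * 4
    // halving the shift while staying within the encodable scales 1/2/4/8.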
56894 if (Index.getOpcode() == ISD::SHL && IndexSVT == PtrVT &&
56895 isa<ConstantSDNode>(Scale)) {
56896 unsigned ScaleAmt = Scale->getAsZExtVal();
56897 assert(isPowerOf2_32(ScaleAmt) && "Scale must be a power of 2");
56898 unsigned Log2ScaleAmt = Log2_32(ScaleAmt);
56899 unsigned MaskBits = IndexWidth - Log2ScaleAmt;
56900 APInt DemandedBits = APInt::getLowBitsSet(IndexWidth, MaskBits);
56901 if (TLI.SimplifyDemandedBits(Index, DemandedBits, DCI)) {
56902 if (N->getOpcode() != ISD::DELETED_NODE)
56903 DCI.AddToWorklist(N);
56904 return SDValue(N, 0);
56905 }
56906 if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
56907 if (*MinShAmt >= 1 && Log2ScaleAmt < 3 &&
56908 DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
56909 SDValue ShAmt = Index.getOperand(1);
56910 SDValue NewShAmt =
56911 DAG.getNode(ISD::SUB, DL, ShAmt.getValueType(), ShAmt,
56912 DAG.getConstant(1, DL, ShAmt.getValueType()));
56913 SDValue NewIndex = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
56914 Index.getOperand(0), NewShAmt);
56915 SDValue NewScale =
56916 DAG.getConstant(ScaleAmt * 2, DL, Scale.getValueType());
56917 return rebuildGatherScatter(GorS, NewIndex, Base, NewScale, DAG);
56918 }
56919 }
56920 }
56921
56922 // Shrink indices if they are larger than 32-bits.
56923 // Only do this before legalize types since v2i64 could become v2i32.
56924 // FIXME: We could check that the type is legal if we're after legalize
56925 // types, but then we would need to construct test cases where that happens.
56926 if (IndexWidth > 32 && DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
56927 EVT NewVT = IndexVT.changeVectorElementType(MVT::i32);
56928
56929 // FIXME: We could support more than just constant fold, but we need to
56930     // be careful with costing. A truncate that can be optimized out would be
56931 // fine. Otherwise we might only want to create a truncate if it avoids
56932 // a split.
56933 if (SDValue TruncIndex =
56934 DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, NewVT, Index))
56935 return rebuildGatherScatter(GorS, TruncIndex, Base, Scale, DAG);
56936
56937       // Shrink any sign/zero extend from a type of 32 bits or smaller to a type
56938       // larger than 32 bits if there are sufficient sign bits. Only do this before
56939       // legalize types to avoid creating illegal types in truncate.
56940 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
56941 Index.getOpcode() == ISD::ZERO_EXTEND) &&
56942 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
56943 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56944 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56945 }
56946
56947 // Shrink if we remove an illegal type.
56948 if (!TLI.isTypeLegal(Index.getValueType()) && TLI.isTypeLegal(NewVT)) {
56949 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56950 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56951 }
56952 }
56953 }
56954
56955   // Try to move splat adders from the index operand to the base pointer
56956   // operand, taking care to multiply by the scale. We can only do this
56957   // when the index element type is the same as the pointer type.
56958   // Otherwise we need to be sure the math doesn't wrap before the scale.
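  //
  // For example (an illustrative sketch with assumed constants), with
  // Scale == 4 and Index == (add Idx, (splat 16)) the splat folds into the
  // base pointer as Base' = Base + 64 with Index' = Idx.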
56959 if (Index.getOpcode() == ISD::ADD && IndexSVT == PtrVT &&
56960 isa<ConstantSDNode>(Scale)) {
56961 uint64_t ScaleAmt = Scale->getAsZExtVal();
56962
56963 for (unsigned I = 0; I != 2; ++I)
56964 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(I))) {
56965 BitVector UndefElts;
56966 if (SDValue Splat = BV->getSplatValue(&UndefElts)) {
56967 if (UndefElts.none()) {
56968 // If the splat value is constant we can add the scaled splat value
56969 // to the existing base.
56970 if (auto *C = dyn_cast<ConstantSDNode>(Splat)) {
56971 APInt Adder = C->getAPIntValue() * ScaleAmt;
56972 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
56973 DAG.getConstant(Adder, DL, PtrVT));
56974 SDValue NewIndex = Index.getOperand(1 - I);
56975 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56976 }
56977 // For non-constant cases, limit this to non-scaled cases.
56978 if (ScaleAmt == 1) {
56979 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base, Splat);
56980 SDValue NewIndex = Index.getOperand(1 - I);
56981 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56982 }
56983 }
56984 }
56985 // It's also possible base is just a constant. In that case, just
56986 // replace it with 0 and move the displacement into the index.
56987 if (ScaleAmt == 1 && BV->isConstant() && isa<ConstantSDNode>(Base)) {
56988 SDValue Splat = DAG.getSplatBuildVector(IndexVT, DL, Base);
56989 // Combine the constant build_vector and the constant base.
56990 Splat =
56991 DAG.getNode(ISD::ADD, DL, IndexVT, Index.getOperand(I), Splat);
56992 // Add to the other half of the original Index add.
56993 SDValue NewIndex = DAG.getNode(ISD::ADD, DL, IndexVT,
56994 Index.getOperand(1 - I), Splat);
56995 SDValue NewBase = DAG.getConstant(0, DL, PtrVT);
56996 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56997 }
56998 }
56999 }
57000
57001 if (DCI.isBeforeLegalizeOps()) {
57002 // Make sure the index is either i32 or i64
57003 if (IndexWidth != 32 && IndexWidth != 64) {
57004 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
57005 IndexVT = IndexVT.changeVectorElementType(EltVT);
57006 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
57007 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
57008 }
57009 }
57010
57011 // With vector masks we only demand the upper bit of the mask.
57012 SDValue Mask = GorS->getMask();
57013 if (Mask.getScalarValueSizeInBits() != 1) {
57014 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
57015 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
57016 if (N->getOpcode() != ISD::DELETED_NODE)
57017 DCI.AddToWorklist(N);
57018 return SDValue(N, 0);
57019 }
57020 }
57021
57022 return SDValue();
57023}
57024
57025// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
57027 const X86Subtarget &Subtarget) {
57028 SDLoc DL(N);
57029 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
57030 SDValue EFLAGS = N->getOperand(1);
57031
57032 // Try to simplify the EFLAGS and condition code operands.
57033 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
57034 return getSETCC(CC, Flags, DL, DAG);
57035
57036 return SDValue();
57037}
57038
57039/// Optimize branch condition evaluation.
57041 const X86Subtarget &Subtarget) {
57042 SDLoc DL(N);
57043 SDValue EFLAGS = N->getOperand(3);
57044 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
57045
57046 // Try to simplify the EFLAGS and condition code operands.
57047 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
57048 // RAUW them under us.
57049 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
57050 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
57051 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
57052 N->getOperand(1), Cond, Flags);
57053 }
57054
57055 return SDValue();
57056}
57057
57058// TODO: Could we move this to DAGCombine?
57060 SelectionDAG &DAG) {
57061 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
57062 // to optimize away operation when it's from a constant.
57063 //
57064 // The general transformation is:
57065 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
57066 // AND(VECTOR_CMP(x,y), constant2)
57067 // constant2 = UNARYOP(constant)
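  //
  // For example (an illustrative sketch for a sint_to_fp unary op):
  //   (sint_to_fp (and (vector_cmp x, y), (splat 1)))
  // becomes
  //   (bitcast (and (vector_cmp x, y), (bitcast (splat 1.0))))
  // because each compare lane is all-zeros or all-ones, so the AND selects
  // between 0.0 and the converted constant.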
57068
57069 // Early exit if this isn't a vector operation, the operand of the
57070 // unary operation isn't a bitwise AND, or if the sizes of the operations
57071 // aren't the same.
57072 EVT VT = N->getValueType(0);
57073 bool IsStrict = N->isStrictFPOpcode();
57074 unsigned NumEltBits = VT.getScalarSizeInBits();
57075 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57076 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
57077 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
57078 VT.getSizeInBits() != Op0.getValueSizeInBits())
57079 return SDValue();
57080
57081 // Now check that the other operand of the AND is a constant. We could
57082 // make the transformation for non-constant splats as well, but it's unclear
57083 // that would be a benefit as it would not eliminate any operations, just
57084 // perform one more step in scalar code before moving to the vector unit.
57085 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
57086 // Bail out if the vector isn't a constant.
57087 if (!BV->isConstant())
57088 return SDValue();
57089
57090 // Everything checks out. Build up the new and improved node.
57091 SDLoc DL(N);
57092 EVT IntVT = BV->getValueType(0);
57093 // Create a new constant of the appropriate type for the transformed
57094 // DAG.
57095 SDValue SourceConst;
57096 if (IsStrict)
57097 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
57098 {N->getOperand(0), SDValue(BV, 0)});
57099 else
57100 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
57101 // The AND node needs bitcasts to/from an integer vector type around it.
57102 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
57103 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
57104 MaskConst);
57105 SDValue Res = DAG.getBitcast(VT, NewAnd);
57106 if (IsStrict)
57107 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
57108 return Res;
57109 }
57110
57111 return SDValue();
57112}
57113
57114/// If we are converting a value to floating-point, try to replace scalar
57115/// truncate of an extracted vector element with a bitcast. This tries to keep
57116/// the sequence on XMM registers rather than moving between vector and GPRs.
57118 // TODO: This is currently only used by combineSIntToFP, but it is generalized
57119 // to allow being called by any similar cast opcode.
57120 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
57121 SDValue Trunc = N->getOperand(0);
57122 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
57123 return SDValue();
57124
57125 SDValue ExtElt = Trunc.getOperand(0);
57126 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
57127 !isNullConstant(ExtElt.getOperand(1)))
57128 return SDValue();
57129
57130 EVT TruncVT = Trunc.getValueType();
57131 EVT SrcVT = ExtElt.getValueType();
57132 unsigned DestWidth = TruncVT.getSizeInBits();
57133 unsigned SrcWidth = SrcVT.getSizeInBits();
57134 if (SrcWidth % DestWidth != 0)
57135 return SDValue();
57136
57137 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
57138 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
57139 unsigned VecWidth = SrcVecVT.getSizeInBits();
57140 unsigned NumElts = VecWidth / DestWidth;
57141 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
57142 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
57143 SDLoc DL(N);
57144 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
57145 BitcastVec, ExtElt.getOperand(1));
57146 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
57147}
57148
57150 const X86Subtarget &Subtarget) {
57151 bool IsStrict = N->isStrictFPOpcode();
57152 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57153 EVT VT = N->getValueType(0);
57154 EVT InVT = Op0.getValueType();
57155
57156 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57157   // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
57158 // if hasFP16 support:
57159 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
57160 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
57161 // else
57162 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57163 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
57164 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57165 unsigned ScalarSize = InVT.getScalarSizeInBits();
57166 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57167 ScalarSize >= 64)
57168 return SDValue();
57169 SDLoc dl(N);
57170 EVT DstVT =
57172 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57173 : ScalarSize < 32 ? MVT::i32
57174 : MVT::i64,
57175 InVT.getVectorNumElements());
57176 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57177 if (IsStrict)
57178 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57179 {N->getOperand(0), P});
57180 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57181 }
57182
57183 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
57184 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
57185 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
57186 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57187 VT.getScalarType() != MVT::f16) {
57188 SDLoc dl(N);
57189 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57190 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57191
57192 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
57193 if (IsStrict)
57194 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57195 {N->getOperand(0), P});
57196 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57197 }
57198
57199 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
57200 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
57201 // the optimization here.
57202 SDNodeFlags Flags = N->getFlags();
57203 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
57204 if (IsStrict)
57205 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
57206 {N->getOperand(0), Op0});
57207 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
57208 }
57209
57210 return SDValue();
57211}
57212
57215 const X86Subtarget &Subtarget) {
57216 // First try to optimize away the conversion entirely when it's
57217 // conditionally from a constant. Vectors only.
57218 bool IsStrict = N->isStrictFPOpcode();
57220 return Res;
57221
57222 // Now move on to more general possibilities.
57223 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57224 EVT VT = N->getValueType(0);
57225 EVT InVT = Op0.getValueType();
57226
57227 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57228   // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
57229 // if hasFP16 support:
57230 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
57231 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
57232 // else
57233   //   SINT_TO_FP(vXi1~31) -> SINT_TO_FP(SEXT(vXi1~31 to vXi32))
57234 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
57235 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57236 unsigned ScalarSize = InVT.getScalarSizeInBits();
57237 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57238 ScalarSize >= 64)
57239 return SDValue();
57240 SDLoc dl(N);
57241 EVT DstVT =
57243 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57244 : ScalarSize < 32 ? MVT::i32
57245 : MVT::i64,
57246 InVT.getVectorNumElements());
57247 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57248 if (IsStrict)
57249 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57250 {N->getOperand(0), P});
57251 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57252 }
57253
57254 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
57255 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
57256 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
57257 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57258 VT.getScalarType() != MVT::f16) {
57259 SDLoc dl(N);
57260 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57261 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57262 if (IsStrict)
57263 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57264 {N->getOperand(0), P});
57265 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57266 }
57267
57268 // Without AVX512DQ we only support i64 to float scalar conversion. For both
57269 // vectors and scalars, see if we know that the upper bits are all the sign
57270 // bit, in which case we can truncate the input to i32 and convert from that.
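  //
  // For example (an illustrative sketch):
  //   (sint_to_fp (i64 (sext i32 X))) --> (sint_to_fp (i32 X))
  // since all of the upper 33 bits are copies of the sign bit.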
57271 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
57272 unsigned BitWidth = InVT.getScalarSizeInBits();
57273 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
57274 if (NumSignBits >= (BitWidth - 31)) {
57275 EVT TruncVT = MVT::i32;
57276 if (InVT.isVector())
57277 TruncVT = InVT.changeVectorElementType(TruncVT);
57278 SDLoc dl(N);
57279 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
57280 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
57281 if (IsStrict)
57282 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57283 {N->getOperand(0), Trunc});
57284 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
57285 }
57286 // If we're after legalize and the type is v2i32 we need to shuffle and
57287 // use CVTSI2P.
57288 assert(InVT == MVT::v2i64 && "Unexpected VT!");
57289 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
57290 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
57291 { 0, 2, -1, -1 });
57292 if (IsStrict)
57293 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
57294 {N->getOperand(0), Shuf});
57295 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
57296 }
57297 }
57298
57299 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
57300 // a 32-bit target where SSE doesn't support i64->FP operations.
57301 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
57302 Op0.getOpcode() == ISD::LOAD) {
57303 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
57304
57305 // This transformation is not supported if the result type is f16 or f128.
57306 if (VT == MVT::f16 || VT == MVT::f128)
57307 return SDValue();
57308
57309 // If we have AVX512DQ we can use packed conversion instructions unless
57310 // the VT is f80.
57311 if (Subtarget.hasDQI() && VT != MVT::f80)
57312 return SDValue();
57313
57314 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
57315 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
57316 std::pair<SDValue, SDValue> Tmp =
57317 Subtarget.getTargetLowering()->BuildFILD(
57318 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
57319 Ld->getPointerInfo(), Ld->getBaseAlign(), DAG);
57320 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
57321 return Tmp.first;
57322 }
57323 }
57324
57325 if (IsStrict)
57326 return SDValue();
57327
57328 if (SDValue V = combineToFPTruncExtElt(N, DAG))
57329 return V;
57330
57331 return SDValue();
57332}
57333
57335 const X86Subtarget &Subtarget) {
57336 EVT VT = N->getValueType(0);
57337 SDValue Src = N->getOperand(0);
57338 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::FRINT &&
57339 VT.getScalarType() == MVT::i32 && Src.hasOneUse())
57340 return DAG.getNode(ISD::LRINT, SDLoc(N), VT, Src.getOperand(0));
57341
57342 return SDValue();
57343}
57344
57345// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS
57347 const X86Subtarget &Subtarget) {
57348 if (!Subtarget.hasAVX10_2())
57349 return SDValue();
57350
57351 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
57352 EVT SrcVT = N->getOperand(0).getValueType();
57353 EVT DstVT = N->getValueType(0);
57354 SDLoc dl(N);
57355
57356 if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) {
57357 SDValue V2F32Value = DAG.getUNDEF(SrcVT);
57358
57359 // Concatenate the original v2f32 input and V2F32Value to create v4f32
57360 SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
57361 N->getOperand(0), V2F32Value);
57362
57363 // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node
57364 if (IsSigned)
57365 return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc);
57366
57367 return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc);
57368 }
57369 return SDValue();
57370}
57371
57373 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57374
57375 for (const SDNode *User : Flags->users()) {
57376 X86::CondCode CC;
57377 switch (User->getOpcode()) {
57378 default:
57379 // Be conservative.
57380 return true;
57381 case X86ISD::SETCC:
57383 CC = (X86::CondCode)User->getConstantOperandVal(0);
57384 break;
57385 case X86ISD::BRCOND:
57386 case X86ISD::CMOV:
57387 CC = (X86::CondCode)User->getConstantOperandVal(2);
57388 break;
57389 }
57390
57391 switch (CC) {
57392 // clang-format off
57393 default: break;
57394 case X86::COND_A: case X86::COND_AE:
57395 case X86::COND_B: case X86::COND_BE:
57396 case X86::COND_O: case X86::COND_NO:
57397 case X86::COND_G: case X86::COND_GE:
57398 case X86::COND_L: case X86::COND_LE:
57399 return true;
57400 // clang-format on
57401 }
57402 }
57403
57404 return false;
57405}
57406
57407static bool onlyZeroFlagUsed(SDValue Flags) {
57408 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57409
57410 for (const SDNode *User : Flags->users()) {
57411 unsigned CCOpNo;
57412 switch (User->getOpcode()) {
57413 default:
57414 // Be conservative.
57415 return false;
57416 case X86ISD::SETCC:
57418 CCOpNo = 0;
57419 break;
57420 case X86ISD::BRCOND:
57421 case X86ISD::CMOV:
57422 CCOpNo = 2;
57423 break;
57424 }
57425
57426 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
57427 if (CC != X86::COND_E && CC != X86::COND_NE)
57428 return false;
57429 }
57430
57431 return true;
57432}
57433
57436 const X86Subtarget &Subtarget) {
57437 // Only handle test patterns.
57438 if (!isNullConstant(N->getOperand(1)))
57439 return SDValue();
57440
57441 // If we have a CMP of a truncated binop, see if we can make a smaller binop
57442 // and use its flags directly.
57443 // TODO: Maybe we should try promoting compares that only use the zero flag
57444 // first if we can prove the upper bits with computeKnownBits?
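  //
  // For example (an illustrative sketch):
  //   (X86ISD::CMP (i32 (trunc (xor i64 A, B))), 0)
  // can instead use the flags of a 32-bit (X86ISD::XOR (trunc A), (trunc B)),
  // avoiding the wider XOR followed by a separate compare.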
57445 SDLoc dl(N);
57446 SDValue Op = N->getOperand(0);
57447 EVT VT = Op.getValueType();
57448 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57449
57450 if (SDValue CMP =
57451 combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget))
57452 return CMP;
57453
57454 // If we have a constant logical shift that's only used in a comparison
57455 // against zero turn it into an equivalent AND. This allows turning it into
57456 // a TEST instruction later.
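  //
  // For example (an illustrative sketch), when only the Z flag is consumed:
  //   (srl X, 3) == 0  -->  (and X, 0xFFFFFFF8) == 0
  // which isel can then select as a single TEST instruction.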
57457 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
57458 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
57459 onlyZeroFlagUsed(SDValue(N, 0))) {
57460 unsigned BitWidth = VT.getSizeInBits();
57461 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
57462 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
57463 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
57464 APInt Mask = Op.getOpcode() == ISD::SRL
57465 ? APInt::getHighBitsSet(BitWidth, MaskBits)
57466 : APInt::getLowBitsSet(BitWidth, MaskBits);
57467 if (Mask.isSignedIntN(32)) {
57468 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
57469 DAG.getConstant(Mask, dl, VT));
57470 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57471 DAG.getConstant(0, dl, VT));
57472 }
57473 }
57474 }
57475
57476   // If we're extracting from an avx512 bool vector and comparing against zero,
57477 // then try to just bitcast the vector to an integer to use TEST/BT directly.
57478 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
57479 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
57480 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
57481 SDValue Src = Op.getOperand(0);
57482 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
57483 isNullConstant(Src.getOperand(1)) &&
57484 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
57485 SDValue BoolVec = Src.getOperand(0);
57486 unsigned ShAmt = 0;
57487 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
57488 ShAmt = BoolVec.getConstantOperandVal(1);
57489 BoolVec = BoolVec.getOperand(0);
57490 }
57491 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
57492 EVT VecVT = BoolVec.getValueType();
57493 unsigned BitWidth = VecVT.getVectorNumElements();
57494 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
57495 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
57496 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
57497 Op = DAG.getBitcast(BCVT, BoolVec);
57498 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
57499 DAG.getConstant(Mask, dl, BCVT));
57500 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57501 DAG.getConstant(0, dl, BCVT));
57502 }
57503 }
57504 }
57505
57506 // Peek through any zero-extend if we're only testing for a zero result.
57507 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
57508 SDValue Src = Op.getOperand(0);
57509 EVT SrcVT = Src.getValueType();
57510 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
57511 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
57512 DAG.getConstant(0, dl, SrcVT));
57513 }
57514
57515 // Look for a truncate.
57516 if (Op.getOpcode() != ISD::TRUNCATE)
57517 return SDValue();
57518
57519 SDValue Trunc = Op;
57520 Op = Op.getOperand(0);
57521
57522 // See if we can compare with zero against the truncation source,
57523 // which should help using the Z flag from many ops. Only do this for
57524 // i32 truncated op to prevent partial-reg compares of promoted ops.
57525 EVT OpVT = Op.getValueType();
57526 APInt UpperBits =
57528 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
57529 onlyZeroFlagUsed(SDValue(N, 0))) {
57530 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57531 DAG.getConstant(0, dl, OpVT));
57532 }
57533
57534 // After this the truncate and arithmetic op must have a single use.
57535 if (!Trunc.hasOneUse() || !Op.hasOneUse())
57536 return SDValue();
57537
57538 unsigned NewOpc;
57539 switch (Op.getOpcode()) {
57540 default: return SDValue();
57541 case ISD::AND:
57542     // Skip AND with a constant. We have special handling for AND with an
57543     // immediate during isel to generate TEST instructions.
57544 if (isa<ConstantSDNode>(Op.getOperand(1)))
57545 return SDValue();
57546 NewOpc = X86ISD::AND;
57547 break;
57548 case ISD::OR: NewOpc = X86ISD::OR; break;
57549 case ISD::XOR: NewOpc = X86ISD::XOR; break;
57550 case ISD::ADD:
57551 // If the carry or overflow flag is used, we can't truncate.
57553 return SDValue();
57554 NewOpc = X86ISD::ADD;
57555 break;
57556 case ISD::SUB:
57557 // If the carry or overflow flag is used, we can't truncate.
57559 return SDValue();
57560 NewOpc = X86ISD::SUB;
57561 break;
57562 }
57563
57564 // We found an op we can narrow. Truncate its inputs.
57565 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
57566 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
57567
57568 // Use a X86 specific opcode to avoid DAG combine messing with it.
57569 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57570 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
57571
57572 // For AND, keep a CMP so that we can match the test pattern.
57573 if (NewOpc == X86ISD::AND)
57574 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57575 DAG.getConstant(0, dl, VT));
57576
57577 // Return the flags.
57578 return Op.getValue(1);
57579}
57580
57583 const X86Subtarget &ST) {
57584 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
57585 "Expected X86ISD::ADD or X86ISD::SUB");
57586
57587 SDLoc DL(N);
57588 SDValue LHS = N->getOperand(0);
57589 SDValue RHS = N->getOperand(1);
57590 MVT VT = LHS.getSimpleValueType();
57591 bool IsSub = X86ISD::SUB == N->getOpcode();
57592 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
57593
57594 if (IsSub && isOneConstant(RHS) && !N->hasAnyUseOfValue(0))
57595 if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST))
57596 return CMP;
57597
57598 // If we don't use the flag result, simplify back to a generic ADD/SUB.
57599 if (!N->hasAnyUseOfValue(1)) {
57600 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
57601 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
57602 }
57603
57604 // Fold any similar generic ADD/SUB opcodes to reuse this node.
57605 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
57606 SDValue Ops[] = {N0, N1};
57607 SDVTList VTs = DAG.getVTList(N->getValueType(0));
57608 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
57609 SDValue Op(N, 0);
57610 if (Negate) {
57611 // Bail if this is only used by a user of the x86 add/sub.
57612 if (GenericAddSub->hasOneUse() &&
57613 GenericAddSub->user_begin()->isOnlyUserOf(N))
57614 return;
57615 Op = DAG.getNegative(Op, DL, VT);
57616 }
57617 DCI.CombineTo(GenericAddSub, Op);
57618 }
57619 };
57620 MatchGeneric(LHS, RHS, false);
57621 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
57622
57623 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
57624 // EFLAGS result doesn't change.
57625 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
57626 /*ZeroSecondOpOnly*/ true);
57627}
57628
57630 SDValue LHS = N->getOperand(0);
57631 SDValue RHS = N->getOperand(1);
57632 SDValue BorrowIn = N->getOperand(2);
57633
57634 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
57635 MVT VT = N->getSimpleValueType(0);
57636 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57637 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
57638 }
57639
57640 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
57641 // iff the flag result is dead.
57642 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
57643 !N->hasAnyUseOfValue(1))
57644 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57645 LHS.getOperand(1), BorrowIn);
57646
57647 return SDValue();
57648}
57649
57650// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
57653 SDValue LHS = N->getOperand(0);
57654 SDValue RHS = N->getOperand(1);
57655 SDValue CarryIn = N->getOperand(2);
57656 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
57657 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
57658
57659 // Canonicalize constant to RHS.
57660 if (LHSC && !RHSC)
57661 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
57662 CarryIn);
57663
57664 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
57665 // the result is either zero or one (depending on the input carry bit).
57666 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
57667 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
57668 // We don't have a good way to replace an EFLAGS use, so only do this when
57669 // dead right now.
57670 SDValue(N, 1).use_empty()) {
57671 SDLoc DL(N);
57672 EVT VT = N->getValueType(0);
57673 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
57674 SDValue Res1 = DAG.getNode(
57675 ISD::AND, DL, VT,
57677 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
57678 DAG.getConstant(1, DL, VT));
57679 return DCI.CombineTo(N, Res1, CarryOut);
57680 }
57681
57682 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
57683 // iff the flag result is dead.
57684 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
57685 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
57686 SDLoc DL(N);
57687 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
57688 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
57689 DAG.getConstant(0, DL, LHS.getValueType()),
57690 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
57691 }
57692
57693 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
57694 MVT VT = N->getSimpleValueType(0);
57695 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57696 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
57697 }
57698
57699 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
57700 // iff the flag result is dead.
57701 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
57702 !N->hasAnyUseOfValue(1))
57703 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57704 LHS.getOperand(1), CarryIn);
57705
57706 return SDValue();
57707}
57708
57710 const SDLoc &DL, EVT VT,
57711 const X86Subtarget &Subtarget) {
57712 using namespace SDPatternMatch;
57713
57714 // Example of pattern we try to detect:
57715 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
57716 //(add (build_vector (extract_elt t, 0),
57717 // (extract_elt t, 2),
57718 // (extract_elt t, 4),
57719 // (extract_elt t, 6)),
57720 // (build_vector (extract_elt t, 1),
57721 // (extract_elt t, 3),
57722 // (extract_elt t, 5),
57723 // (extract_elt t, 7)))
57724
57725 if (!Subtarget.hasSSE2())
57726 return SDValue();
57727
57728 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57729 VT.getVectorNumElements() < 4 ||
57731 return SDValue();
57732
57733 SDValue Op0, Op1, Accum;
57738 m_Value(Op1))))))
57739 return SDValue();
57740
57741 // Check if one of Op0,Op1 is of the form:
57742 // (build_vector (extract_elt Mul, 0),
57743 // (extract_elt Mul, 2),
57744 // (extract_elt Mul, 4),
57745 // ...
57746 // the other is of the form:
57747 // (build_vector (extract_elt Mul, 1),
57748 // (extract_elt Mul, 3),
57749 // (extract_elt Mul, 5),
57750 // ...
57751 // and identify Mul.
57752 SDValue Mul;
57753 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
57754 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
57755 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
57756 // TODO: Be more tolerant to undefs.
57757 APInt Idx0L, Idx0H, Idx1L, Idx1H;
57758 SDValue Vec0L, Vec0H, Vec1L, Vec1H;
57759 if (!sd_match(Op0L, m_ExtractElt(m_Value(Vec0L), m_ConstInt(Idx0L))) ||
57760 !sd_match(Op0H, m_ExtractElt(m_Value(Vec0H), m_ConstInt(Idx0H))) ||
57761 !sd_match(Op1L, m_ExtractElt(m_Value(Vec1L), m_ConstInt(Idx1L))) ||
57762 !sd_match(Op1H, m_ExtractElt(m_Value(Vec1H), m_ConstInt(Idx1H))))
57763 return SDValue();
57764 // Commutativity of mul allows factors of a product to reorder.
57765 if (Idx0L.getZExtValue() > Idx1L.getZExtValue())
57766 std::swap(Idx0L, Idx1L);
57767 if (Idx0H.getZExtValue() > Idx1H.getZExtValue())
57768 std::swap(Idx0H, Idx1H);
57769 // Commutativity of add allows pairs of factors to reorder.
57770 if (Idx0L.getZExtValue() > Idx0H.getZExtValue()) {
57771 std::swap(Idx0L, Idx0H);
57772 std::swap(Idx1L, Idx1H);
57773 }
57774 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
57775 Idx1H != 2 * i + 3)
57776 return SDValue();
57777 if (!Mul) {
57778 // First time an extract_elt's source vector is visited. Must be a MUL
57779 // with 2X number of vector elements than the BUILD_VECTOR.
57780 // Both extracts must be from same MUL.
57781 Mul = Vec0L;
57782 if (Mul.getOpcode() != ISD::MUL ||
57783 Mul.getValueType().getVectorNumElements() != 2 * e)
57784 return SDValue();
57785 }
57786 // Check that the extract is from the same MUL previously seen.
57787 if (Mul != Vec0L || Mul != Vec1L || Mul != Vec0H || Mul != Vec1H)
57788 return SDValue();
57789 }
57790
57791 // Check if the Mul source can be safely shrunk.
57793 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
57795 return SDValue();
57796
57797 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57798 VT.getVectorNumElements() * 2);
57799 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
57800 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
57801
57802 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57804 EVT InVT = Ops[0].getValueType();
57805 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
57806 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57807 InVT.getVectorNumElements() / 2);
57808 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57809 };
57810 SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDBuilder);
57811 if (Accum)
57812 R = DAG.getNode(ISD::ADD, DL, VT, R, Accum);
57813 return R;
57814}
57815
57816// Attempt to turn this pattern into PMADDWD.
57817// (add (mul (sext (build_vector)), (sext (build_vector))),
57818// (mul (sext (build_vector)), (sext (build_vector)))
57820 const SDLoc &DL, EVT VT,
57821 const X86Subtarget &Subtarget) {
57822 using namespace SDPatternMatch;
57823
57824 if (!Subtarget.hasSSE2())
57825 return SDValue();
57826
57827 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57828 VT.getVectorNumElements() < 4 ||
57830 return SDValue();
57831
57832 // All inputs need to be sign extends.
57833 // TODO: Support ZERO_EXTEND from known positive?
57834 SDValue N00, N01, N10, N11;
57835 if (!sd_match(N, m_Add(m_Mul(m_SExt(m_Value(N00)), m_SExt(m_Value(N01))),
57836 m_Mul(m_SExt(m_Value(N10)), m_SExt(m_Value(N11))))))
57837 return SDValue();
57838
57839 // Must be extending from vXi16.
57840 EVT InVT = N00.getValueType();
57841 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
57842 N10.getValueType() != InVT || N11.getValueType() != InVT)
57843 return SDValue();
57844
57845 // All inputs should be build_vectors.
57846 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
57847 N01.getOpcode() != ISD::BUILD_VECTOR ||
57848 N10.getOpcode() != ISD::BUILD_VECTOR ||
57850 return SDValue();
57851
57852   // For each output element we need an odd element from one vector multiplied
57853   // by the odd element of the other vector, plus the even element from one of
57854   // the same vectors multiplied by the even element of the other vector. In
57855   // other words, for each element i the following operation must be
57856   // performed:
57857   //   A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
57858 SDValue In0, In1;
57859 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
57860 SDValue N00Elt = N00.getOperand(i);
57861 SDValue N01Elt = N01.getOperand(i);
57862 SDValue N10Elt = N10.getOperand(i);
57863 SDValue N11Elt = N11.getOperand(i);
57864 // TODO: Be more tolerant to undefs.
57865 SDValue N00In, N01In, N10In, N11In;
57866 APInt IdxN00, IdxN01, IdxN10, IdxN11;
57867 if (!sd_match(N00Elt, m_ExtractElt(m_Value(N00In), m_ConstInt(IdxN00))) ||
57868 !sd_match(N01Elt, m_ExtractElt(m_Value(N01In), m_ConstInt(IdxN01))) ||
57869 !sd_match(N10Elt, m_ExtractElt(m_Value(N10In), m_ConstInt(IdxN10))) ||
57870 !sd_match(N11Elt, m_ExtractElt(m_Value(N11In), m_ConstInt(IdxN11))))
57871 return SDValue();
57872 // Add is commutative so indices can be reordered.
57873 if (IdxN00.getZExtValue() > IdxN10.getZExtValue()) {
57874 std::swap(IdxN00, IdxN10);
57875 std::swap(IdxN01, IdxN11);
57876 }
57877     // N0 indices must be the even element. N1 indices must be the next odd element.
57878 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
57879 IdxN11 != 2 * i + 1)
57880 return SDValue();
57881
57882 // First time we find an input capture it.
57883 if (!In0) {
57884 In0 = N00In;
57885 In1 = N01In;
57886
57887 // The input vectors must be at least as wide as the output.
57888 // If they are larger than the output, we extract subvector below.
57889 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
57890 In1.getValueSizeInBits() < VT.getSizeInBits())
57891 return SDValue();
57892 }
57893 // Mul is commutative so the input vectors can be in any order.
57894 // Canonicalize to make the compares easier.
57895 if (In0 != N00In)
57896 std::swap(N00In, N01In);
57897 if (In0 != N10In)
57898 std::swap(N10In, N11In);
57899 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
57900 return SDValue();
57901 }
57902
57903 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57905 EVT OpVT = Ops[0].getValueType();
57906 assert(OpVT.getScalarType() == MVT::i16 &&
57907 "Unexpected scalar element type");
57908 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
57909 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57910 OpVT.getVectorNumElements() / 2);
57911 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57912 };
57913
57914 // If the output is narrower than an input, extract the low part of the input
57915 // vector.
57916 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57917 VT.getVectorNumElements() * 2);
57918 if (OutVT16.bitsLT(In0.getValueType())) {
57919 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
57920 DAG.getVectorIdxConstant(0, DL));
57921 }
57922 if (OutVT16.bitsLT(In1.getValueType())) {
57923 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
57924 DAG.getVectorIdxConstant(0, DL));
57925 }
57926 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
57927 PMADDBuilder);
57928}
57929
57930// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
57931 // If the upper element in each pair of both VPMADDWD nodes is zero then we can
57932 // merge the operand elements and use the implicit add of VPMADDWD.
57933// TODO: Add support for VPMADDUBSW (which isn't commutable).
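//
// For example (an illustrative sketch), for v4i32 results the shuffle mask
// built below is {0,8,2,10,4,12,6,14}: the even i16 elements of X are
// interleaved with the even elements of Z (and likewise Y with W), so a single
// VPMADDWD computes X[2i]*Y[2i] + Z[2i]*W[2i] per lane, which equals the
// original ADD because the odd-element products are known to be zero.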
57935 const SDLoc &DL, EVT VT) {
57936 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
57937 return SDValue();
57938
57939 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
57940 if (VT.getSizeInBits() > 128)
57941 return SDValue();
57942
57943 unsigned NumElts = VT.getVectorNumElements();
57944 MVT OpVT = N0.getOperand(0).getSimpleValueType();
57946 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
57947
57948 bool Op0HiZero =
57949 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
57950 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
57951 bool Op1HiZero =
57952 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
57953 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
57954
57955 // TODO: Check for zero lower elements once we have actual codegen that
57956 // creates them.
57957 if (!Op0HiZero || !Op1HiZero)
57958 return SDValue();
57959
57960 // Create a shuffle mask packing the lower elements from each VPMADDWD.
57961 SmallVector<int> Mask;
57962 for (int i = 0; i != (int)NumElts; ++i) {
57963 Mask.push_back(2 * i);
57964 Mask.push_back(2 * (i + NumElts));
57965 }
57966
57967 SDValue LHS =
57968 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
57969 SDValue RHS =
57970 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
57971 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
57972}
57973
57974/// CMOV of constants requires materializing constant operands in registers.
57975/// Try to fold those constants into an 'add' instruction to reduce instruction
57976 /// count. We do this with CMOV rather than the generic 'select' because there are
57977/// earlier folds that may be used to turn select-of-constants into logic hacks.
57979 SelectionDAG &DAG,
57980 const X86Subtarget &Subtarget) {
57981 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
57982 // better because we eliminate 1-2 instructions. This transform is still
57983 // an improvement without zero operands because we trade 2 move constants and
57984 // 1 add for 2 adds (LEA) as long as the constants can be represented as
57985 // immediate asm operands (fit in 32-bits).
57986 auto isSuitableCmov = [](SDValue V) {
57987 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
57988 return false;
57989 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
57990 !isa<ConstantSDNode>(V.getOperand(1)))
57991 return false;
57992 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
57993 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
57994 V.getConstantOperandAPInt(1).isSignedIntN(32));
57995 };
57996
57997 // Match an appropriate CMOV as the first operand of the add.
57998 SDValue Cmov = N->getOperand(0);
57999 SDValue OtherOp = N->getOperand(1);
58000 if (!isSuitableCmov(Cmov))
58001 std::swap(Cmov, OtherOp);
58002 if (!isSuitableCmov(Cmov))
58003 return SDValue();
58004
58005 // Don't remove a load folding opportunity for the add. That would neutralize
58006 // any improvements from removing constant materializations.
58007 if (X86::mayFoldLoad(OtherOp, Subtarget))
58008 return SDValue();
58009
58010 EVT VT = N->getValueType(0);
58011 SDValue FalseOp = Cmov.getOperand(0);
58012 SDValue TrueOp = Cmov.getOperand(1);
58013
58014 // We will push the add through the select, but we can potentially do better
58015 // if we know there is another add in the sequence and this is pointer math.
58016 // In that case, we can absorb an add into the trailing memory op and avoid
58017 // a 3-operand LEA which is likely slower than a 2-operand LEA.
58018 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
58019 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
58020 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
58021 all_of(N->users(), [&](SDNode *Use) {
58022 auto *MemNode = dyn_cast<MemSDNode>(Use);
58023 return MemNode && MemNode->getBasePtr().getNode() == N;
58024 })) {
58025 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
58026 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
58027 // it is possible that choosing op1 might be better.
58028 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
58029 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
58030 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
58031 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
58032 Cmov.getOperand(2), Cmov.getOperand(3));
58033 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
58034 }
58035
58036 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
58037 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
58038 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
58039 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
58040 Cmov.getOperand(3));
58041}
58042
58043 // Attempt to turn ADD(MUL(x, y), acc) -> VPMADD52L
58044 // when the upper 12 bits of x, y and MUL(x, y) are known to be 0.
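//
// For example (an illustrative sketch): if x and y are each known to fit in
// 26 bits, their 64-bit product fits in 52 bits, so it equals the low-52-bit
// product that VPMADD52L adds to the accumulator, and ADD(MUL(x, y), acc) can
// become a single VPMADD52LUQ.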
58046 EVT VT, const X86Subtarget &Subtarget) {
58047 using namespace SDPatternMatch;
58048 if (!VT.isVector() || VT.getScalarSizeInBits() != 64 ||
58049 (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()))
58050 return SDValue();
58051
58052 // Need AVX-512VL vector length extensions if operating on XMM/YMM registers
58053 if (!Subtarget.hasAVXIFMA() && !Subtarget.hasVLX() &&
58054 VT.getSizeInBits() < 512)
58055 return SDValue();
58056
58057 const auto TotalSize = VT.getSizeInBits();
58058 if (TotalSize < 128 || !isPowerOf2_64(TotalSize))
58059 return SDValue();
58060
58061 SDValue X, Y, Acc;
58062 if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
58063 return SDValue();
58064
58065 KnownBits KnownX = DAG.computeKnownBits(X);
58066 if (KnownX.countMinLeadingZeros() < 12)
58067 return SDValue();
58068 KnownBits KnownY = DAG.computeKnownBits(Y);
58069 if (KnownY.countMinLeadingZeros() < 12)
58070 return SDValue();
58071 KnownBits KnownMul = KnownBits::mul(KnownX, KnownY);
58072 if (KnownMul.countMinLeadingZeros() < 12)
58073 return SDValue();
58074
58075 auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL,
58076 ArrayRef<SDValue> SubOps) {
58077 EVT SubVT = SubOps[0].getValueType();
58078 assert(SubVT.getScalarSizeInBits() == 64 &&
58079 "Unexpected element size, only supports 64bit size");
58080 return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[1] /*X*/,
58081 SubOps[2] /*Y*/, SubOps[0] /*Acc*/);
58082 };
58083
58084 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder,
58085 /*CheckBWI*/ false,
58086 /*AllowAVX512*/ Subtarget.hasIFMA());
58087}
58088
58091 const X86Subtarget &Subtarget) {
58092 using namespace SDPatternMatch;
58093 EVT VT = N->getValueType(0);
58094 SDValue Op0 = N->getOperand(0);
58095 SDValue Op1 = N->getOperand(1);
58096 SDLoc DL(N);
58097
58098 if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
58099 return Select;
58100
58101 if (SDValue MAdd = matchPMADDWD(DAG, N, DL, VT, Subtarget))
58102 return MAdd;
58103 if (SDValue MAdd = matchPMADDWD_2(DAG, N, DL, VT, Subtarget))
58104 return MAdd;
58105 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
58106 return MAdd;
58107
58108 // Try to synthesize horizontal adds from adds of shuffles.
58109 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58110 return V;
58111
58112 // Canonicalize hidden LEA pattern:
58113 // Fold (add (sub (shl x, c), y), z) -> (sub (add (shl x, c), z), y)
58114 // iff c < 4
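// (An x86 LEA encodes base + index*scale with scale 1/2/4/8, i.e. a shift
// amount of at most 3, so keeping the shl adjacent to the add lets the whole
// expression fold into a single LEA.)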
58115 if (VT == MVT::i32 || VT == MVT::i64) {
58116 SDValue Y, Z, Shift;
58117 APInt Amt;
58118 if (sd_match(
58120 m_Shl(m_Value(), m_ConstInt(Amt))),
58121 m_Value(Y))),
58122 m_Value(Z))) &&
58123 Amt.ult(4) && !isa<ConstantSDNode>(Z)) {
58124 return DAG.getNode(ISD::SUB, DL, VT,
58125 DAG.getNode(ISD::ADD, DL, VT, Shift, Z), Y);
58126 }
58127 }
58128
58129 SDValue X, Y;
58130
58131 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
58132 // iff X and Y won't overflow.
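// (psadbw(V, 0) sums the unsigned bytes of each 64-bit chunk of V, so if the
// per-byte add X+Y cannot wrap, the byte sum of X+Y equals the sum of the two
// separate byte sums.)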
58133 if (sd_match(Op0, m_c_BinOp(X86ISD::PSADBW, m_Value(X), m_Zero())) &&
58135 DAG.willNotOverflowAdd(/*IsSigned=*/false, X, Y)) {
58136 MVT OpVT = X.getSimpleValueType();
58137 SDValue Sum = DAG.getNode(ISD::ADD, DL, OpVT, X, Y);
58138 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
58139 getZeroVector(OpVT, Subtarget, DAG, DL));
58140 }
58141
58142 if (VT.isVector()) {
58143 EVT BoolVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
58145
58146 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
58147 // (sub Y, (sext (vXi1 X))).
58148 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y)
58149 // in generic DAG combine without a legal type check, but adding this there
58150 // caused regressions.
58151 if (DAG.getTargetLoweringInfo().isTypeLegal(BoolVT) &&
58153 m_Value(Y)))) {
58154 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, X);
58155 return DAG.getNode(ISD::SUB, DL, VT, Y, SExt);
58156 }
58157
58158 // Fold (add X, (srl Y, 7)) -> (sub X, (icmp_sgt 0, Y)) to undo instcombine
58159 // canonicalisation as we don't have good vXi8 shifts.
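// (For i8, (srl Y, 7) is 1 exactly when Y is negative, and (icmp_sgt 0, Y)
// sign-extends to -1 in that case, so X + (Y u>> 7) == X - sext(0 > Y); the
// compare maps to a cheap pcmpgtb.)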
58160 if (VT.getScalarType() == MVT::i8 &&
58162 SDValue Cmp =
58163 DAG.getSetCC(DL, BoolVT, DAG.getConstant(0, DL, VT), Y, ISD::SETGT);
58164 return DAG.getNode(ISD::SUB, DL, VT, X, DAG.getSExtOrTrunc(Cmp, DL, VT));
58165 }
58166 }
58167
58168 // Peephole for 512-bit VPDPWSSD on non-VLX targets.
58169 // TODO: Should this be part of matchPMADDWD/matchPMADDWD_2?
58170 if (Subtarget.hasVNNI() && Subtarget.useAVX512Regs() && VT == MVT::v16i32) {
58171 SDValue Accum, Lo0, Lo1, Hi0, Hi1;
58172 if (sd_match(N, m_Add(m_Value(Accum),
58175 m_Value(Lo1)),
58177 m_Value(Hi1)))))) {
58178 return DAG.getNode(X86ISD::VPDPWSSD, DL, VT, Accum,
58179 concatSubVectors(Lo0, Hi0, DAG, DL),
58180 concatSubVectors(Lo1, Hi1, DAG, DL));
58181 }
58182 }
58183
58184 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
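// (ADC(Y, 0, W) computes Y + carry(W), so adding X afterwards is the same as
// the single ADC(X, Y, W) == X + Y + carry(W).)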
58185 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
58186 X86::isZeroNode(Op0.getOperand(1))) {
58187 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
58188 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
58189 Op0.getOperand(0), Op0.getOperand(2));
58190 }
58191
58192 if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget))
58193 return IFMA52;
58194
58195 return combineAddOrSubToADCOrSBB(N, DL, DAG);
58196}
58197
58198// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
58199// condition comes from the subtract node that produced -X. This matches the
58200// cmov expansion for absolute value. By swapping the operands we convert abs
58201// to nabs.
58202static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1,
58203 SelectionDAG &DAG) {
58204 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
58205 return SDValue();
58206
58207 SDValue Cond = N1.getOperand(3);
58208 if (Cond.getOpcode() != X86ISD::SUB)
58209 return SDValue();
58210 assert(Cond.getResNo() == 1 && "Unexpected result number");
58211
58212 SDValue FalseOp = N1.getOperand(0);
58213 SDValue TrueOp = N1.getOperand(1);
58215
58216 // ABS condition should come from a negate operation.
58217 if ((CC == X86::COND_S || CC == X86::COND_NS) &&
58218 isNullConstant(Cond.getOperand(0))) {
58219 // Get the X and -X from the negate.
58220 SDValue NegX = Cond.getValue(0);
58221 SDValue X = Cond.getOperand(1);
58222
58223 // Cmov operands should be X and NegX. Order doesn't matter.
58224 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
58225 return SDValue();
58226
58227 // Build a new CMOV with the operands swapped.
58228 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
58229 N1.getOperand(2), Cond);
58230 // Convert sub to add.
58231 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
58232 }
58233
58234 // Handle ABD special case:
58235 // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
58236 // ABD condition should come from a pair of matching subtracts.
58237 if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) &&
58238 (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) &&
58239 (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) &&
58240 (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) &&
58241 (TrueOp.getOperand(0) == FalseOp.getOperand(1)) &&
58242 (TrueOp.getOperand(1) == FalseOp.getOperand(0))) {
58243 // Build a new CMOV with the operands swapped.
58244 return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2),
58245 Cond);
58246 }
58247
58248 return SDValue();
58249}
58250
58251 static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
58252 SDValue Op0 = N->getOperand(0);
58253 SDValue Op1 = N->getOperand(1);
58254
58255 // (sub C (zero_extend (setcc)))
58256 // =>
58257 // (add (zero_extend (setcc inverted)) C-1) if C is a nonzero immediate
58258 // Don't disturb (sub 0 setcc), which is easily done with neg.
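// (With b = setcc in {0,1}: C - b == (C - 1) + (1 - b), and (1 - b) is just
// the inverted setcc, so the immediate moves to the addend of an add.)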
58259 EVT VT = N->getValueType(0);
58260 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
58261 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
58262 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
58263 Op1.getOperand(0).hasOneUse()) {
58264 SDValue SetCC = Op1.getOperand(0);
58267 APInt NewImm = Op0C->getAPIntValue() - 1;
58268 SDLoc DL(Op1);
58269 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
58270 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
58271 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
58272 DAG.getConstant(NewImm, DL, VT));
58273 }
58274
58275 return SDValue();
58276}
58277
58279 if (N->getConstantOperandVal(3) != X86::COND_NE)
58280 return SDValue();
58281
58282 SDValue Sub = N->getOperand(4);
58283 if (Sub.getOpcode() != X86ISD::SUB)
58284 return SDValue();
58285
58286 SDValue Op1 = Sub.getOperand(1);
58287
58288 if (!X86::isZeroNode(Sub.getOperand(0)))
58289 return SDValue();
58290
58291 SDLoc DL(N);
58292 SmallVector<SDValue, 5> Ops(N->op_values());
58293 if (Op1.getOpcode() == X86ISD::SETCC) {
58294 // res, flags2 = sub 0, (setcc cc, flag)
58295 // cload/cstore ..., cond_ne, flag2
58296 // ->
58297 // cload/cstore cc, flag
58298 Ops[3] = Op1.getOperand(0);
58299 Ops[4] = Op1.getOperand(1);
58300 } else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) {
58301 SDValue Src = Op1;
58302 SDValue Op10 = Op1.getOperand(0);
58303 if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1))) {
58304 // res, flags2 = sub 0, (and (xor X, -1), Y)
58305 // cload/cstore ..., cond_ne, flag2
58306 // ->
58307 // res, flags2 = sub 0, (and X, Y)
58308 // cload/cstore ..., cond_e, flag2
58309 Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0),
58310 Op1.getOperand(1));
58311 Ops[3] = DAG.getTargetConstant(X86::COND_E, DL, MVT::i8);
58312 }
58313 // res, flags2 = sub 0, (and X, Y)
58314 // cload/cstore ..., cc, flag2
58315 // ->
58316 // res, flags2 = cmp (and X, Y), 0
58317 // cload/cstore ..., cc, flag2
58318 Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Src, Sub.getOperand(0));
58319 } else {
58320 return SDValue();
58321 }
58322
58323 return DAG.getMemIntrinsicNode(N->getOpcode(), DL, N->getVTList(), Ops,
58324 cast<MemSDNode>(N)->getMemoryVT(),
58325 cast<MemSDNode>(N)->getMemOperand());
58326}
58327
58330 const X86Subtarget &Subtarget) {
58331 EVT VT = N->getValueType(0);
58332 SDValue Op0 = N->getOperand(0);
58333 SDValue Op1 = N->getOperand(1);
58334 SDLoc DL(N);
58335
58336 auto IsNonOpaqueConstant = [&](SDValue Op) {
58338 /*AllowOpaques*/ false);
58339 };
58340
58341 // X86 can't encode an immediate LHS of a sub. See if we can push the
58342 // negation into a preceding instruction. If the RHS of the sub is a XOR with
58343 // one use and a constant, invert the immediate, saving one register.
58344 // However, ignore cases where C1 is 0, as those will become a NEG.
58345 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
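// (Since -V == ~V + 1 and ~xor(X, C2) == xor(X, ~C2), we get
// C1 - xor(X, C2) == xor(X, ~C2) + (C1 + 1).)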
58346 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
58347 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
58348 Op1->hasOneUse()) {
58349 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
58350 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
58351 SDValue NewAdd =
58352 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
58353 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
58354 }
58355
58356 if (SDValue V = combineSubABS(VT, DL, Op0, Op1, DAG))
58357 return V;
58358
58359 // Try to synthesize horizontal subs from subs of shuffles.
58360 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58361 return V;
58362
58363 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
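// (ADC(Y, 0, W) is Y + carry(W), so X - (Y + carry(W)) is exactly
// SBB(X, Y, W) == X - Y - carry(W).)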
58364 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
58365 X86::isZeroNode(Op1.getOperand(1))) {
58366 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58367 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
58368 Op1.getOperand(0), Op1.getOperand(2));
58369 }
58370
58371 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
58372 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
58373 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
58374 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
58375 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58376 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
58377 Op1.getOperand(1), Op1.getOperand(2));
58378 return DAG.getNode(ISD::SUB, DL, VT, ADC.getValue(0), Op1.getOperand(0));
58379 }
58380
58381 if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
58382 return V;
58383
58384 if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
58385 return V;
58386
58387 return combineSubSetcc(N, DAG);
58388}
58389
58391 const X86Subtarget &Subtarget) {
58392 unsigned Opcode = N->getOpcode();
58393 assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) &&
58394 "Unknown PCMP opcode");
58395
58396 SDValue LHS = N->getOperand(0);
58397 SDValue RHS = N->getOperand(1);
58398 MVT VT = N->getSimpleValueType(0);
58399 unsigned EltBits = VT.getScalarSizeInBits();
58400 unsigned NumElts = VT.getVectorNumElements();
58401 SDLoc DL(N);
58402
58403 if (LHS == RHS)
58404 return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT)
58405 : DAG.getConstant(0, DL, VT);
58406
58407 // Constant Folding.
58408 // PCMPEQ(X,UNDEF) -> UNDEF
58409 // PCMPGT(X,UNDEF) -> 0
58410 // PCMPGT(UNDEF,X) -> 0
58411 APInt LHSUndefs, RHSUndefs;
58412 SmallVector<APInt> LHSBits, RHSBits;
58413 if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) &&
58414 getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) {
58415 APInt Ones = APInt::getAllOnes(EltBits);
58416 APInt Zero = APInt::getZero(EltBits);
58417 SmallVector<APInt> Results(NumElts);
58418 for (unsigned I = 0; I != NumElts; ++I) {
58419 if (Opcode == X86ISD::PCMPEQ) {
58420 Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero;
58421 } else {
58422 bool AnyUndef = LHSUndefs[I] || RHSUndefs[I];
58423 Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero;
58424 }
58425 }
58426 if (Opcode == X86ISD::PCMPEQ)
58427 return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL);
58428 return getConstVector(Results, VT, DAG, DL);
58429 }
58430
58431 return SDValue();
58432}
58433
58434// Helper to determine if we can convert an integer comparison to a float
58435 // comparison by casting the operands.
58436static std::optional<unsigned>
58437CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
58438 unsigned NumSignificantBitsRHS) {
58439 MVT SVT = VT.getScalarType();
58440 assert(SVT == MVT::f32 && "Only tested for float so far");
58441 const fltSemantics &Sem = SVT.getFltSemantics();
58442 assert((CC == ISD::SETEQ || CC == ISD::SETGT) &&
58443 "Only PCMPEQ/PCMPGT currently supported");
58444
58445 // TODO: Handle bitcastable integers.
58446
58447 // For cvt + signed compare we need lhs and rhs to be exactly representable
58448 // as fp values.
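// (e.g. f32 has a 24-bit significand, so any integer with at most 24
// significant bits converts exactly via sint_to_fp and the signed ordering
// survives the conversion.)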
58449 unsigned FPPrec = APFloat::semanticsPrecision(Sem);
58450 if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS)
58451 return ISD::SINT_TO_FP;
58452
58453 return std::nullopt;
58454}
58455
58456/// Helper that combines an array of subvector ops as if they were the operands
58457 /// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
58458/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
58459 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
58460 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
58461 const X86Subtarget &Subtarget,
58462 unsigned Depth) {
58463 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
58464 unsigned EltSizeInBits = VT.getScalarSizeInBits();
58465
58466 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
58467 return DAG.getUNDEF(VT);
58468
58469 if (llvm::all_of(Ops, [](SDValue Op) {
58470 return Op.isUndef() || ISD::isBuildVectorAllZeros(Op.getNode());
58471 }))
58472 return getZeroVector(VT, Subtarget, DAG, DL);
58473
58475 return SDValue(); // Limit search depth.
58476
58477 SDValue Op0 = Ops[0];
58478 bool IsSplat = llvm::all_equal(Ops);
58479 unsigned NumOps = Ops.size();
58480 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58481 LLVMContext &Ctx = *DAG.getContext();
58482
58483 // Repeated subvectors.
58484 if (IsSplat &&
58485 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
58486 // If this broadcast is inserted into both halves, use a larger broadcast.
58487 if (Op0.getOpcode() == X86ISD::VBROADCAST)
58488 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
58489
58490 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
58491 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
58492 (Subtarget.hasAVX2() ||
58494 VT.getScalarType(), Subtarget)))
58495 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
58496 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
58497 Op0.getOperand(0),
58498 DAG.getVectorIdxConstant(0, DL)));
58499
58500 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
58501 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
58502 (Subtarget.hasAVX2() ||
58503 (EltSizeInBits >= 32 &&
58504 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
58505 Op0.getOperand(0).getValueType() == VT.getScalarType())
58506 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
58507
58508 // concat_vectors(extract_subvector(splat(x)),
58509 // extract_subvector(splat(x))) -> splat(x)
58510 // concat_vectors(extract_subvector(subv_broadcast(x)),
58511 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
58512 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58513 Op0.getOperand(0).getValueType() == VT) {
58514 SDValue SrcVec = Op0.getOperand(0);
58515 if (DAG.isSplatValue(SrcVec, /*AllowUndefs*/ false))
58516 return SrcVec;
58517 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
58518 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
58519 return SrcVec;
58520 }
58521
58522 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
58523 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
58524 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
58525 return DAG.getNode(Op0.getOpcode(), DL, VT,
58527 Op0.getOperand(0), Op0.getOperand(0)),
58528 Op0.getOperand(1));
58529 }
58530
58531 // TODO: This should go in combineX86ShufflesRecursively eventually.
58532 if (NumOps == 2) {
58533 SDValue Src0 = peekThroughBitcasts(Ops[0]);
58534 SDValue Src1 = peekThroughBitcasts(Ops[1]);
58535 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58537 EVT SrcVT0 = Src0.getOperand(0).getValueType();
58538 EVT SrcVT1 = Src1.getOperand(0).getValueType();
58539 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
58540 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
58541 const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
58542 const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
58543 // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
58544 // Only concat subvector high halves (which vperm2x128 handles best), or
58545 // cases where it should fold into a subvector broadcast.
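// (vperm2x128 imm8: bits [1:0] pick {src0.lo, src0.hi, src1.lo, src1.hi} for
// the low 128-bit half of the result, bits [5:4] do the same for the high
// half, and bits 3/7 zero a half; hence the 0x01/0x20/0x30 values below.)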
58546 if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
58547 SrcVT1.is256BitVector()) {
58548 assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58549 (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
58550 "Bad subvector index");
58551 if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
58552 (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
58553 unsigned Index = 0;
58554 Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
58555 Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
58556 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58557 DAG.getBitcast(VT, Src0.getOperand(0)),
58558 DAG.getBitcast(VT, Src1.getOperand(0)),
58559 DAG.getTargetConstant(Index, DL, MVT::i8));
58560 }
58561 }
58562 // Widen extract_subvector
58563 // concat(extract_subvector(x,lo), extract_subvector(x,hi))
58564 // --> extract_subvector(x,lo)
58565 unsigned NumSubElts0 = Src0.getValueType().getVectorNumElements();
58566 if (Src0.getOperand(0) == Src1.getOperand(0) &&
58567 (SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58568 SrcIdx1 == (SrcIdx0 + NumSubElts0)) {
58569 return DAG.getBitcast(VT,
58571 Src0.getConstantOperandVal(1),
58572 DAG, DL, VT.getSizeInBits()));
58573 }
58574 }
58575 }
58576
58577 // Repeated opcode.
58578 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
58579 // but it currently struggles with different vector widths.
58580 if (llvm::all_of(Ops, [Op0](SDValue Op) {
58581 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
58582 })) {
58583 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58585 for (SDValue SubOp : SubOps)
58586 Subs.push_back(SubOp.getOperand(I));
58587 // Attempt to peek through bitcasts and concat the original subvectors.
58588 EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
58589 if (SubVT.isSimple() && SubVT.isVector()) {
58590 MVT ConcatVT =
58592 SubVT.getVectorElementCount() * Subs.size());
58593 for (SDValue &Sub : Subs)
58594 Sub = DAG.getBitcast(SubVT, Sub);
58595 if (SDValue ConcatSrc = combineConcatVectorOps(DL, ConcatVT, Subs, DAG,
58596 Subtarget, Depth + 1))
58597 return DAG.getBitcast(VT, ConcatSrc);
58598 return DAG.getBitcast(
58599 VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
58600 }
58601 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58602 };
58603 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
58604 bool AllConstants = true;
58605 bool AllSubs = true;
58606 unsigned VecSize = VT.getSizeInBits();
58607 SDValue BC0 = peekThroughBitcasts(SubOps[0].getOperand(Op));
58608 if (isa<LoadSDNode>(BC0) && all_of(SubOps, [&](SDValue SubOp) {
58609 return BC0 == peekThroughBitcasts(SubOp.getOperand(Op));
58610 }))
58611 return true;
58612 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
58613 SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op));
58614 unsigned SubSize = BC.getValueSizeInBits();
58615 unsigned EltSize = BC.getScalarValueSizeInBits();
58616 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58618 AllSubs &= BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58619 BC.getOperand(0).getValueSizeInBits() == VecSize &&
58620 (BC.getConstantOperandVal(1) * EltSize) == (I * SubSize);
58621 }
58622 return AllConstants || AllSubs;
58623 };
58624 auto CombineSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58625 bool AllConstants = true;
58627 for (SDValue SubOp : SubOps) {
58628 SDValue BC = peekThroughBitcasts(SubOp.getOperand(I));
58629 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58631 Subs.push_back(SubOp.getOperand(I));
58632 }
58633 if (AllConstants)
58634 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58635 return combineConcatVectorOps(DL, VT, Subs, DAG, Subtarget, Depth + 1);
58636 };
58637
58638 unsigned Opcode = Op0.getOpcode();
58639 switch (Opcode) {
58640 case ISD::BITCAST: {
58641 // TODO: Support AVX1/AVX2 bitcasts.
58643 for (SDValue SubOp : Ops)
58644 SubOps.push_back(peekThroughBitcasts(SubOp.getOperand(0)));
58645 EVT InnerVT = SubOps[0].getValueType();
58646 unsigned InnerSizeInBits = InnerVT.getScalarSizeInBits();
58647 if (!IsSplat && InnerVT.isSimple() && InnerVT.isVector() &&
58648 (Subtarget.hasBWI() ||
58649 (EltSizeInBits >= 32 && InnerSizeInBits >= 32)) &&
58650 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58651 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58652 llvm::all_of(SubOps, [InnerVT](SDValue Op) {
58653 return Op.getValueType() == InnerVT;
58654 })) {
58655 MVT ConcatSVT = InnerVT.getScalarType().getSimpleVT();
58656 MVT ConcatVT = MVT::getVectorVT(
58657 ConcatSVT, VT.getSizeInBits() / ConcatSVT.getSizeInBits());
58658 if (SDValue ConcatSrc = combineConcatVectorOps(
58659 DL, ConcatVT, SubOps, DAG, Subtarget, Depth + 1))
58660 return DAG.getBitcast(VT, ConcatSrc);
58661 }
58662 break;
58663 }
58664 case ISD::VECTOR_SHUFFLE: {
58665 // TODO: Generalize NumOps support.
58666 if (!IsSplat && NumOps == 2 &&
58667 ((VT.is256BitVector() &&
58668 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58669 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58670 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58671 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58672 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58673 if (Concat0 || Concat1 ||
58674 (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
58675 Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
58676 Subtarget.hasVBMI())) {
58677 int NumSubElts = Op0.getValueType().getVectorNumElements();
58678 SmallVector<int> NewMask;
58679 for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
58680 M = M >= NumSubElts ? M + NumSubElts : M;
58681 NewMask.push_back(M);
58682 }
58683 for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
58684 if (0 <= M)
58685 M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
58686 NewMask.push_back(M);
58687 }
58688 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
58689 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
58690 return DAG.getVectorShuffle(VT, DL, Concat0, Concat1, NewMask);
58691 }
58692 }
58693 break;
58694 }
58695 case X86ISD::VBROADCAST: {
58696 // TODO: 512-bit VBROADCAST concatenation.
58697 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
58698 return Op.getOperand(0).getValueType().is128BitVector();
58699 })) {
58700 if (VT == MVT::v4f64 || VT == MVT::v4i64)
58701 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
58702 ConcatSubOperand(VT, Ops, 0),
58703 ConcatSubOperand(VT, Ops, 0));
58704 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
58705 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
58706 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
58708 DL, VT, ConcatSubOperand(VT, Ops, 0),
58709 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
58710 }
58711 break;
58712 }
58713 case X86ISD::MOVDDUP:
58714 case X86ISD::MOVSHDUP:
58715 case X86ISD::MOVSLDUP: {
58716 if (!IsSplat && (VT.is256BitVector() ||
58717 (VT.is512BitVector() && Subtarget.useAVX512Regs())))
58718 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
58719 break;
58720 }
58721 case X86ISD::SHUFP: {
58722 if (!IsSplat &&
58723 (VT == MVT::v8f32 ||
58724 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) &&
58725 llvm::all_of(Ops, [Op0](SDValue Op) {
58726 return Op.getOperand(2) == Op0.getOperand(2);
58727 })) {
58728 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58729 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58730 if (Concat0 || Concat1)
58731 return DAG.getNode(Opcode, DL, VT,
58732 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58733 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
58734 Op0.getOperand(2));
58735 }
58736 break;
58737 }
58738 case X86ISD::UNPCKH:
58739 case X86ISD::UNPCKL: {
58740 // TODO: UNPCK should use CombineSubOperand
58741 // Don't concatenate build_vector patterns.
58742 if (!IsSplat &&
58743 ((VT.is256BitVector() &&
58744 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58745 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58746 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58747 none_of(Ops, [](SDValue Op) {
58748 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
58750 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
58752 })) {
58753 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58754 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58755 if (Concat0 || Concat1 ||
58756 (Subtarget.hasInt256() && EltSizeInBits == 64))
58757 return DAG.getNode(Opcode, DL, VT,
58758 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58759 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58760 }
58761 break;
58762 }
58763 case X86ISD::PSHUFHW:
58764 case X86ISD::PSHUFLW:
58765 case X86ISD::PSHUFD:
58766 if (!IsSplat &&
58767 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58768 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58769 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58770 llvm::all_of(Ops, [Op0](SDValue Op) {
58771 return Op.getOperand(1) == Op0.getOperand(1);
58772 })) {
58773 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58774 Op0.getOperand(1));
58775 }
58776 [[fallthrough]];
58777 case X86ISD::VPERMILPI:
58778 if (!IsSplat && EltSizeInBits == 32 &&
58779 (VT.is256BitVector() ||
58780 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58781 all_of(Ops, [&Op0](SDValue Op) {
58782 return Op0.getOperand(1) == Op.getOperand(1);
58783 })) {
58784 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
58785 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
58786 Res =
58787 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
58788 return DAG.getBitcast(VT, Res);
58789 }
58790 break;
58791 case X86ISD::VPERMILPV:
58792 if (!IsSplat && (VT.is256BitVector() ||
58793 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
58794 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58795 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58796 if (Concat0 || Concat1)
58797 return DAG.getNode(Opcode, DL, VT,
58798 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58799 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58800 }
58801 break;
58802 case X86ISD::PSHUFB:
58803 case X86ISD::PSADBW:
58804 case X86ISD::VPMADDUBSW:
58805 case X86ISD::VPMADDWD:
58806 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58807 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
58808 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
58809 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
58810 NumOps * SrcVT.getVectorNumElements());
58811 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
58812 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
58813 if (Concat0 || Concat1)
58814 return DAG.getNode(
58815 Opcode, DL, VT,
58816 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
58817 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
58818 }
58819 break;
58820 case X86ISD::VPERMV:
58821 // TODO: Handle 256-bit and NumOps == 4 cases.
58822 if (!IsSplat && NumOps == 2 &&
58823 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58824 MVT OpVT = Op0.getSimpleValueType();
58825 int NumSrcElts = OpVT.getVectorNumElements();
58826 SmallVector<int, 64> ConcatMask;
58827 for (unsigned i = 0; i != NumOps; ++i) {
58828 SmallVector<int, 64> SubMask;
58830 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58831 break;
58832 for (int M : SubMask) {
58833 if (0 <= M)
58834 M += i * NumSrcElts;
58835 ConcatMask.push_back(M);
58836 }
58837 }
58838 if (ConcatMask.size() == (NumOps * NumSrcElts))
58839 return lowerShuffleWithPERMV(DL, VT, ConcatMask,
58840 ConcatSubOperand(VT, Ops, 1),
58841 DAG.getUNDEF(VT), Subtarget, DAG);
58842 }
58843 break;
58844 case X86ISD::VPERMV3:
58845 // TODO: Handle 256-bit and NumOps == 4 cases.
58846 if (!IsSplat && NumOps == 2 &&
58847 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58848 MVT OpVT = Op0.getSimpleValueType();
58849 int NumSrcElts = OpVT.getVectorNumElements();
58850 SmallVector<int, 64> ConcatMask;
58851 for (unsigned i = 0; i != NumOps; ++i) {
58852 SmallVector<int, 64> SubMask;
58854 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58855 break;
58856 for (int M : SubMask) {
58857 if (0 <= M) {
58858 int Src = M < NumSrcElts ? 0 : 2;
58859 M += M < NumSrcElts ? 0 : NumSrcElts;
58860
58861 // Reference the lowest sub if the upper sub is the same.
58862 if (Ops[0].getOperand(Src) != Ops[i].getOperand(Src))
58863 M += i * NumSrcElts;
58864 }
58865 ConcatMask.push_back(M);
58866 }
58867 }
58868 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
58869 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58870 SDValue Concat1 = CombineSubOperand(VT, Ops, 2);
58871 if (Concat0 || Concat1)
58872 return lowerShuffleWithPERMV(
58873 DL, VT, ConcatMask,
58874 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58875 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 2), Subtarget,
58876 DAG);
58877 }
58878 }
58879 break;
58880 case X86ISD::VPERM2X128: {
58881 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
58882 assert(NumOps == 2 && "Bad concat_vectors operands");
58883 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58884 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58885 // TODO: Handle zero'd subvectors.
58886 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
58887 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
58888 (int)((Imm1 >> 4) & 0x3)};
58889 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
58890 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58891 Ops[0].getOperand(1), DAG, DL);
58892 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58893 Ops[1].getOperand(1), DAG, DL);
58894 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
58895 DAG.getBitcast(ShuffleVT, LHS),
58896 DAG.getBitcast(ShuffleVT, RHS),
58897 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
58898 return DAG.getBitcast(VT, Res);
58899 }
58900 }
58901 break;
58902 }
58903 case X86ISD::SHUF128: {
58904 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
58905 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58906 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58907 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
58908 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
58909 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58910 Ops[0].getOperand(1), DAG, DL);
58911 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58912 Ops[1].getOperand(1), DAG, DL);
58913 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
58914 DAG.getTargetConstant(Imm, DL, MVT::i8));
58915 }
58916 break;
58917 }
58918 case ISD::TRUNCATE:
58919 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
58920 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58921 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
58922 SrcVT == Ops[1].getOperand(0).getValueType() &&
58923 Subtarget.useAVX512Regs() &&
58924 Subtarget.getPreferVectorWidth() >= 512 &&
58925 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
58926 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58927 return DAG.getNode(ISD::TRUNCATE, DL, VT,
58928 ConcatSubOperand(NewSrcVT, Ops, 0));
58929 }
58930 }
58931 break;
58932 case ISD::ANY_EXTEND:
58933 case ISD::SIGN_EXTEND:
58934 case ISD::ZERO_EXTEND:
58935 // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND.
58936 if (!IsSplat && NumOps == 2 &&
58937 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58938 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58939 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58940 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58941 if (SrcVT.isSimple() && SrcVT.is128BitVector() &&
58942 SrcVT == Ops[1].getOperand(0).getValueType()) {
58943 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58944 return DAG.getNode(Opcode, DL, VT,
58945 ConcatSubOperand(NewSrcVT, Ops, 0));
58946 }
58947 }
58948 break;
58952 // TODO: Handle ANY_EXTEND_INREG combos with SIGN/ZERO_EXTEND_INREG.
58953 if (!IsSplat && NumOps == 2 &&
58954 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58955 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58956 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58958 Op0.getOperand(0).getValueType() ==
58959 Ops[0].getOperand(0).getValueType()) {
58960 EVT SrcVT = Op0.getOperand(0).getValueType();
58961 unsigned NumElts = VT.getVectorNumElements();
58962 MVT UnpackSVT =
58963 MVT::getIntegerVT(SrcVT.getScalarSizeInBits() * (NumElts / 2));
58964 MVT UnpackVT =
58965 MVT::getVectorVT(UnpackSVT, 128 / UnpackSVT.getScalarSizeInBits());
58966 SDValue Unpack =
58967 DAG.getNode(X86ISD::UNPCKL, DL, UnpackVT,
58968 DAG.getBitcast(UnpackVT, Ops[0].getOperand(0)),
58969 DAG.getBitcast(UnpackVT, Ops[1].getOperand(0)));
58970 return getEXTEND_VECTOR_INREG(Opcode, DL, VT,
58971 DAG.getBitcast(SrcVT, Unpack), DAG);
58972 }
58973 break;
58974 }
58975 case X86ISD::VSHLI:
58976 case X86ISD::VSRLI:
58977 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
58978 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
58979 llvm::all_of(Ops, [](SDValue Op) {
58980 return Op.getConstantOperandAPInt(1) == 32;
58981 })) {
58982 if (SDValue Res = CombineSubOperand(VT, Ops, 0)) {
58983 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
58984 Res = DAG.getBitcast(MVT::v8i32, Res);
58985 if (Opcode == X86ISD::VSHLI) {
58986 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58987 {8, 0, 8, 2, 8, 4, 8, 6});
58988 } else {
58989 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58990 {1, 8, 3, 8, 5, 8, 7, 8});
58991 }
58992 return DAG.getBitcast(VT, Res);
58993 }
58994 }
58995 [[fallthrough]];
58996 case X86ISD::VSRAI:
58997 case X86ISD::VSHL:
58998 case X86ISD::VSRL:
58999 case X86ISD::VSRA:
59000 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
59001 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
59002 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
59003 llvm::all_of(Ops, [Op0](SDValue Op) {
59004 return Op0.getOperand(1) == Op.getOperand(1);
59005 })) {
59006 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59007 Op0.getOperand(1));
59008 }
59009 break;
59010 case X86ISD::VPERMI:
59011 case X86ISD::VROTLI:
59012 case X86ISD::VROTRI:
59013 if (!IsSplat &&
59014 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
59015 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59016 llvm::all_of(Ops, [Op0](SDValue Op) {
59017 return Op0.getOperand(1) == Op.getOperand(1);
59018 })) {
59019 assert(!(Opcode == X86ISD::VPERMI &&
59020 Op0.getValueType().is128BitVector()) &&
59021 "Illegal 128-bit X86ISD::VPERMI nodes");
59022 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59023 Op0.getOperand(1));
59024 }
59025 break;
59026 case ISD::AND:
59027 case ISD::OR:
59028 case ISD::XOR:
59029 case X86ISD::ANDNP:
59030 // TODO: AVX512 targets should only use CombineSubOperand like AVX1/2.
59031 if (!IsSplat && (VT.is256BitVector() ||
59032 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59033 // Don't concatenate root AVX1 NOT patterns.
59034 // TODO: Allow NOT folding if Concat0 succeeds.
59035 if (Opcode == ISD::XOR && Depth == 0 && !Subtarget.hasInt256() &&
59036 llvm::all_of(Ops, [](SDValue X) {
59037 return ISD::isBuildVectorAllOnes(X.getOperand(1).getNode());
59038 }))
59039 break;
59040 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59041 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59042 if (Concat0 || Concat1 || Subtarget.useAVX512Regs())
59043 return DAG.getNode(Opcode, DL, VT,
59044 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59045 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59046 }
59047 break;
59048 case X86ISD::PCMPEQ:
59049 case X86ISD::PCMPGT:
59050 // TODO: 512-bit PCMPEQ/PCMPGT -> VPCMP+VPMOVM2 handling.
59051 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256()) {
59052 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59053 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59054 if (Concat0 || Concat1)
59055 return DAG.getNode(Opcode, DL, VT,
59056 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59057 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59058 break;
59059 }
59060
59061 if (!IsSplat && VT == MVT::v8i32) {
59062 // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
59063 // TODO: Handle v4f64 as well?
59064 unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
59065 for (unsigned I = 0; I != NumOps; ++I) {
59066 MaxSigBitsLHS =
59067 std::max(MaxSigBitsLHS,
59068 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
59069 MaxSigBitsRHS =
59070 std::max(MaxSigBitsRHS,
59071 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
59072 if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
59073 break;
59074 }
59075
59076 ISD::CondCode ICC =
59077 Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
59078 ISD::CondCode FCC =
59080
59081 MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
59082 MVT FpVT = VT.changeVectorElementType(FpSVT);
59083
59084 if (std::optional<unsigned> CastOpc =
59085 CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
59086 SDValue LHS = CombineSubOperand(VT, Ops, 0);
59087 SDValue RHS = CombineSubOperand(VT, Ops, 1);
59088 LHS = LHS ? LHS : ConcatSubOperand(VT, Ops, 0);
59089 RHS = RHS ? RHS : ConcatSubOperand(VT, Ops, 1);
59090 LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
59091 RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);
59092
59093 bool IsAlwaysSignaling;
59094 unsigned FSETCC =
59095 translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
59096 return DAG.getBitcast(
59097 VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
59098 DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
59099 }
59100 }
59101 break;
59102 case ISD::CTPOP:
59103 case ISD::CTTZ:
59104 case ISD::CTLZ:
59107 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59108 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59109 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
59110 }
59111 break;
59113 // TODO: GF2P8AFFINEQB should use CombineSubOperand.
59114 if (!IsSplat &&
59115 (VT.is256BitVector() ||
59116 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59117 llvm::all_of(Ops, [Op0](SDValue Op) {
59118 return Op0.getOperand(2) == Op.getOperand(2);
59119 })) {
59120 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59121 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
59122 }
59123 break;
59124 case ISD::ADD:
59125 case ISD::SUB:
59126 case ISD::MUL:
59127 // TODO: Add more integer binops?
59128 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59129 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
59130 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
59131 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59132 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59133 if (Concat0 || Concat1 || llvm::all_of(Ops, [](SDValue Op) {
59134 return Op.getOperand(0) == Op.getOperand(1);
59135 }))
59136 return DAG.getNode(Opcode, DL, VT,
59137 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59138 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59139 }
59140 break;
59141 // VADD, VSUB and VMUL can execute on more ports than VINSERT and have
59142 // short latencies, so don't concatenate them here unless doing so won't
59143 // introduce extra VINSERTs.
59144 case ISD::FADD:
59145 case ISD::FSUB:
59146 case ISD::FMUL:
59147 if (!IsSplat && (VT.is256BitVector() ||
59148 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59149 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59150 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59151 if (Concat0 || Concat1)
59152 return DAG.getNode(Opcode, DL, VT,
59153 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59154 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59155 }
59156 break;
59157 // Always prefer to concatenate high latency FDIV instructions.
59158 case ISD::FDIV:
59159 if (!IsSplat && (VT.is256BitVector() ||
59160 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59161 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59162 ConcatSubOperand(VT, Ops, 1));
59163 }
59164 break;
59165 case X86ISD::HADD:
59166 case X86ISD::HSUB:
59167 case X86ISD::FHADD:
59168 case X86ISD::FHSUB:
59169 if (!IsSplat && VT.is256BitVector() &&
59170 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
59171 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59172 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59173 if (Concat0 || Concat1)
59174 return DAG.getNode(Opcode, DL, VT,
59175 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59176 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59177 }
59178 break;
59179 case X86ISD::PACKSS:
59180 case X86ISD::PACKUS:
59181 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59182 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59183 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
59184 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
59185 NumOps * SrcVT.getVectorNumElements());
59186 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
59187 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
59188 if (Concat0 || Concat1)
59189 return DAG.getNode(
59190 Opcode, DL, VT,
59191 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
59192 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
59193 }
59194 break;
59195 case X86ISD::VSHLD:
59196 case X86ISD::VSHRD:
59197 case X86ISD::PALIGNR:
59198 if (!IsSplat &&
59199 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59200 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
59201 llvm::all_of(Ops, [Op0](SDValue Op) {
59202 return Op0.getOperand(2) == Op.getOperand(2);
59203 })) {
59204 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59205 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59206 if (Concat0 || Concat1)
59207 return DAG.getNode(Opcode, DL, VT,
59208 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59209 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59210 Op0.getOperand(2));
59211 }
59212 break;
59213 case X86ISD::BLENDI:
59214 if (VT.is256BitVector() && NumOps == 2 &&
59215 (EltSizeInBits >= 32 ||
59216 (Subtarget.hasInt256() &&
59217 Ops[0].getOperand(2) == Ops[1].getOperand(2)))) {
59218 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59219 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59220 if (Concat0 || Concat1) {
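// e.g. blending two v4f32 ops with immediates 0b0101 and 0b0011 yields a
// v8f32 blend immediate of 0b00110101: the low 4 bits control the low
// subvector and the next 4 bits the high subvector.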
59221 unsigned NumElts = VT.getVectorNumElements();
59222 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59223 Mask.insertBits(getBLENDIBlendMask(Ops[1]), NumElts / 2);
59224 Mask = Mask.zextOrTrunc(8);
59225 return DAG.getNode(Opcode, DL, VT,
59226 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59227 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59228 DAG.getTargetConstant(Mask, DL, MVT::i8));
59229 }
59230 }
59231 // TODO: BWI targets should only use CombineSubOperand.
59232 if (((VT.is256BitVector() && Subtarget.hasVLX()) ||
59233 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59234 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())) {
59235 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59236 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59237 if (Concat0 || Concat1 || Subtarget.useBWIRegs()) {
59238 unsigned NumElts = VT.getVectorNumElements();
59239 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59240 for (unsigned I = 1; I != NumOps; ++I)
59241 Mask.insertBits(getBLENDIBlendMask(Ops[I]), I * (NumElts / NumOps));
59242 unsigned NumMaskBits = NumElts >= 8 ? NumElts : 8;
59243 Mask = Mask.zextOrTrunc(NumMaskBits);
59244 MVT MaskSVT = MVT::getIntegerVT(NumMaskBits);
59245 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumMaskBits);
59246 SDValue Sel =
59247 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
59248 Sel = extractSubVector(Sel, 0, DAG, DL, NumElts);
59249 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
59250 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
59251 return DAG.getSelect(DL, VT, Sel, Concat1, Concat0);
59252 }
59253 }
59254 break;
59255 case ISD::VSELECT:
59256 // TODO: VSELECT should use CombineSubOperand.
59257 if (!IsSplat && Subtarget.hasAVX512() &&
59258 (VT.is256BitVector() ||
59259 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59260 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
59261 EVT SelVT = Ops[0].getOperand(0).getValueType();
59262 if (SelVT.getVectorElementType() == MVT::i1) {
59263 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
59264 NumOps * SelVT.getVectorNumElements());
59265 if (TLI.isTypeLegal(SelVT))
59266 return DAG.getNode(
59267 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59268 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59269 }
59270 }
59271 [[fallthrough]];
59272 case X86ISD::BLENDV:
59273 // TODO: BLENDV should use CombineSubOperand.
59274 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
59275 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
59276 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
59277 EVT SelVT = Ops[0].getOperand(0).getValueType();
59278 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
59279 if (TLI.isTypeLegal(SelVT))
59280 return DAG.getNode(
59281 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59282 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59283 }
59284 break;
59285 }
59286 }
59287
59288 // Fold subvector loads into one.
59289 // If needed, look through bitcasts to get to the load.
59290 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
59291 unsigned Fast;
59292 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
59293 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
59294 *FirstLd->getMemOperand(), &Fast) &&
59295 Fast) {
59296 if (SDValue Ld =
59297 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
59298 return Ld;
59299 }
59300 }
59301
59302 // Attempt to fold target constant loads.
59303 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
59304 SmallVector<APInt> EltBits;
59305 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
59306 for (unsigned I = 0; I != NumOps; ++I) {
59307 APInt OpUndefElts;
59308 SmallVector<APInt> OpEltBits;
59309 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
59310 OpEltBits, /*AllowWholeUndefs*/ true,
59311 /*AllowPartialUndefs*/ false))
59312 break;
59313 EltBits.append(OpEltBits);
59314 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
59315 }
59316 if (EltBits.size() == VT.getVectorNumElements()) {
59317 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
59318 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
59319 SDValue CV = DAG.getConstantPool(C, PVT);
59322 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
59323 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
59325 return Ld;
59326 }
59327 }
59328
59329 // If this simple subvector or scalar/subvector broadcast_load is inserted
59330 // into both halves, use a larger broadcast_load. Update other uses to use
59331 // an extracted subvector.
59332 if (IsSplat &&
59333 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
59334 if (ISD::isNormalLoad(Op0.getNode()) ||
59337 auto *Mem = cast<MemSDNode>(Op0);
59338 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
59341 if (SDValue BcastLd =
59342 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
59343 SDValue BcastSrc =
59344 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
59345 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
59346 return BcastLd;
59347 }
59348 }
59349 }
59350
59351 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
59352 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
59353 Subtarget.useAVX512Regs()) {
59354 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
59355 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
59356 Res = DAG.getBitcast(ShuffleVT, Res);
59357 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
59358 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
59359 return DAG.getBitcast(VT, Res);
59360 }
59361
59362 // We can always convert per-lane vXf64 shuffles into VSHUFPD.
59363 if (!IsSplat &&
59364 ((NumOps == 2 && VT == MVT::v4f64) ||
59365 (NumOps == 4 && VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
59366 all_of(Ops, [](SDValue Op) { return Op.hasOneUse(); })) {
59367 // Collect the individual per-lane v2f64/v4f64 shuffles.
59368 MVT OpVT = Ops[0].getSimpleValueType();
59369 unsigned NumOpElts = OpVT.getVectorNumElements();
59372 if (all_of(seq<int>(NumOps), [&](int I) {
59373 return getTargetShuffleInputs(Ops[I], SrcOps[I], SrcMasks[I], DAG,
59374 Depth + 1) &&
59375 !is128BitLaneCrossingShuffleMask(OpVT, SrcMasks[I]) &&
59376 none_of(SrcMasks[I], isUndefOrZero) &&
59377 SrcMasks[I].size() == NumOpElts &&
59378 all_of(SrcOps[I], [&OpVT](SDValue V) {
59379 return V.getValueType() == OpVT;
59380 });
59381 })) {
59382 // Concatenate the shuffle masks into SHUFPD mask and collect subops.
59383 bool Unary = true;
59384 unsigned SHUFPDMask = 0;
59386 for (unsigned I = 0; I != NumOps; ++I) {
59387 LHS[I] = SrcOps[I][SrcMasks[I][0] / NumOpElts];
59388 RHS[I] = SrcOps[I][SrcMasks[I][1] / NumOpElts];
59389 Unary &= LHS[I] == RHS[I];
59390 for (unsigned J = 0; J != NumOpElts; ++J)
59391 SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
59392 }
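// (SHUFPD uses one immediate bit per f64 result element: even elements come
// from the first operand's 128-bit lane, odd elements from the second, and
// the bit picks the low or high f64 within that lane. The low bit of each
// per-lane mask entry already encodes that choice, so the bits concatenate
// directly into the wider immediate.)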
59393 // Concat SHUFPD LHS/RHS operands - if they match then it will become a
59394 // PERMILPD mask and we can always profitably concatenate them.
59395 SDValue Concat0 =
59396 combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
59397 SDValue Concat1 =
59398 combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
59399 if (Unary || Concat0 || Concat1) {
59400 Concat0 =
59401 Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
59402 Concat1 =
59403 Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
59404 return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
59405 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
59406 }
59407 }
59408 }
59409
59410 return SDValue();
59411}
59412
59415 const X86Subtarget &Subtarget) {
59416 EVT VT = N->getValueType(0);
59417 EVT SrcVT = N->getOperand(0).getValueType();
59418 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59420
59421 if (VT.getVectorElementType() == MVT::i1) {
59422 // Attempt to constant fold.
59423 unsigned SubSizeInBits = SrcVT.getSizeInBits();
59425 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
59427 if (!C) break;
59428 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
59429 if (I == (E - 1)) {
59430 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
59431 if (TLI.isTypeLegal(IntVT))
59432 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
59433 }
59434 }
59435
59436 // Don't do anything else for i1 vectors.
59437 return SDValue();
59438 }
59439
59440 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
59441 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
59442 Subtarget))
59443 return R;
59444 }
59445
59446 return SDValue();
59447}
59448
59451 const X86Subtarget &Subtarget) {
59452 if (DCI.isBeforeLegalizeOps())
59453 return SDValue();
59454
59455 MVT OpVT = N->getSimpleValueType(0);
59456
59457 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
59458
59459 SDLoc dl(N);
59460 SDValue Vec = N->getOperand(0);
59461 SDValue SubVec = N->getOperand(1);
59462
59463 uint64_t IdxVal = N->getConstantOperandVal(2);
59464 MVT SubVecVT = SubVec.getSimpleValueType();
59465 int VecNumElts = OpVT.getVectorNumElements();
59466 int SubVecNumElts = SubVecVT.getVectorNumElements();
59467
59468 if (Vec.isUndef() && SubVec.isUndef())
59469 return DAG.getUNDEF(OpVT);
59470
59471 // Inserting undefs/zeros into zeros/undefs is a zero vector.
59472 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
59473 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
59474 return getZeroVector(OpVT, Subtarget, DAG, dl);
59475
59477 // If we're inserting into a zero vector and then into a larger zero vector,
59478 // just insert into the larger zero vector directly.
59479 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59481 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
59482 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59483 getZeroVector(OpVT, Subtarget, DAG, dl),
59484 SubVec.getOperand(1),
59485 DAG.getVectorIdxConstant(IdxVal + Idx2Val, dl));
59486 }
59487
59488 // If we're inserting into a zero vector and our input was extracted from an
59489 // insert into a zero vector of the same type, and the extraction was at
59490 // least as large as the original insertion, just insert the original
59491 // subvector into a zero vector.
59492 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
59493 isNullConstant(SubVec.getOperand(1)) &&
59495 SDValue Ins = SubVec.getOperand(0);
59496 if (isNullConstant(Ins.getOperand(2)) &&
59497 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
59498 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
59499 SubVecVT.getFixedSizeInBits())
59500 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59501 getZeroVector(OpVT, Subtarget, DAG, dl),
59502 Ins.getOperand(1), N->getOperand(2));
59503 }
59504 }
59505
59506 // Stop here if this is an i1 vector.
59507 if (IsI1Vector)
59508 return SDValue();
59509
59510 // Eliminate an intermediate vector widening:
59511 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
59512 // insert_subvector X, Y, Idx
59513 // TODO: This is a more general version of a DAGCombiner fold, can we move it
59514 // there?
59515 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59516 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
59517 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
59518 SubVec.getOperand(1), N->getOperand(2));
59519
59520 // If this is an insert of an extract, combine to a shuffle. Don't do this
59521 // if the insert or extract can be represented with a subregister operation.
59522 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59523 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
59524 (IdxVal != 0 ||
59525 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
59526 SDValue ExtSrc = SubVec.getOperand(0);
59527 int ExtIdxVal = SubVec.getConstantOperandVal(1);
59528 // Create a shuffle mask matching the extraction and insertion.
59529 SmallVector<int, 64> Mask(VecNumElts);
59530 std::iota(Mask.begin(), Mask.end(), 0);
59531 std::iota(Mask.begin() + IdxVal, Mask.begin() + IdxVal + SubVecNumElts,
59532 ExtIdxVal + VecNumElts);
59533 if (ExtIdxVal != 0)
59534 return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask);
59535 // See if we can use a blend instead of extract/insert pair.
59536 SmallVector<int, 64> BlendMask(VecNumElts);
59537 std::iota(BlendMask.begin(), BlendMask.end(), 0);
59538 std::iota(BlendMask.begin() + IdxVal,
59539 BlendMask.begin() + IdxVal + SubVecNumElts, VecNumElts + IdxVal);
59540 if (isShuffleEquivalent(Mask, BlendMask, Vec, ExtSrc) &&
59541 VecNumElts == (2 * SubVecNumElts)) {
59542 assert((IdxVal % SubVecNumElts) == 0 && "Unaligned subvector insertion");
59543 if (OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
59544 SDValue Blend = DAG.getNode(
59545 X86ISD::BLENDI, dl, MVT::v8f32, DAG.getBitcast(MVT::v8f32, Vec),
59546 DAG.getBitcast(MVT::v8f32, ExtSrc),
59547 DAG.getTargetConstant(IdxVal == 0 ? 0x0F : 0xF0, dl, MVT::i8));
59548 return DAG.getBitcast(OpVT, Blend);
59549 } else if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) {
59550 MVT ShufVT = OpVT.isInteger() ? MVT::v8i64 : MVT::v8f64;
59551 SDValue Lo = DAG.getBitcast(ShufVT, IdxVal == 0 ? ExtSrc : Vec);
59552 SDValue Hi = DAG.getBitcast(ShufVT, IdxVal == 0 ? Vec : ExtSrc);
59553 SDValue Shuffle =
59554 DAG.getNode(X86ISD::SHUF128, dl, ShufVT, Lo, Hi,
59555 getV4X86ShuffleImm8ForMask({0, 1, 2, 3}, dl, DAG));
59556 return DAG.getBitcast(OpVT, Shuffle);
59557 }
59558 }
59559 }
59560
59561 // Match concat_vector style patterns.
59562 SmallVector<SDValue, 2> SubVectorOps;
59563 if (collectConcatOps(N, SubVectorOps, DAG)) {
59564 if (SDValue Fold =
59565 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, Subtarget))
59566 return Fold;
59567
59568 // If we're inserting all zeros into the upper half, change this to
59569 // a concat with zero. We will match this to a move
59570 // with implicit upper bit zeroing during isel.
59571 // We do this here because we don't want combineConcatVectorOps to
59572 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
59573 if (SubVectorOps.size() == 2 &&
59574 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
59575 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59576 getZeroVector(OpVT, Subtarget, DAG, dl),
59577 SubVectorOps[0], DAG.getVectorIdxConstant(0, dl));
59578
59579 // Attempt to recursively combine to a shuffle.
59580 if (all_of(SubVectorOps, [](SDValue SubOp) {
59582 })) {
59583 SDValue Op(N, 0);
59584 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59585 return Res;
59586 }
59587 }
59588
59589 // If this is a broadcast insert into an upper undef, use a larger broadcast.
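// e.g. insert_subvector undef, (v4f32 X86ISD::VBROADCAST x), 4
//        --> v8f32 X86ISD::VBROADCAST x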
59590 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
59591 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
59592
59593 // If this is a broadcast load inserted into an upper undef, use a larger
59594 // broadcast load.
59595 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
59596 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
59597 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
59598 return getBROADCAST_LOAD(X86ISD::VBROADCAST_LOAD, dl, OpVT,
59599 MemIntr->getMemoryVT(), MemIntr, 0, DAG);
59600 }
59601
59602 // If we're splatting the lower half subvector of a full vector load into the
59603 // upper half, attempt to create a subvector broadcast.
59604 if ((int)IdxVal == (VecNumElts / 2) &&
59605 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
59606 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
59607 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
59608 if (VecLd && SubLd &&
59609 DAG.areNonVolatileConsecutiveLoads(
59610 SubLd, VecLd, SubVec.getValueSizeInBits() / 8, 0)) {
59611 SDValue BcastLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT,
59612 SubVecVT, SubLd, 0, DAG);
59613 SDValue NewSubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT,
59614 BcastLd, DAG.getVectorIdxConstant(0, dl));
59615 DCI.CombineTo(SubLd, NewSubVec, BcastLd.getValue(1));
59616 return BcastLd;
59617 }
59618 }
59619
59620 // Attempt to constant fold (if we're not widening).
59621 if (!Vec.isUndef() && !ISD::isBuildVectorAllZeros(Vec.getNode())) {
59622 unsigned EltSizeInBits = OpVT.getScalarSizeInBits();
59623 APInt VecUndefElts, SubUndefElts;
59624 SmallVector<APInt, 16> VecEltBits, SubEltBits;
59625 if (getTargetConstantBitsFromNode(Vec, EltSizeInBits, VecUndefElts,
59626 VecEltBits) &&
59627 getTargetConstantBitsFromNode(SubVec, EltSizeInBits, SubUndefElts,
59628 SubEltBits)) {
59629 VecUndefElts.insertBits(SubUndefElts, IdxVal);
59630 llvm::copy(SubEltBits, VecEltBits.begin() + IdxVal);
59631 return getConstVector(VecEltBits, VecUndefElts, OpVT, DAG, dl);
59632 }
59633 }
59634
59635 // Attempt to recursively combine to a shuffle.
59638 SDValue Op(N, 0);
59639 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59640 return Res;
59641 }
59642
59643 // Match insertion of subvector load that perfectly aliases a base load.
59644 if ((IdxVal % SubVecNumElts) == 0 && ISD::isNormalLoad(Vec.getNode()) &&
59645 ISD::isNormalLoad(SubVec.getNode()) &&
59646 DAG.areNonVolatileConsecutiveLoads(
59647 cast<LoadSDNode>(SubVec), cast<LoadSDNode>(Vec),
59648 SubVec.getValueSizeInBits() / 8, IdxVal / SubVecNumElts))
59649 return Vec;
59650
59651 return SDValue();
59652}
59653
59654/// If we are extracting a subvector of a vector select and the select condition
59655/// is composed of concatenated vectors, try to narrow the select width. This
59656/// is a common pattern for AVX1 integer code because 256-bit selects may be
59657/// legal, but there is almost no integer math/logic available for 256-bit.
59658/// This function should only be called with legal types (otherwise, the calls
59659 /// to get simple value types will assert).
59660 static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
59661 SelectionDAG &DAG) {
59662 SDValue Sel = Ext->getOperand(0);
59663 if (Sel.getOpcode() != ISD::VSELECT ||
59664 !isFreeToSplitVector(Sel.getOperand(0), DAG))
59665 return SDValue();
59666
59667 // Note: We assume simple value types because this should only be called with
59668 // legal operations/types.
59669 // TODO: This can be extended to handle extraction to 256-bits.
59670 MVT VT = Ext->getSimpleValueType(0);
59671 if (!VT.is128BitVector())
59672 return SDValue();
59673
59674 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
59675 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
59676 return SDValue();
59677
59678 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
59679 MVT SelVT = Sel.getSimpleValueType();
59680 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
59681 "Unexpected vector type with legal operations");
59682
59683 unsigned SelElts = SelVT.getVectorNumElements();
59684 unsigned CastedElts = WideVT.getVectorNumElements();
59685 unsigned ExtIdx = Ext->getConstantOperandVal(1);
59686 if (SelElts % CastedElts == 0) {
59687 // The select has the same or more (narrower) elements than the extract
59688 // operand. The extraction index gets scaled by that factor.
59689 ExtIdx *= (SelElts / CastedElts);
59690 } else if (CastedElts % SelElts == 0) {
59691 // The select has fewer (wider) elements than the extract operand. Make sure
59692 // that the extraction index can be divided evenly.
59693 unsigned IndexDivisor = CastedElts / SelElts;
59694 if (ExtIdx % IndexDivisor != 0)
59695 return SDValue();
59696 ExtIdx /= IndexDivisor;
59697 } else {
59698 llvm_unreachable("Element count of simple vector types are not divisible?");
59699 }
59700
59701 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
59702 unsigned NarrowElts = SelElts / NarrowingFactor;
59703 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
59704 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
59705 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
59706 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
59707 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
59708 return DAG.getBitcast(VT, NarrowSel);
59709}
59710
59711 static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59712 TargetLowering::DAGCombinerInfo &DCI,
59713 const X86Subtarget &Subtarget) {
59714 if (!N->getValueType(0).isSimple())
59715 return SDValue();
59716
59717 MVT VT = N->getSimpleValueType(0);
59718 SDValue InVec = N->getOperand(0);
59719 unsigned IdxVal = N->getConstantOperandVal(1);
59720 EVT InVecVT = InVec.getValueType();
59721 unsigned SizeInBits = VT.getSizeInBits();
59722 unsigned InSizeInBits = InVecVT.getSizeInBits();
59723 unsigned NumSubElts = VT.getVectorNumElements();
59724 unsigned NumInElts = InVecVT.getVectorNumElements();
59725 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59726 SDLoc DL(N);
59727
59728 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
59729 // eventually get combined/lowered into ANDNP) with a concatenated operand,
59730 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
59731 // We let generic combining take over from there to simplify the
59732 // insert/extract and 'not'.
59733 // This pattern emerges during AVX1 legalization. We handle it before lowering
59734 // to avoid complications like splitting constant vector loads.
59735 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) &&
59736 InSizeInBits == 256 && InVec.getOpcode() == ISD::AND) {
59737 auto isConcatenatedNot = [](SDValue V) {
59738 V = peekThroughBitcasts(V);
59739 if (!isBitwiseNot(V))
59740 return false;
59741 SDValue NotOp = V->getOperand(0);
59742 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
59743 };
59744 if (isConcatenatedNot(InVec.getOperand(0)) ||
59745 isConcatenatedNot(InVec.getOperand(1))) {
59746 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
59747 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
59748 splitVectorIntBinary(InVec, DAG, DL),
59749 N->getOperand(1));
59750 }
59751 }
59752
59753 if (DCI.isBeforeLegalizeOps())
59754 return SDValue();
59755
59756 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
59757 return V;
59758
59759 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
59760 return getZeroVector(VT, Subtarget, DAG, DL);
59761
59762 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
59763 if (VT.getScalarType() == MVT::i1)
59764 return DAG.getConstant(1, DL, VT);
59765 return getOnesVector(VT, DAG, DL);
59766 }
59767
59768 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
59769 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
59770
59771 // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1),C2) --> EXTRACT_SUBVECTOR(V,C1+C2)
59772 if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59773 InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
59774 TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
59775 unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
59776 return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
59777 }
59778
59779 // EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2)
59780 // --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
59781 // iff SUB is entirely contained in the extraction.
59782 if (VT.getVectorElementType() != MVT::i1 && TLI.isTypeLegal(VT) &&
59783 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse()) {
59784 SDValue Src = InVec.getOperand(0);
59785 SDValue Sub = InVec.getOperand(1);
59786 EVT SubVT = Sub.getValueType();
59787 uint64_t InsIdx = InVec.getConstantOperandVal(2);
59788 if (IdxVal <= InsIdx &&
59789 (IdxVal + NumSubElts) >= (InsIdx + SubVT.getVectorNumElements())) {
59790 SDValue NewSrc = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
59791 DAG.getVectorIdxConstant(IdxVal, DL));
59792 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewSrc, Sub,
59793 DAG.getVectorIdxConstant(InsIdx - IdxVal, DL));
59794 }
59795 }
59796
59797 // If we're extracting an upper subvector, see if we'd get the same elements if
59798 // we extracted the lowest subvector instead, which should allow
59799 // SimplifyDemandedVectorElts to do more simplifications.
59800 if (IdxVal != 0) {
59801 bool AllEquiv = all_of(seq<unsigned>(NumSubElts), [&](unsigned I) {
59802 return IsElementEquivalent(NumInElts, InVec, InVec, I, I + IdxVal);
59803 });
59804 if (AllEquiv)
59805 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
59806 }
59807
59808 // Check if we're extracting a whole broadcasted subvector.
59809 if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59810 auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
59811 EVT MemVT = MemIntr->getMemoryVT();
59812 if (MemVT == VT) {
59813 // If this is the only use, we can replace with a regular load (this may
59814 // have been missed by SimplifyDemandedVectorElts due to extra uses of the
59815 // memory chain).
59816 if (InVec.hasOneUse()) {
59817 SDValue Ld =
59818 DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(),
59819 MemIntr->getMemOperand());
59820 DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1));
59821 return Ld;
59822 }
59823 }
59824 }
59825
59826 // Attempt to extract from the source of a shuffle vector.
59827 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
59828 SmallVector<int, 32> ShuffleMask;
59829 SmallVector<int, 32> ScaledMask;
59830 SmallVector<SDValue, 2> ShuffleInputs;
59831 unsigned NumSubVecs = InSizeInBits / SizeInBits;
59832 // Decode the shuffle mask and scale it so it's shuffling subvectors.
59833 if (getTargetShuffleInputs(InVec, ShuffleInputs, ShuffleMask, DAG) &&
59834 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
59835 unsigned SubVecIdx = IdxVal / NumSubElts;
59836 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
59837 return DAG.getUNDEF(VT);
59838 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
59839 return getZeroVector(VT, Subtarget, DAG, DL);
59840 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
59841 if (Src.getValueSizeInBits() == InSizeInBits) {
59842 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
59843 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
59844 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
59845 DL, SizeInBits);
59846 }
59847 }
59848 }
59849
59850 auto IsExtractFree = [](SDValue V) {
59851 if (V.hasOneUse()) {
59852 V = peekThroughOneUseBitcasts(V);
59853 if (V.getOpcode() == ISD::LOAD)
59854 return true;
59855 }
59856 V = peekThroughBitcasts(V);
59857 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
59858 return true;
59860 return true;
59861 return V.isUndef();
59862 };
59863
59864 // If we're extracting the lowest subvector and we're the only user,
59865 // we may be able to perform this with a smaller vector width.
59866 unsigned InOpcode = InVec.getOpcode();
59867 if (InVec.hasOneUse()) {
59868 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
59869 // v2f64 CVTDQ2PD(v4i32).
59870 if (InOpcode == ISD::SINT_TO_FP &&
59871 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59872 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
59873 }
59874 // v2f64 CVTUDQ2PD(v4i32).
59875 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
59876 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59877 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
59878 }
59879 // v2f64 CVTPS2PD(v4f32).
59880 if (InOpcode == ISD::FP_EXTEND &&
59881 InVec.getOperand(0).getValueType() == MVT::v4f32) {
59882 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
59883 }
59884 }
59885 // v4i32 CVTPS2DQ(v4f32) / CVTPS2UDQ(v4f32).
59886 // v4f32 CVTDQ2PS(v4i32) / CVTUDQ2PS(v4i32).
59887 if ((InOpcode == ISD::FP_TO_SINT || InOpcode == ISD::SINT_TO_FP ||
59888 ((InOpcode == ISD::FP_TO_UINT || InOpcode == ISD::UINT_TO_FP) &&
59889 Subtarget.hasVLX())) &&
59890 (VT == MVT::v4i32 || VT == MVT::v4f32)) {
59891 SDValue Src = InVec.getOperand(0);
59892 if (Src.getValueType().getScalarSizeInBits() == 32)
59893 return DAG.getNode(InOpcode, DL, VT,
59894 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
59895 }
59896 if (IdxVal == 0 &&
59897 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
59898 (SizeInBits == 128 || SizeInBits == 256) &&
59899 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
59900 SDValue Ext = InVec.getOperand(0);
59901 if (Ext.getValueSizeInBits() > SizeInBits)
59902 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
59903 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
59904 return DAG.getNode(ExtOp, DL, VT, Ext);
59905 }
59906 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
59907 InVec.getOperand(0).getValueType().is256BitVector() &&
59908 InVec.getOperand(1).getValueType().is256BitVector() &&
59909 InVec.getOperand(2).getValueType().is256BitVector()) {
59910 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
59911 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
59912 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
59913 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
59914 }
59915 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
59916 (SizeInBits == 128 || SizeInBits == 256)) {
59917 SDValue InVecSrc = InVec.getOperand(0);
59918 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
59919 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
59920 return DAG.getNode(InOpcode, DL, VT, Ext);
59921 }
59922
59923 if (SizeInBits == 128 || SizeInBits == 256) {
59924 switch (InOpcode) {
59925 case X86ISD::MOVDDUP:
59926 return DAG.getNode(
59927 InOpcode, DL, VT,
59928 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits));
59929 case X86ISD::PSHUFD:
59930 case X86ISD::VPERMILPI:
59931 if (InVec.getOperand(0).hasOneUse()) {
59932 uint64_t M = InVec.getConstantOperandVal(1) & 255;
59933 M = VT.getScalarSizeInBits() < 64 ? M : (M >> IdxVal);
59934 return DAG.getNode(InOpcode, DL, VT,
59935 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59936 DL, SizeInBits),
59937 DAG.getTargetConstant(M, DL, MVT::i8));
59938 }
59939 break;
59940 case X86ISD::PCMPEQ:
59941 case X86ISD::PCMPGT:
59942 case X86ISD::UNPCKH:
59943 case X86ISD::UNPCKL:
59944 if (IsExtractFree(InVec.getOperand(0)) ||
59945 IsExtractFree(InVec.getOperand(1)))
59946 return DAG.getNode(InOpcode, DL, VT,
59947 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59948 DL, SizeInBits),
59949 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59950 DL, SizeInBits));
59951 break;
59952 case X86ISD::CMPP:
59953 if (IsExtractFree(InVec.getOperand(0)) ||
59954 IsExtractFree(InVec.getOperand(1)))
59955 return DAG.getNode(InOpcode, DL, VT,
59956 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59957 DL, SizeInBits),
59958 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59959 DL, SizeInBits),
59960 InVec.getOperand(2));
59961 break;
59962 case X86ISD::BLENDI:
59963 if (IsExtractFree(InVec.getOperand(0)) ||
59964 IsExtractFree(InVec.getOperand(1))) {
59965 uint64_t M = InVec.getConstantOperandVal(2) & 255;
59966 M = VT.getScalarType() == MVT::i16 ? M : (M >> IdxVal);
59967 return DAG.getNode(InOpcode, DL, VT,
59968 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59969 DL, SizeInBits),
59970 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59971 DL, SizeInBits),
59972 DAG.getTargetConstant(M, DL, MVT::i8));
59973 }
59974 break;
59975 case X86ISD::VPERMV:
59976 if (IdxVal != 0) {
59977 SDValue Mask = InVec.getOperand(0);
59978 SDValue Src = InVec.getOperand(1);
59979 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59980 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59981 DL, InSizeInBits);
59982 SDValue Shuffle = DAG.getNode(InOpcode, DL, InVecVT, Mask, Src);
59983 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59984 }
59985 break;
59986 case X86ISD::VPERMV3:
59987 if (IdxVal != 0) {
59988 SDValue Src0 = InVec.getOperand(0);
59989 SDValue Mask = InVec.getOperand(1);
59990 SDValue Src1 = InVec.getOperand(2);
59991 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59992 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59993 DL, InSizeInBits);
59994 SDValue Shuffle =
59995 DAG.getNode(InOpcode, DL, InVecVT, Src0, Mask, Src1);
59996 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59997 }
59998 break;
59999 }
60000 }
60001 }
60002
60003 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
60004 // as this is very likely to fold into a shuffle/truncation.
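// e.g. extract_subvector (v4i64 X86ISD::VSRLI X, 32), 2
//        --> v2i64 X86ISD::VSRLI (extract_subvector X, 2), 32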
60005 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
60006 InVecVT.getScalarSizeInBits() == 64 &&
60007 InVec.getConstantOperandAPInt(1) == 32) {
60008 SDValue Ext =
60009 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
60010 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
60011 }
60012
60013 return SDValue();
60014}
60015
60016 static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG,
60017 const X86Subtarget &Subtarget) {
60018 using namespace SDPatternMatch;
60019 EVT VT = N->getValueType(0);
60020 SDValue Src = N->getOperand(0);
60021 SDLoc DL(N);
60022
60023 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
60024 // This occurs frequently in our masked scalar intrinsic code and our
60025 // floating point select lowering with AVX512.
60026 // TODO: SimplifyDemandedBits instead?
60027 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
60028 isOneConstant(Src.getOperand(1)))
60029 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
60030
60031 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
60032 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60033 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
60034 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
60035 isNullConstant(Src.getOperand(1)))
60036 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
60037 Src.getOperand(1));
60038
60039 // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
60040 // TODO: Move to DAGCombine/SimplifyDemandedBits?
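// e.g. a v2i64 scalar_to_vector of (i64 zext i32 X) can instead be built as a
// v4i32 scalar_to_vector of X wrapped in VZEXT_MOVL to zero the upper lanes,
// avoiding a 64-bit GPR->XMM transfer.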
60041 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
60042 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
60043 if (Op.getValueType() != MVT::i64)
60044 return SDValue();
60045 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
60046 if (Op.getOpcode() == Opc &&
60047 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
60048 return Op.getOperand(0);
60049 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
60050 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
60051 if (Ld->getExtensionType() == Ext &&
60052 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
60053 return Op;
60054 if (IsZeroExt) {
60055 KnownBits Known = DAG.computeKnownBits(Op);
60056 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
60057 return Op;
60058 }
60059 return SDValue();
60060 };
60061
60062 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
60063 return DAG.getBitcast(
60064 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60065 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
60066
60067 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
60068 return DAG.getBitcast(
60069 VT,
60070 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
60071 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60072 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
60073 }
60074
60075 if (Src.getOpcode() == ISD::BITCAST) {
60076 SDValue SrcOp = Src.getOperand(0);
60077 // Combine (v4i32 (scalar_to_vector (i32 (bitcast (float))))) to MOVD.
60078 if (VT == MVT::v4i32 && SrcOp.getValueType() == MVT::f32)
60079 return DAG.getBitcast(
60080 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, SrcOp));
60081 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (double))))) to MOVQ.
60082 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::f64)
60083 return DAG.getBitcast(
60084 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, SrcOp));
60085 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (mmx))))) to MOVQ2DQ.
60086 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::x86mmx)
60087 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp);
60088 }
60089
60090 if (VT == MVT::v4i32) {
60091 SDValue HalfSrc;
60092 // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16))))))
60093 // to remove XMM->GPR->XMM moves.
60094 if (sd_match(Src, m_AnyExt(m_BitCast(
60095 m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc))))))
60096 return DAG.getBitcast(
60097 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc));
60098 }
60099
60100 // See if we're broadcasting the scalar value, in which case just reuse that.
60101 // Ensure the same SDValue from the SDNode use is being used.
60102 if (VT.getScalarType() == Src.getValueType())
60103 for (SDNode *User : Src->users())
60104 if (User->getOpcode() == X86ISD::VBROADCAST &&
60105 Src == User->getOperand(0)) {
60106 unsigned SizeInBits = VT.getFixedSizeInBits();
60107 unsigned BroadcastSizeInBits =
60108 User->getValueSizeInBits(0).getFixedValue();
60109 if (BroadcastSizeInBits == SizeInBits)
60110 return SDValue(User, 0);
60111 if (BroadcastSizeInBits > SizeInBits)
60112 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
60113 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
60114 // coverage.
60115 }
60116
60117 // Check for cases where we've ended up with a scalarized shift, typically
60118 // during type legalization.
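// e.g. scalar_to_vector (i64 (shl X, C)) is rebuilt as a VSHLI of
// scalar_to_vector(X), keeping the shift in the SIMD domain.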
60119 switch (Src.getOpcode()) {
60120 case ISD::SHL:
60121 case ISD::SRL:
60122 case ISD::SRA:
60123 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
60124 if (supportedVectorShiftWithImm(VT, Subtarget, Src.getOpcode()) &&
60125 Src.hasOneUse()) {
60126 SDValue SrcVec =
60127 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60128 unsigned Opc = getTargetVShiftUniformOpcode(Src.getOpcode(), false);
60129 return getTargetVShiftByConstNode(Opc, DL, VT.getSimpleVT(), SrcVec,
60130 Amt->getZExtValue(), DAG);
60131 }
60132 }
60133 break;
60134 case ISD::FSHL:
60135 case ISD::FSHR:
60136 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(2))) {
60137 if (supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL) &&
60138 Src.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60139 Src.getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60140 Src.hasOneUse()) {
60141 uint64_t AmtVal =
60142 Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits());
60143 SDValue SrcVec0 =
60144 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60145 SDValue SrcVec1 =
60146 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(1));
60147 return DAG.getNode(Src.getOpcode(), DL, VT, SrcVec0, SrcVec1,
60148 DAG.getConstant(AmtVal, DL, VT));
60149 }
60150 }
60151 break;
60152 }
60153
60154 return SDValue();
60155}
60156
60157 // Simplify PMULDQ and PMULUDQ operations.
60158 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
60159 TargetLowering::DAGCombinerInfo &DCI,
60160 const X86Subtarget &Subtarget) {
60161 SDValue LHS = N->getOperand(0);
60162 SDValue RHS = N->getOperand(1);
60163
60164 // Canonicalize constant to RHS.
60165 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
60166 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
60167 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
60168
60169 // Multiply by zero.
60170 // Don't return RHS as it may contain UNDEFs.
60171 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
60172 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
60173
60174 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
60175 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60176 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
60177 return SDValue(N, 0);
60178
60179 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
60180 // convert it to any_extend_invec, due to the LegalOperations check, do the
60181 // conversion directly to a vector shuffle manually. This exposes combine
60182 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
60183 // combineX86ShufflesRecursively on SSE4.1 targets.
60184 // FIXME: This is basically a hack around several other issues related to
60185 // ANY_EXTEND_VECTOR_INREG.
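// The {0,-1,1,-1} shuffle below places the two source i32 elements into the
// even i32 lanes of a v4i32; PMULDQ/PMULUDQ only read those even lanes, so the
// undef odd lanes are harmless.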
60186 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
60187 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60188 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60189 LHS.getOperand(0).getValueType() == MVT::v4i32) {
60190 SDLoc dl(N);
60191 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
60192 LHS.getOperand(0), { 0, -1, 1, -1 });
60193 LHS = DAG.getBitcast(MVT::v2i64, LHS);
60194 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60195 }
60196 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
60197 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60198 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60199 RHS.getOperand(0).getValueType() == MVT::v4i32) {
60200 SDLoc dl(N);
60201 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
60202 RHS.getOperand(0), { 0, -1, 1, -1 });
60203 RHS = DAG.getBitcast(MVT::v2i64, RHS);
60204 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60205 }
60206
60207 return SDValue();
60208}
60209
60210 // Simplify VPMADDUBSW/VPMADDWD operations.
60211 static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
60212 TargetLowering::DAGCombinerInfo &DCI) {
60213 MVT VT = N->getSimpleValueType(0);
60214 SDValue LHS = N->getOperand(0);
60215 SDValue RHS = N->getOperand(1);
60216 unsigned Opc = N->getOpcode();
60217 bool IsPMADDWD = Opc == X86ISD::VPMADDWD;
60218 assert((Opc == X86ISD::VPMADDWD || Opc == X86ISD::VPMADDUBSW) &&
60219 "Unexpected PMADD opcode");
60220
60221 // Multiply by zero.
60222 // Don't return LHS/RHS as it may contain UNDEFs.
60223 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
60224 ISD::isBuildVectorAllZeros(RHS.getNode()))
60225 return DAG.getConstant(0, SDLoc(N), VT);
60226
60227 // Constant folding.
60228 APInt LHSUndefs, RHSUndefs;
60229 SmallVector<APInt> LHSBits, RHSBits;
60230 unsigned SrcEltBits = LHS.getScalarValueSizeInBits();
60231 unsigned DstEltBits = VT.getScalarSizeInBits();
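// Each result element folds one pair of adjacent source elements: VPMADDWD
// sign-extends both operands and adds with wrap, while VPMADDUBSW
// zero-extends the LHS bytes, sign-extends the RHS bytes and saturates the add.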
60232 if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) &&
60233 getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) {
60234 SmallVector<APInt> Result;
60235 for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) {
60236 APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1];
60237 APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1];
60238 LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits);
60239 LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits);
60240 APInt Lo = LHSLo * RHSLo.sext(DstEltBits);
60241 APInt Hi = LHSHi * RHSHi.sext(DstEltBits);
60242 APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi);
60243 Result.push_back(Res);
60244 }
60245 return getConstVector(Result, VT, DAG, SDLoc(N));
60246 }
60247
60248 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60249 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60250 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60251 return SDValue(N, 0);
60252
60253 return SDValue();
60254}
60255
60256 // Simplify VPMADD52L/VPMADD52H operations.
60257 static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG,
60258 TargetLowering::DAGCombinerInfo &DCI) {
60259 MVT VT = N->getSimpleValueType(0);
60260
60261 bool AddLow = N->getOpcode() == X86ISD::VPMADD52L;
60262 SDValue Op0 = N->getOperand(0);
60263 SDValue Op1 = N->getOperand(1);
60264 SDValue Op2 = N->getOperand(2);
60265 SDLoc DL(N);
60266
60267 APInt C0, C1;
60268 bool HasC0 = X86::isConstantSplat(Op0, C0),
60269 HasC1 = X86::isConstantSplat(Op1, C1);
60270
60271 // lo/hi(C * X) + Z --> lo/hi(X * C) + Z
60272 if (HasC0 && !HasC1)
60273 return DAG.getNode(N->getOpcode(), DL, VT, Op1, Op0, Op2);
60274
60275 // lo(X * 1) + Z --> lo(X) + Z iff X == lo(X)
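// (At least 12 leading zero bits in each 64-bit element mean X fits in 52
// bits, so X == lo(X) and the low 52-bit product of X * 1 is X itself.)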
60276 if (AddLow && HasC1 && C1.trunc(52).isOne()) {
60277 KnownBits KnownOp0 = DAG.computeKnownBits(Op0);
60278 if (KnownOp0.countMinLeadingZeros() >= 12)
60279 return DAG.getNode(ISD::ADD, DL, VT, Op0, Op2);
60280 }
60281
60282 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60283 unsigned NumEltBits = VT.getScalarSizeInBits();
60284 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
60285 DCI))
60286 return SDValue(N, 0);
60287
60288 return SDValue();
60289}
60290
60291 static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
60292 TargetLowering::DAGCombinerInfo &DCI,
60293 const X86Subtarget &Subtarget) {
60294 EVT VT = N->getValueType(0);
60295 SDValue In = N->getOperand(0);
60296 unsigned Opcode = N->getOpcode();
60297 unsigned InOpcode = In.getOpcode();
60298 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60299 SDLoc DL(N);
60300
60301 // Try to merge vector loads and extend_inreg to an extload.
60302 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
60303 In.hasOneUse()) {
60304 auto *Ld = cast<LoadSDNode>(In);
60305 if (Ld->isSimple()) {
60306 MVT SVT = In.getSimpleValueType().getVectorElementType();
60307 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
60308 ? ISD::SEXTLOAD
60309 : ISD::ZEXTLOAD;
60310 EVT MemVT = VT.changeVectorElementType(SVT);
60311 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
60312 SDValue Load = DAG.getExtLoad(
60313 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
60314 MemVT, Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
60315 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
60316 return Load;
60317 }
60318 }
60319 }
60320
60321 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
60322 if (Opcode == InOpcode)
60323 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
60324
60325 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
60326 // -> EXTEND_VECTOR_INREG(X).
60327 // TODO: Handle non-zero subvector indices.
60328 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
60329 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
60330 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
60331 In.getValueSizeInBits())
60332 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
60333
60334 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
60335 // TODO: Move to DAGCombine?
60336 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
60337 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
60338 In.getValueSizeInBits() == VT.getSizeInBits()) {
60339 unsigned NumElts = VT.getVectorNumElements();
60340 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
60341 EVT EltVT = In.getOperand(0).getValueType();
60342 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
60343 for (unsigned I = 0; I != NumElts; ++I)
60344 Elts[I * Scale] = In.getOperand(I);
60345 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
60346 }
60347
60348 // Attempt to combine as a shuffle on SSE41+ targets.
60349 if (Subtarget.hasSSE41()) {
60350 SDValue Op(N, 0);
60351 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
60352 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
60353 return Res;
60354 }
60355
60356 return SDValue();
60357}
60358
60359 static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
60360 TargetLowering::DAGCombinerInfo &DCI) {
60361 EVT VT = N->getValueType(0);
60362 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60363 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
60364 return DAG.getConstant(0, SDLoc(N), VT);
60365
60366 // Fold kshiftr(extract_subvector(X,C1),C2)
60367 // --> extract_subvector(kshiftr(X,C1+C2),0)
60368 // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
60369 if (N->getOpcode() == X86ISD::KSHIFTR) {
60370 SDLoc DL(N);
60371 if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
60372 N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
60373 SDValue Src = N->getOperand(0).getOperand(0);
60374 uint64_t Amt = N->getConstantOperandVal(1) +
60375 N->getOperand(0).getConstantOperandVal(1);
60376 EVT SrcVT = Src.getValueType();
60377 if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
60378 SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
60379 DAG.getTargetConstant(Amt, DL, MVT::i8));
60380 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
60381 DAG.getVectorIdxConstant(0, DL));
60382 }
60383 }
60384 }
60385
60386 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60387 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60388 return SDValue(N, 0);
60389
60390 return SDValue();
60391}
60392
60393// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
60394 // Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produces
60395 // extra instructions between the conversions due to going to scalar and back.
60396 static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
60397 const X86Subtarget &Subtarget) {
60398 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
60399 return SDValue();
60400
60401 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
60402 return SDValue();
60403
60404 if (N->getValueType(0) != MVT::f32 ||
60405 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
60406 return SDValue();
60407
60408 SDLoc dl(N);
60409 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
60410 N->getOperand(0).getOperand(0));
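// Rounding immediate 4 makes CVTPS2PH use the current MXCSR rounding mode.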
60411 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
60412 DAG.getTargetConstant(4, dl, MVT::i32));
60413 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
60414 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
60415 DAG.getVectorIdxConstant(0, dl));
60416}
60417
60418 static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
60419 TargetLowering::DAGCombinerInfo &DCI,
60420 const X86Subtarget &Subtarget) {
60421 EVT VT = N->getValueType(0);
60422 bool IsStrict = N->isStrictFPOpcode();
60423 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60424 EVT SrcVT = Src.getValueType();
60425
60426 SDLoc dl(N);
60427 if (SrcVT.getScalarType() == MVT::bf16) {
60428 if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
60429 !IsStrict && Src.getOperand(0).getValueType() == VT)
60430 return Src.getOperand(0);
60431
60432 if (!SrcVT.isVector())
60433 return SDValue();
60434
60435 assert(!IsStrict && "Strict FP doesn't support BF16");
60436 if (VT.getVectorElementType() == MVT::f64) {
60437 EVT TmpVT = VT.changeVectorElementType(MVT::f32);
60438 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
60439 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
60440 }
60441 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
60442 EVT NVT = SrcVT.changeVectorElementType(MVT::i32);
60443 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
60444 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
60445 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
60446 return DAG.getBitcast(VT, Src);
60447 }
60448
60449 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60450 return SDValue();
60451
60452 if (Subtarget.hasFP16())
60453 return SDValue();
60454
60455 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
60456 return SDValue();
60457
60458 if (VT.getVectorElementType() != MVT::f32 &&
60459 VT.getVectorElementType() != MVT::f64)
60460 return SDValue();
60461
60462 unsigned NumElts = VT.getVectorNumElements();
60463 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60464 return SDValue();
60465
60466 // Convert the input to vXi16.
60467 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
60468 Src = DAG.getBitcast(IntVT, Src);
60469
60470 // Widen to at least 8 input elements.
60471 if (NumElts < 8) {
60472 unsigned NumConcats = 8 / NumElts;
60473 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
60474 : DAG.getConstant(0, dl, IntVT);
60475 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
60476 Ops[0] = Src;
60477 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
60478 }
60479
60480 // Destination is vXf32 with at least 4 elements.
60481 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
60482 std::max(4U, NumElts));
60483 SDValue Cvt, Chain;
60484 if (IsStrict) {
60485 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
60486 {N->getOperand(0), Src});
60487 Chain = Cvt.getValue(1);
60488 } else {
60489 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
60490 }
60491
60492 if (NumElts < 4) {
60493 assert(NumElts == 2 && "Unexpected size");
60494 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
60495 DAG.getVectorIdxConstant(0, dl));
60496 }
60497
60498 if (IsStrict) {
60499 // Extend to the original VT if necessary.
60500 if (Cvt.getValueType() != VT) {
60501 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
60502 {Chain, Cvt});
60503 Chain = Cvt.getValue(1);
60504 }
60505 return DAG.getMergeValues({Cvt, Chain}, dl);
60506 }
60507
60508 // Extend to the original VT if necessary.
60509 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
60510}
60511
60512 // Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract.
60513 static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
60514 TargetLowering::DAGCombinerInfo &DCI) {
60515 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
60516 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
60517 "Unknown broadcast load type");
60518
60519 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
60520 SDValue Ptr = MemIntrin->getBasePtr();
60521 SDValue Chain = MemIntrin->getChain();
60522 EVT VT = N->getSimpleValueType(0);
60523 EVT MemVT = MemIntrin->getMemoryVT();
60524
60525 // Look at other users of our base pointer and try to find a wider broadcast.
60526 // The input chain and the size of the memory VT must match.
60527 for (SDNode *User : Ptr->users())
60528 if (User != N && User->getOpcode() == N->getOpcode() &&
60529 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
60530 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
60531 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
60532 MemVT.getSizeInBits() &&
60533 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
60535 MemIntrin->isSimple() && "Illegal broadcast load type");
60537 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
60538 VT.getSizeInBits());
60539 Extract = DAG.getBitcast(VT, Extract);
60540 Extract = DCI.CombineTo(N, Extract, SDValue(User, 1));
60541 return Extract;
60542 }
60543
60544 return SDValue();
60545}
60546
60547 static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
60548 const X86Subtarget &Subtarget) {
60549 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60550 return SDValue();
60551
60552 bool IsStrict = N->isStrictFPOpcode();
60553 EVT VT = N->getValueType(0);
60554 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60555 EVT SrcVT = Src.getValueType();
60556
60557 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
60558 SrcVT.getVectorElementType() != MVT::f32)
60559 return SDValue();
60560
60561 SDLoc dl(N);
60562
60563 SDValue Cvt, Chain;
60564 unsigned NumElts = VT.getVectorNumElements();
60565 if (Subtarget.hasFP16()) {
60566 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64),
60567 // v4f32 (xint_to_fp v4i64))))
60568 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64),
60569 // v8f16 (CVTXI2P v4i64)))
60570 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS &&
60571 Src.getNumOperands() == 2) {
60572 SDValue Cvt0, Cvt1;
60573 SDValue Op0 = Src.getOperand(0);
60574 SDValue Op1 = Src.getOperand(1);
60575 bool IsOp0Strict = Op0->isStrictFPOpcode();
60576 if (Op0.getOpcode() != Op1.getOpcode() ||
60577 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
60578 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
60579 return SDValue();
60580 }
60581 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
60582 if (IsStrict) {
60583 assert(IsOp0Strict && "Op0 must be strict node");
60584 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
60585 ? X86ISD::STRICT_CVTSI2P
60586 : X86ISD::STRICT_CVTUI2P;
60587 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60588 {Op0.getOperand(0), Op0.getOperand(1)});
60589 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60590 {Op1.getOperand(0), Op1.getOperand(1)});
60591 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60592 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
60593 }
60594 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
60595 : X86ISD::CVTUI2P;
60596 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
60597 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
60598 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60599 }
60600 return SDValue();
60601 }
60602
60603 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60604 return SDValue();
60605
60606 // Widen to at least 4 input elements.
60607 if (NumElts < 4)
60608 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
60609 DAG.getConstantFP(0.0, dl, SrcVT));
60610
60611 // Destination is v8i16 with at least 8 elements.
60612 EVT CvtVT =
60613 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
60614 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
60615 if (IsStrict) {
60616 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
60617 {N->getOperand(0), Src, Rnd});
60618 Chain = Cvt.getValue(1);
60619 } else {
60620 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
60621 }
60622
60623 // Extract down to real number of elements.
60624 if (NumElts < 8) {
60625 EVT IntVT = VT.changeVectorElementTypeToInteger();
60626 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
60627 DAG.getVectorIdxConstant(0, dl));
60628 }
60629
60630 Cvt = DAG.getBitcast(VT, Cvt);
60631
60632 if (IsStrict)
60633 return DAG.getMergeValues({Cvt, Chain}, dl);
60634
60635 return Cvt;
60636}
60637
60638 static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
60639 SDValue Src = N->getOperand(0);
60640
60641 // Turn MOVDQ2Q+simple_load into an mmx load.
60642 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
60643 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
60644
60645 if (LN->isSimple()) {
60646 SDValue NewLd =
60647 DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), LN->getBasePtr(),
60648 LN->getPointerInfo(), LN->getBaseAlign(),
60649 LN->getMemOperand()->getFlags());
60650 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
60651 return NewLd;
60652 }
60653 }
60654
60655 return SDValue();
60656}
60657
60658 static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
60659 TargetLowering::DAGCombinerInfo &DCI) {
60660 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
60661 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60662 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
60663 return SDValue(N, 0);
60664
60665 return SDValue();
60666}
60667
60668// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
60669// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
60670 // use x86mmx instead.
60671 static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) {
60672 SDLoc dl(N);
60673
60674 bool MadeChange = false, CastReturnVal = false;
60675 SmallVector<SDValue> Args;
60676 for (const SDValue &Arg : N->op_values()) {
60677 if (Arg.getValueType() == MVT::v1i64) {
60678 MadeChange = true;
60679 Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
60680 } else
60681 Args.push_back(Arg);
60682 }
60683 SDVTList VTs = N->getVTList();
60684 SDVTList NewVTs = VTs;
60685 if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
60686 SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
60687 NewVTArr[0] = MVT::x86mmx;
60688 NewVTs = DAG.getVTList(NewVTArr);
60689 MadeChange = true;
60690 CastReturnVal = true;
60691 }
60692
60693 if (MadeChange) {
60694 SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
60695 if (CastReturnVal) {
60696 SmallVector<SDValue> Returns;
60697 for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
60698 Returns.push_back(Result.getValue(i));
60699 Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
60700 return DAG.getMergeValues(Returns, dl);
60701 }
60702 return Result;
60703 }
60704 return SDValue();
60705}
60706 static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG,
60707 TargetLowering::DAGCombinerInfo &DCI) {
60708 if (!DCI.isBeforeLegalize())
60709 return SDValue();
60710
60711 unsigned IntNo = N->getConstantOperandVal(0);
60712 const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
60713
60714 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60715 return FixupMMXIntrinsicTypes(N, DAG);
60716
60717 return SDValue();
60718}
60719
60720 static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
60721 TargetLowering::DAGCombinerInfo &DCI) {
60722 if (!DCI.isBeforeLegalize())
60723 return SDValue();
60724
60725 unsigned IntNo = N->getConstantOperandVal(1);
60726 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60727
60728 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60729 return FixupMMXIntrinsicTypes(N, DAG);
60730
60731 return SDValue();
60732}
60733
60734 static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
60735 TargetLowering::DAGCombinerInfo &DCI) {
60736 if (!DCI.isBeforeLegalize())
60737 return SDValue();
60738
60739 unsigned IntNo = N->getConstantOperandVal(1);
60740 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60741
60742 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60743 return FixupMMXIntrinsicTypes(N, DAG);
60744
60745 return SDValue();
60746}
60747
60748 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
60749 DAGCombinerInfo &DCI) const {
60750 SelectionDAG &DAG = DCI.DAG;
60751 switch (N->getOpcode()) {
60752 // clang-format off
60753 default: break;
60754 case ISD::SCALAR_TO_VECTOR:
60755 return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
60756 case ISD::EXTRACT_VECTOR_ELT:
60757 case X86ISD::PEXTRW:
60758 case X86ISD::PEXTRB:
60759 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
60760 case ISD::CONCAT_VECTORS:
60761 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
60762 case ISD::INSERT_SUBVECTOR:
60763 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
60764 case ISD::EXTRACT_SUBVECTOR:
60765 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
60766 case ISD::VSELECT:
60767 case ISD::SELECT:
60768 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
60769 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
60770 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
60771 case X86ISD::CMP: return combineCMP(N, DAG, DCI, Subtarget);
60772 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
60773 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
60774 case X86ISD::ADD:
60775 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
60776 case X86ISD::CLOAD:
60777 case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
60778 case X86ISD::SBB: return combineSBB(N, DAG);
60779 case X86ISD::ADC: return combineADC(N, DAG, DCI);
60780 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
60781 case ISD::SHL: return combineShiftLeft(N, DAG, Subtarget);
60782 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
60783 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
60784 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
60785 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
60786 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
60787 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
60788 case ISD::AVGCEILS:
60789 case ISD::AVGCEILU:
60790 case ISD::AVGFLOORS:
60791 case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget);
60792 case X86ISD::BEXTR:
60793 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
60794 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
60795 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
60796 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
60797 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
60798 case X86ISD::VEXTRACT_STORE:
60799 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
60800 case ISD::SINT_TO_FP:
60801 case ISD::STRICT_SINT_TO_FP:
60802 return combineSIntToFP(N, DAG, DCI, Subtarget);
60803 case ISD::UINT_TO_FP:
60804 case ISD::STRICT_UINT_TO_FP:
60805 return combineUIntToFP(N, DAG, Subtarget);
60806 case ISD::FP_TO_SINT: return combineFPToSInt(N, DAG, Subtarget);
60807 case ISD::LRINT:
60808 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
60809 case ISD::FADD:
60810 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
60811 case X86ISD::VFCMULC:
60812 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
60813 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
60814 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
60815 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
60816 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
60817 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
60818 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
60819 case X86ISD::FXOR:
60820 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
60821 case X86ISD::FMIN:
60822 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
60823 case ISD::FMINNUM:
60824 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
60825 case X86ISD::CVTSI2P:
60826 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
60827 case X86ISD::CVTP2SI:
60828 case X86ISD::CVTP2UI:
60829 case X86ISD::STRICT_CVTTP2SI:
60830 case X86ISD::CVTTP2SI:
60831 case X86ISD::STRICT_CVTTP2UI:
60832 case X86ISD::CVTTP2UI:
60833 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
60834 case X86ISD::STRICT_CVTPH2PS:
60835 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
60836 case X86ISD::BT: return combineBT(N, DAG, DCI);
60837 case ISD::ANY_EXTEND:
60838 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
60839 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
60840 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
60841 case ISD::ANY_EXTEND_VECTOR_INREG:
60842 case ISD::SIGN_EXTEND_VECTOR_INREG:
60843 case ISD::ZERO_EXTEND_VECTOR_INREG:
60844 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
60845 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
60846 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
60847 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
60848 case X86ISD::PACKSS:
60849 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
60850 case X86ISD::HADD:
60851 case X86ISD::HSUB:
60852 case X86ISD::FHADD:
60853 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
60854 case X86ISD::VSHL:
60855 case X86ISD::VSRA:
60856 case X86ISD::VSRL:
60857 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
60858 case X86ISD::VSHLI:
60859 case X86ISD::VSRAI:
60860 case X86ISD::VSRLI:
60861 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
60862 case ISD::INSERT_VECTOR_ELT:
60863 case X86ISD::PINSRB:
60864 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
60865 case X86ISD::SHUFP: // Handle all target specific shuffles
60866 case X86ISD::INSERTPS:
60867 case X86ISD::EXTRQI:
60868 case X86ISD::INSERTQI:
60869 case X86ISD::VALIGN:
60870 case X86ISD::PALIGNR:
60871 case X86ISD::VSHLDQ:
60872 case X86ISD::VSRLDQ:
60873 case X86ISD::BLENDI:
60874 case X86ISD::UNPCKH:
60875 case X86ISD::UNPCKL:
60876 case X86ISD::MOVHLPS:
60877 case X86ISD::MOVLHPS:
60878 case X86ISD::PSHUFB:
60879 case X86ISD::PSHUFD:
60880 case X86ISD::PSHUFHW:
60881 case X86ISD::PSHUFLW:
60882 case X86ISD::MOVSHDUP:
60883 case X86ISD::MOVSLDUP:
60884 case X86ISD::MOVDDUP:
60885 case X86ISD::MOVSS:
60886 case X86ISD::MOVSD:
60887 case X86ISD::MOVSH:
60888 case X86ISD::VBROADCAST:
60889 case X86ISD::VPPERM:
60890 case X86ISD::VPERMI:
60891 case X86ISD::VPERMV:
60892 case X86ISD::VPERMV3:
60893 case X86ISD::VPERMIL2:
60894 case X86ISD::VPERMILPI:
60895 case X86ISD::VPERMILPV:
60896 case X86ISD::VPERM2X128:
60897 case X86ISD::SHUF128:
60898 case X86ISD::VZEXT_MOVL:
60899 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
60900 case X86ISD::FMADD_RND:
60901 case X86ISD::FMSUB:
60902 case X86ISD::STRICT_FMSUB:
60903 case X86ISD::FMSUB_RND:
60904 case X86ISD::FNMADD:
60905 case X86ISD::STRICT_FNMADD:
60906 case X86ISD::FNMADD_RND:
60907 case X86ISD::FNMSUB:
60908 case X86ISD::STRICT_FNMSUB:
60909 case X86ISD::FNMSUB_RND:
60910 case ISD::FMA:
60911 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
60912 case X86ISD::FMADDSUB_RND:
60913 case X86ISD::FMSUBADD_RND:
60914 case X86ISD::FMADDSUB:
60915 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
60916 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
60917 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
60918 case X86ISD::MGATHER:
60919 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
60920 case ISD::MGATHER:
60921 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
60922 case X86ISD::PCMPEQ:
60923 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
60924 case X86ISD::PMULDQ:
60925 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
60926 case X86ISD::VPMADDUBSW:
60927 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
60928 case X86ISD::VPMADD52L:
60929 case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI);
60930 case X86ISD::KSHIFTL:
60931 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
60932 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
60933 case ISD::STRICT_FP_EXTEND:
60934 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget);
60935 case ISD::STRICT_FP_ROUND:
60936 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
60937 case X86ISD::VBROADCAST_LOAD:
60938 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
60939 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
60940 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
60941 case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
60942 case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
60943 case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
60944 case ISD::FP_TO_SINT_SAT:
60945 case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
60946 // clang-format on
60947 }
60948
60949 return SDValue();
60950}
60951
60953 return Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64);
60954}
60955
60956// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
60958 EVT ExtVT) const {
60959 return Subtarget.hasAVX512() || !VT.isVector();
60960}
60961
60962 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
60963 if (!isTypeLegal(VT))
60964 return false;
60965
60966 // There are no vXi8 shifts.
60967 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
60968 return false;
60969
60970 // TODO: Almost no 8-bit ops are desirable because they have no actual
60971 // size/speed advantages vs. 32-bit ops, but they do have a major
60972 // potential disadvantage by causing partial register stalls.
60973 //
60974 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
60975 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
60976 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
60977 // check for a constant operand to the multiply.
60978 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
60979 return false;
60980
60981 // i16 instruction encodings are longer and some i16 instructions are slow,
60982 // so those are not desirable.
60983 if (VT == MVT::i16) {
60984 switch (Opc) {
60985 default:
60986 break;
60987 case ISD::LOAD:
60988 case ISD::SIGN_EXTEND:
60989 case ISD::ZERO_EXTEND:
60990 case ISD::ANY_EXTEND:
60991 case ISD::MUL:
60992 return false;
60993 case ISD::SHL:
60994 case ISD::SRA:
60995 case ISD::SRL:
60996 case ISD::SUB:
60997 case ISD::ADD:
60998 case ISD::AND:
60999 case ISD::OR:
61000 case ISD::XOR:
61001 // NDD instructions never have the "partial register write" issue because the
61002 // destination register's upper bits [63:OSIZE] are zeroed even when
61003 // OSIZE=8/16.
61004 return Subtarget.hasNDD();
61005 }
61006 }
61007
61008 // Any legal type not explicitly accounted for above here is desirable.
61009 return true;
61010}
61011
61012 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
61013 SDValue Value, SDValue Addr,
61014 int JTI,
61015 SelectionDAG &DAG) const {
61016 const Module *M = DAG.getMachineFunction().getFunction().getParent();
61017 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
61018 if (IsCFProtectionSupported) {
61019 // In case control-flow branch protection is enabled, we need to add a
61020 // notrack prefix to the indirect branch.
61021 // In order to do that we create an NT_BRIND SDNode.
61022 // Upon ISEL, the pattern will convert it to a jmp with the NoTrack prefix.
61023 SDValue Chain = Value;
61024 // Jump table debug info is only needed if CodeView is enabled.
61025 if (DAG.getTarget().getTargetTriple().isOSBinFormatCOFF())
61026 Chain = DAG.getJumpTableDebugInfo(JTI, Chain, dl);
61027 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Chain, Addr);
61028 }
61029
61030 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
61031}
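// Illustrative note (not part of the upstream source): building with
// -fcf-protection=branch sets the "cf-protection-branch" module flag checked
// above, so a dense switch lowered through a jump table is dispatched with a
// NOTRACK-prefixed indirect jump (e.g. "notrack jmpq *%rax"); the CET
// indirect-branch tracker then does not require an ENDBR64 landing pad at
// every case label reached from the read-only jump table.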
61032
61035 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
61037 EVT VT = LogicOp->getValueType(0);
61038 EVT OpVT = SETCC0->getOperand(0).getValueType();
61039 if (!VT.isInteger())
61041
61042 if (VT.isVector())
61047
61048 // Don't use `NotAnd` as even though `not` is generally shorter code size than
61049 // `add`, `add` can lower to LEA which can save moves / spills. Any case where
61050 // `NotAnd` applies, `AddAnd` does as well.
61051 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
61052 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
61054}
61055
61057 EVT VT = Op.getValueType();
61058 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
61059 isa<ConstantSDNode>(Op.getOperand(1));
61060
61061 // i16 is legal, but undesirable since i16 instruction encodings are longer
61062 // and some i16 instructions are slow.
61063 // 8-bit multiply-by-constant can usually be expanded to something cheaper
61064 // using LEA and/or other ALU ops.
61065 if (VT != MVT::i16 && !Is8BitMulByConstant)
61066 return false;
61067
61068 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
61069 if (!Op.hasOneUse())
61070 return false;
61071 SDNode *User = *Op->user_begin();
61073 return false;
61074 auto *Ld = cast<LoadSDNode>(Load);
61075 auto *St = cast<StoreSDNode>(User);
61076 return Ld->getBasePtr() == St->getBasePtr();
61077 };
61078
61079 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
61080 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
61081 return false;
61082 if (!Op.hasOneUse())
61083 return false;
61084 SDNode *User = *Op->user_begin();
61085 if (User->getOpcode() != ISD::ATOMIC_STORE)
61086 return false;
61087 auto *Ld = cast<AtomicSDNode>(Load);
61088 auto *St = cast<AtomicSDNode>(User);
61089 return Ld->getBasePtr() == St->getBasePtr();
61090 };
61091
61092 auto IsFoldableZext = [](SDValue Op) {
61093 if (!Op.hasOneUse())
61094 return false;
61095 SDNode *User = *Op->user_begin();
61096 EVT VT = User->getValueType(0);
61097 return (User->getOpcode() == ISD::ZERO_EXTEND &&
61098 (VT == MVT::i32 || VT == MVT::i64));
61099 };
61100
61101 bool Commute = false;
61102 switch (Op.getOpcode()) {
61103 default: return false;
61104 case ISD::SIGN_EXTEND:
61105 case ISD::ZERO_EXTEND:
61106 case ISD::ANY_EXTEND:
61107 break;
61108 case ISD::SHL:
61109 case ISD::SRA:
61110 case ISD::SRL: {
61111 SDValue N0 = Op.getOperand(0);
61112 // Look out for (store (shl (load), x)).
61113 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
61114 return false;
61115 break;
61116 }
61117 case ISD::MUL:
61118 // When ZU is enabled, we prefer to not promote for MUL by a constant
61119 // when there is an opportunity to fold a zext with imulzu.
61120 if (Subtarget.hasZU() && IsFoldableZext(Op) &&
61121 (isa<ConstantSDNode>(Op.getOperand(0)) ||
61122 isa<ConstantSDNode>(Op.getOperand(1))))
61123 return false;
61124 [[fallthrough]];
61125 case ISD::ADD:
61126 case ISD::AND:
61127 case ISD::OR:
61128 case ISD::XOR:
61129 Commute = true;
61130 [[fallthrough]];
61131 case ISD::SUB: {
61132 SDValue N0 = Op.getOperand(0);
61133 SDValue N1 = Op.getOperand(1);
61134 // Avoid disabling potential load folding opportunities.
61135 if (X86::mayFoldLoad(N1, Subtarget) &&
61136 (!Commute || !isa<ConstantSDNode>(N0) ||
61137 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
61138 return false;
61139 if (X86::mayFoldLoad(N0, Subtarget) &&
61140 ((Commute && !isa<ConstantSDNode>(N1)) ||
61141 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
61142 return false;
61143 if (IsFoldableAtomicRMW(N0, Op) ||
61144 (Commute && IsFoldableAtomicRMW(N1, Op)))
61145 return false;
61146 }
61147 }
61148
61149 PVT = MVT::i32;
61150 return true;
61151}
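// Illustrative sketch (not part of the upstream source): the IsFoldableRMW
// check above is what keeps memory-destination forms intact, e.g. for
//
//   void bump(unsigned short *p) { *p += 5; }
//
// the i16 add can fold into a single "addw $5, (%rdi)"; promoting it to i32
// would instead force a separate load, 32-bit add and 16-bit store.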
61152
61153//===----------------------------------------------------------------------===//
61154// X86 Inline Assembly Support
61155//===----------------------------------------------------------------------===//
61156
61159 .Case("{@cca}", X86::COND_A)
61160 .Case("{@ccae}", X86::COND_AE)
61161 .Case("{@ccb}", X86::COND_B)
61162 .Case("{@ccbe}", X86::COND_BE)
61163 .Case("{@ccc}", X86::COND_B)
61164 .Case("{@cce}", X86::COND_E)
61165 .Case("{@ccz}", X86::COND_E)
61166 .Case("{@ccg}", X86::COND_G)
61167 .Case("{@ccge}", X86::COND_GE)
61168 .Case("{@ccl}", X86::COND_L)
61169 .Case("{@ccle}", X86::COND_LE)
61170 .Case("{@ccna}", X86::COND_BE)
61171 .Case("{@ccnae}", X86::COND_B)
61172 .Case("{@ccnb}", X86::COND_AE)
61173 .Case("{@ccnbe}", X86::COND_A)
61174 .Case("{@ccnc}", X86::COND_AE)
61175 .Case("{@ccne}", X86::COND_NE)
61176 .Case("{@ccnz}", X86::COND_NE)
61177 .Case("{@ccng}", X86::COND_LE)
61178 .Case("{@ccnge}", X86::COND_L)
61179 .Case("{@ccnl}", X86::COND_GE)
61180 .Case("{@ccnle}", X86::COND_G)
61181 .Case("{@ccno}", X86::COND_NO)
61182 .Case("{@ccnp}", X86::COND_NP)
61183 .Case("{@ccns}", X86::COND_NS)
61184 .Case("{@cco}", X86::COND_O)
61185 .Case("{@ccp}", X86::COND_P)
61186 .Case("{@ccs}", X86::COND_S)
61188 return Cond;
61189}
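// Illustrative sketch (not part of the upstream source): the "{@cc<cond>}"
// strings above back GCC/Clang flag-output operands, which hand a condition
// in EFLAGS straight back to C/C++ code, e.g.:
//
//   static bool is_below(unsigned a, unsigned b) {
//     bool below;
//     asm("cmpl %2, %1" : "=@ccb"(below) : "r"(a), "r"(b)); // CF==1 -> a < b
//     return below;
//   }
//
// The flag output is then materialized with SETcc and zero-extended by the
// setcc-based "@cc" lowering further below.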
61190
61191/// Given a constraint letter, return the type of constraint for this target.
61194 if (Constraint.size() == 1) {
61195 switch (Constraint[0]) {
61196 case 'R':
61197 case 'q':
61198 case 'Q':
61199 case 'f':
61200 case 't':
61201 case 'u':
61202 case 'y':
61203 case 'x':
61204 case 'v':
61205 case 'l':
61206 case 'k': // AVX512 masking registers.
61207 return C_RegisterClass;
61208 case 'a':
61209 case 'b':
61210 case 'c':
61211 case 'd':
61212 case 'S':
61213 case 'D':
61214 case 'A':
61215 return C_Register;
61216 case 'I':
61217 case 'J':
61218 case 'K':
61219 case 'N':
61220 case 'G':
61221 case 'L':
61222 case 'M':
61223 return C_Immediate;
61224 case 'C':
61225 case 'e':
61226 case 'Z':
61227 return C_Other;
61228 default:
61229 break;
61230 }
61231 }
61232 else if (Constraint.size() == 2) {
61233 switch (Constraint[0]) {
61234 default:
61235 break;
61236 case 'W':
61237 if (Constraint[1] != 's')
61238 break;
61239 return C_Other;
61240 case 'Y':
61241 switch (Constraint[1]) {
61242 default:
61243 break;
61244 case 'z':
61245 return C_Register;
61246 case 'i':
61247 case 'm':
61248 case 'k':
61249 case 't':
61250 case '2':
61251 return C_RegisterClass;
61252 }
61253 break;
61254 case 'j':
61255 switch (Constraint[1]) {
61256 default:
61257 break;
61258 case 'r':
61259 case 'R':
61260 return C_RegisterClass;
61261 }
61262 }
61263 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61264 return C_Other;
61265 return TargetLowering::getConstraintType(Constraint);
61266}
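// Illustrative sketch (not part of the upstream source) of the constraint
// kinds classified above, as they appear in user inline asm:
//
//   static unsigned char inb(unsigned short port) {
//     unsigned char v;
//     asm volatile("inb %1, %0"
//                  : "=a"(v)       // 'a': the fixed AL/AX/EAX register
//                  : "Nd"(port));  // 'N': u8 immediate, 'd': the DX register
//     return v;
//   }
//
//   static unsigned rotl3(unsigned v) {
//     asm("roll %1, %0" : "+r"(v) : "I"(3)); // 'I': immediate in [0, 31]
//     return v;
//   }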
61267
61268/// Examine constraint type and operand type and determine a weight value.
61269/// This object must already have been set up with the operand type
61270/// and the current alternative constraint selected.
61273 AsmOperandInfo &Info, const char *Constraint) const {
61275 Value *CallOperandVal = Info.CallOperandVal;
61276 // If we don't have a value, we can't do a match,
61277 // but allow it at the lowest weight.
61278 if (!CallOperandVal)
61279 return CW_Default;
61280 Type *Ty = CallOperandVal->getType();
61281 // Look at the constraint type.
61282 switch (*Constraint) {
61283 default:
61285 [[fallthrough]];
61286 case 'R':
61287 case 'q':
61288 case 'Q':
61289 case 'a':
61290 case 'b':
61291 case 'c':
61292 case 'd':
61293 case 'S':
61294 case 'D':
61295 case 'A':
61296 if (CallOperandVal->getType()->isIntegerTy())
61297 Wt = CW_SpecificReg;
61298 break;
61299 case 'f':
61300 case 't':
61301 case 'u':
61302 if (Ty->isFloatingPointTy())
61303 Wt = CW_SpecificReg;
61304 break;
61305 case 'y':
61306 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61307 Wt = CW_SpecificReg;
61308 break;
61309 case 'Y':
61310 if (StringRef(Constraint).size() != 2)
61311 break;
61312 switch (Constraint[1]) {
61313 default:
61314 return CW_Invalid;
61315 // XMM0
61316 case 'z':
61317 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61318 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
61319 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
61320 return CW_SpecificReg;
61321 return CW_Invalid;
61322 // Conditional OpMask regs (AVX512)
61323 case 'k':
61324 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61325 return CW_Register;
61326 return CW_Invalid;
61327 // Any MMX reg
61328 case 'm':
61329 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61330 return CW_SpecificReg;
61331 return CW_Invalid;
61332 // Any SSE reg when ISA >= SSE2, same as 'x'
61333 case 'i':
61334 case 't':
61335 case '2':
61336 if (!Subtarget.hasSSE2())
61337 return CW_Invalid;
61338 break;
61339 }
61340 break;
61341 case 'j':
61342 if (StringRef(Constraint).size() != 2)
61343 break;
61344 switch (Constraint[1]) {
61345 default:
61346 return CW_Invalid;
61347 case 'r':
61348 case 'R':
61349 if (CallOperandVal->getType()->isIntegerTy())
61350 Wt = CW_SpecificReg;
61351 break;
61352 }
61353 break;
61354 case 'v':
61355 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
61356 Wt = CW_Register;
61357 [[fallthrough]];
61358 case 'x':
61359 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61360 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
61361 Wt = CW_Register;
61362 break;
61363 case 'k':
61364 // Enable conditional vector operations using %k<#> registers.
61365 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61366 Wt = CW_Register;
61367 break;
61368 case 'I':
61369 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
61370 if (C->getZExtValue() <= 31)
61371 Wt = CW_Constant;
61372 break;
61373 case 'J':
61374 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61375 if (C->getZExtValue() <= 63)
61376 Wt = CW_Constant;
61377 break;
61378 case 'K':
61379 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61380 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
61381 Wt = CW_Constant;
61382 break;
61383 case 'L':
61384 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61385 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
61386 Wt = CW_Constant;
61387 break;
61388 case 'M':
61389 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61390 if (C->getZExtValue() <= 3)
61391 Wt = CW_Constant;
61392 break;
61393 case 'N':
61394 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61395 if (C->getZExtValue() <= 0xff)
61396 Wt = CW_Constant;
61397 break;
61398 case 'G':
61399 case 'C':
61400 if (isa<ConstantFP>(CallOperandVal))
61401 Wt = CW_Constant;
61402 break;
61403 case 'e':
61404 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61405 if ((C->getSExtValue() >= -0x80000000LL) &&
61406 (C->getSExtValue() <= 0x7fffffffLL))
61407 Wt = CW_Constant;
61408 break;
61409 case 'Z':
61410 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61411 if (C->getZExtValue() <= 0xffffffff)
61412 Wt = CW_Constant;
61413 break;
61414 }
61415 return Wt;
61416}
61417
61418/// Try to replace an X constraint, which matches anything, with another that
61419/// has more specific requirements based on the type of the corresponding
61420/// operand.
61422LowerXConstraint(EVT ConstraintVT) const {
61423 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
61424 // 'f' like normal targets.
61425 if (ConstraintVT.isFloatingPoint()) {
61426 if (Subtarget.hasSSE1())
61427 return "x";
61428 }
61429
61430 return TargetLowering::LowerXConstraint(ConstraintVT);
61431}
61432
61433// Lower @cc targets via setcc.
61435 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
61436 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
61437 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
61438 if (Cond == X86::COND_INVALID)
61439 return SDValue();
61440 // Check that return type is valid.
61441 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
61442 OpInfo.ConstraintVT.getSizeInBits() < 8)
61443 report_fatal_error("Glue output operand is of invalid type");
61444
61445 // Get EFLAGS register. Only update chain when copyfrom is glued.
61446 if (Glue.getNode()) {
61447 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
61448 Chain = Glue.getValue(1);
61449 } else
61450 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
61451 // Extract CC code.
61452 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
61453 // Extend to 32-bits
61454 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
61455
61456 return Result;
61457}
61458
61459/// Lower the specified operand into the Ops vector.
61460/// If it is invalid, don't add anything to Ops.
61462 StringRef Constraint,
61463 std::vector<SDValue> &Ops,
61464 SelectionDAG &DAG) const {
61465 SDValue Result;
61466 char ConstraintLetter = Constraint[0];
61467 switch (ConstraintLetter) {
61468 default: break;
61469 case 'I':
61470 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61471 if (C->getZExtValue() <= 31) {
61472 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61473 Op.getValueType());
61474 break;
61475 }
61476 }
61477 return;
61478 case 'J':
61479 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61480 if (C->getZExtValue() <= 63) {
61481 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61482 Op.getValueType());
61483 break;
61484 }
61485 }
61486 return;
61487 case 'K':
61488 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61489 if (isInt<8>(C->getSExtValue())) {
61490 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61491 Op.getValueType());
61492 break;
61493 }
61494 }
61495 return;
61496 case 'L':
61497 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61498 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
61499 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
61500 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
61501 Op.getValueType());
61502 break;
61503 }
61504 }
61505 return;
61506 case 'M':
61507 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61508 if (C->getZExtValue() <= 3) {
61509 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61510 Op.getValueType());
61511 break;
61512 }
61513 }
61514 return;
61515 case 'N':
61516 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61517 if (C->getZExtValue() <= 255) {
61518 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61519 Op.getValueType());
61520 break;
61521 }
61522 }
61523 return;
61524 case 'O':
61525 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61526 if (C->getZExtValue() <= 127) {
61527 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61528 Op.getValueType());
61529 break;
61530 }
61531 }
61532 return;
61533 case 'e': {
61534 // 32-bit signed value
61535 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61537 C->getSExtValue())) {
61538 // Widen to 64 bits here to get it sign extended.
61539 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
61540 break;
61541 }
61542 // FIXME gcc accepts some relocatable values here too, but only in certain
61543 // memory models; it's complicated.
61544 }
61545 return;
61546 }
61547 case 'W': {
61548 assert(Constraint[1] == 's');
61549 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
61550 // offset.
61551 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
61552 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
61553 BA->getValueType(0)));
61554 } else {
61555 int64_t Offset = 0;
61556 if (Op->getOpcode() == ISD::ADD &&
61557 isa<ConstantSDNode>(Op->getOperand(1))) {
61558 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
61559 Op = Op->getOperand(0);
61560 }
61561 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61562 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
61563 GA->getValueType(0), Offset));
61564 }
61565 return;
61566 }
61567 case 'Z': {
61568 // 32-bit unsigned value
61569 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61571 C->getZExtValue())) {
61572 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61573 Op.getValueType());
61574 break;
61575 }
61576 }
61577 // FIXME gcc accepts some relocatable values here too, but only in certain
61578 // memory models; it's complicated.
61579 return;
61580 }
61581 case 'i': {
61582 // Literal immediates are always ok.
61583 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
61584 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
61585 BooleanContent BCont = getBooleanContents(MVT::i64);
61586 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
61588 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
61589 : CST->getSExtValue();
61590 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
61591 break;
61592 }
61593
61594 // In any sort of PIC mode addresses need to be computed at runtime by
61595 // adding in a register or some sort of table lookup. These can't
61596 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
61597 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
61599 return;
61600
61601 // If we are in non-pic codegen mode, we allow the address of a global (with
61602 // an optional displacement) to be used with 'i'.
61603 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61604 // If we require an extra load to get this address, as in PIC mode, we
61605 // can't accept it.
61607 Subtarget.classifyGlobalReference(GA->getGlobal())))
61608 return;
61609 break;
61610 }
61611 }
61612
61613 if (Result.getNode()) {
61614 Ops.push_back(Result);
61615 return;
61616 }
61617 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
61618}
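// Illustrative sketch (not part of the upstream source): the range checks
// above decide whether a constant may be used directly, e.g.
//
//   static long add_imm(long x) {
//     asm("addq %1, %0" : "+r"(x) : "e"(123456789)); // 'e': signed imm32
//     return x;
//   }
//
// works because 123456789 fits in a sign-extended 32-bit immediate, while
// "e"(0x123456789L) is rejected (it is never added to Ops above), which
// surfaces as an inline-asm constraint error rather than a silent truncation.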
61619
61620/// Check if \p RC is a general purpose register class.
61621/// I.e., GR* or one of their variants.
61622static bool isGRClass(const TargetRegisterClass &RC) {
61623 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
61624 RC.hasSuperClassEq(&X86::GR16RegClass) ||
61625 RC.hasSuperClassEq(&X86::GR32RegClass) ||
61626 RC.hasSuperClassEq(&X86::GR64RegClass) ||
61627 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
61628}
61629
61630/// Check if \p RC is a vector register class.
61631/// I.e., FR* / VR* or one of their variants.
61632static bool isFRClass(const TargetRegisterClass &RC) {
61633 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
61634 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
61635 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
61636 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
61637 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
61638 RC.hasSuperClassEq(&X86::VR512RegClass);
61639}
61640
61641/// Check if \p RC is a mask register class.
61642/// I.e., VK* or one of their variants.
61643static bool isVKClass(const TargetRegisterClass &RC) {
61644 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
61645 RC.hasSuperClassEq(&X86::VK2RegClass) ||
61646 RC.hasSuperClassEq(&X86::VK4RegClass) ||
61647 RC.hasSuperClassEq(&X86::VK8RegClass) ||
61648 RC.hasSuperClassEq(&X86::VK16RegClass) ||
61649 RC.hasSuperClassEq(&X86::VK32RegClass) ||
61650 RC.hasSuperClassEq(&X86::VK64RegClass);
61651}
61652
61653static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) {
61654 return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32();
61655}
61656
61657std::pair<unsigned, const TargetRegisterClass *>
61659 StringRef Constraint,
61660 MVT VT) const {
61661 // First, see if this is a constraint that directly corresponds to an LLVM
61662 // register class.
61663 if (Constraint.size() == 1) {
61664 // GCC Constraint Letters
61665 switch (Constraint[0]) {
61666 default: break;
61667 // 'A' means [ER]AX + [ER]DX.
61668 case 'A':
61669 if (Subtarget.is64Bit())
61670 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
61671 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
61672 "Expecting 64, 32 or 16 bit subtarget");
61673 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
61674
61675 // TODO: Slight differences here in allocation order and leaving
61676 // RIP in the class. Do they matter any more here than they do
61677 // in the normal allocation?
61678 case 'k':
61679 if (Subtarget.hasAVX512()) {
61680 if (VT == MVT::v1i1 || VT == MVT::i1)
61681 return std::make_pair(0U, &X86::VK1RegClass);
61682 if (VT == MVT::v8i1 || VT == MVT::i8)
61683 return std::make_pair(0U, &X86::VK8RegClass);
61684 if (VT == MVT::v16i1 || VT == MVT::i16)
61685 return std::make_pair(0U, &X86::VK16RegClass);
61686 }
61687 if (Subtarget.hasBWI()) {
61688 if (VT == MVT::v32i1 || VT == MVT::i32)
61689 return std::make_pair(0U, &X86::VK32RegClass);
61690 if (VT == MVT::v64i1 || VT == MVT::i64)
61691 return std::make_pair(0U, &X86::VK64RegClass);
61692 }
61693 break;
61694 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
61695 if (Subtarget.is64Bit()) {
61696 if (VT == MVT::i8 || VT == MVT::i1)
61697 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61698 ? &X86::GR8RegClass
61699 : &X86::GR8_NOREX2RegClass);
61700 if (VT == MVT::i16)
61701 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61702 ? &X86::GR16RegClass
61703 : &X86::GR16_NOREX2RegClass);
61704 if (VT == MVT::i32 || VT == MVT::f32)
61705 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61706 ? &X86::GR32RegClass
61707 : &X86::GR32_NOREX2RegClass);
61708 if (VT != MVT::f80 && !VT.isVector())
61709 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61710 ? &X86::GR64RegClass
61711 : &X86::GR64_NOREX2RegClass);
61712 break;
61713 }
61714 [[fallthrough]];
61715 // 32-bit fallthrough
61716 case 'Q': // Q_REGS
61717 if (VT == MVT::i8 || VT == MVT::i1)
61718 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
61719 if (VT == MVT::i16)
61720 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
61721 if (VT == MVT::i32 || VT == MVT::f32 ||
61722 (!VT.isVector() && !Subtarget.is64Bit()))
61723 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
61724 if (VT != MVT::f80 && !VT.isVector())
61725 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
61726 break;
61727 case 'r': // GENERAL_REGS
61728 case 'l': // INDEX_REGS
61729 if (VT == MVT::i8 || VT == MVT::i1)
61730 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61731 ? &X86::GR8RegClass
61732 : &X86::GR8_NOREX2RegClass);
61733 if (VT == MVT::i16)
61734 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61735 ? &X86::GR16RegClass
61736 : &X86::GR16_NOREX2RegClass);
61737 if (VT == MVT::i32 || VT == MVT::f32 ||
61738 (!VT.isVector() && !Subtarget.is64Bit()))
61739 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61740 ? &X86::GR32RegClass
61741 : &X86::GR32_NOREX2RegClass);
61742 if (VT != MVT::f80 && !VT.isVector())
61743 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61744 ? &X86::GR64RegClass
61745 : &X86::GR64_NOREX2RegClass);
61746 break;
61747 case 'R': // LEGACY_REGS
61748 if (VT == MVT::i8 || VT == MVT::i1)
61749 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
61750 if (VT == MVT::i16)
61751 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
61752 if (VT == MVT::i32 || VT == MVT::f32 ||
61753 (!VT.isVector() && !Subtarget.is64Bit()))
61754 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
61755 if (VT != MVT::f80 && !VT.isVector())
61756 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
61757 break;
61758 case 'f': // FP Stack registers.
61759 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
61760 // value to the correct fpstack register class.
61761 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
61762 return std::make_pair(0U, &X86::RFP32RegClass);
61763 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
61764 return std::make_pair(0U, &X86::RFP64RegClass);
61765 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
61766 return std::make_pair(0U, &X86::RFP80RegClass);
61767 break;
61768 case 'y': // MMX_REGS if MMX allowed.
61769 if (!Subtarget.hasMMX()) break;
61770 return std::make_pair(0U, &X86::VR64RegClass);
61771 case 'v':
61772 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
61773 if (!Subtarget.hasSSE1()) break;
61774 bool VConstraint = (Constraint[0] == 'v');
61775
61776 switch (VT.SimpleTy) {
61777 default: break;
61778 // Scalar SSE types.
61779 case MVT::f16:
61780 if (VConstraint && Subtarget.hasFP16())
61781 return std::make_pair(0U, &X86::FR16XRegClass);
61782 break;
61783 case MVT::f32:
61784 case MVT::i32:
61785 if (VConstraint && Subtarget.hasVLX())
61786 return std::make_pair(0U, &X86::FR32XRegClass);
61787 return std::make_pair(0U, &X86::FR32RegClass);
61788 case MVT::f64:
61789 case MVT::i64:
61790 if (VConstraint && Subtarget.hasVLX())
61791 return std::make_pair(0U, &X86::FR64XRegClass);
61792 return std::make_pair(0U, &X86::FR64RegClass);
61793 case MVT::i128:
61794 if (Subtarget.is64Bit()) {
61795 if (VConstraint && Subtarget.hasVLX())
61796 return std::make_pair(0U, &X86::VR128XRegClass);
61797 return std::make_pair(0U, &X86::VR128RegClass);
61798 }
61799 break;
61800 // Vector types and fp128.
61801 case MVT::v8f16:
61802 if (!Subtarget.hasFP16())
61803 break;
61804 if (VConstraint)
61805 return std::make_pair(0U, &X86::VR128XRegClass);
61806 return std::make_pair(0U, &X86::VR128RegClass);
61807 case MVT::v8bf16:
61808 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61809 break;
61810 if (VConstraint)
61811 return std::make_pair(0U, &X86::VR128XRegClass);
61812 return std::make_pair(0U, &X86::VR128RegClass);
61813 case MVT::f128:
61814 if (!Subtarget.is64Bit())
61815 break;
61816 [[fallthrough]];
61817 case MVT::v16i8:
61818 case MVT::v8i16:
61819 case MVT::v4i32:
61820 case MVT::v2i64:
61821 case MVT::v4f32:
61822 case MVT::v2f64:
61823 if (VConstraint && Subtarget.hasVLX())
61824 return std::make_pair(0U, &X86::VR128XRegClass);
61825 return std::make_pair(0U, &X86::VR128RegClass);
61826 // AVX types.
61827 case MVT::v16f16:
61828 if (!Subtarget.hasFP16())
61829 break;
61830 if (VConstraint)
61831 return std::make_pair(0U, &X86::VR256XRegClass);
61832 return std::make_pair(0U, &X86::VR256RegClass);
61833 case MVT::v16bf16:
61834 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61835 break;
61836 if (VConstraint)
61837 return std::make_pair(0U, &X86::VR256XRegClass);
61838 return std::make_pair(0U, &X86::VR256RegClass);
61839 case MVT::v32i8:
61840 case MVT::v16i16:
61841 case MVT::v8i32:
61842 case MVT::v4i64:
61843 case MVT::v8f32:
61844 case MVT::v4f64:
61845 if (VConstraint && Subtarget.hasVLX())
61846 return std::make_pair(0U, &X86::VR256XRegClass);
61847 if (Subtarget.hasAVX())
61848 return std::make_pair(0U, &X86::VR256RegClass);
61849 break;
61850 case MVT::v32f16:
61851 if (!Subtarget.hasFP16())
61852 break;
61853 if (VConstraint)
61854 return std::make_pair(0U, &X86::VR512RegClass);
61855 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61856 case MVT::v32bf16:
61857 if (!Subtarget.hasBF16())
61858 break;
61859 if (VConstraint)
61860 return std::make_pair(0U, &X86::VR512RegClass);
61861 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61862 case MVT::v64i8:
61863 case MVT::v32i16:
61864 case MVT::v8f64:
61865 case MVT::v16f32:
61866 case MVT::v16i32:
61867 case MVT::v8i64:
61868 if (!Subtarget.hasAVX512()) break;
61869 if (VConstraint)
61870 return std::make_pair(0U, &X86::VR512RegClass);
61871 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61872 }
61873 break;
61874 }
61875 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
61876 switch (Constraint[1]) {
61877 default:
61878 break;
61879 case 'i':
61880 case 't':
61881 case '2':
61882 return getRegForInlineAsmConstraint(TRI, "x", VT);
61883 case 'm':
61884 if (!Subtarget.hasMMX()) break;
61885 return std::make_pair(0U, &X86::VR64RegClass);
61886 case 'z':
61887 if (!Subtarget.hasSSE1()) break;
61888 switch (VT.SimpleTy) {
61889 default: break;
61890 // Scalar SSE types.
61891 case MVT::f16:
61892 if (!Subtarget.hasFP16())
61893 break;
61894 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
61895 case MVT::f32:
61896 case MVT::i32:
61897 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
61898 case MVT::f64:
61899 case MVT::i64:
61900 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
61901 case MVT::v8f16:
61902 if (!Subtarget.hasFP16())
61903 break;
61904 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61905 case MVT::v8bf16:
61906 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61907 break;
61908 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61909 case MVT::f128:
61910 case MVT::v16i8:
61911 case MVT::v8i16:
61912 case MVT::v4i32:
61913 case MVT::v2i64:
61914 case MVT::v4f32:
61915 case MVT::v2f64:
61916 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61917 // AVX types.
61918 case MVT::v16f16:
61919 if (!Subtarget.hasFP16())
61920 break;
61921 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61922 case MVT::v16bf16:
61923 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61924 break;
61925 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61926 case MVT::v32i8:
61927 case MVT::v16i16:
61928 case MVT::v8i32:
61929 case MVT::v4i64:
61930 case MVT::v8f32:
61931 case MVT::v4f64:
61932 if (Subtarget.hasAVX())
61933 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61934 break;
61935 case MVT::v32f16:
61936 if (!Subtarget.hasFP16())
61937 break;
61938 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61939 case MVT::v32bf16:
61940 if (!Subtarget.hasBF16())
61941 break;
61942 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61943 case MVT::v64i8:
61944 case MVT::v32i16:
61945 case MVT::v8f64:
61946 case MVT::v16f32:
61947 case MVT::v16i32:
61948 case MVT::v8i64:
61949 if (Subtarget.hasAVX512())
61950 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61951 break;
61952 }
61953 break;
61954 case 'k':
61955 // This register class doesn't allocate k0 for masked vector operations.
61956 if (Subtarget.hasAVX512()) {
61957 if (VT == MVT::v1i1 || VT == MVT::i1)
61958 return std::make_pair(0U, &X86::VK1WMRegClass);
61959 if (VT == MVT::v8i1 || VT == MVT::i8)
61960 return std::make_pair(0U, &X86::VK8WMRegClass);
61961 if (VT == MVT::v16i1 || VT == MVT::i16)
61962 return std::make_pair(0U, &X86::VK16WMRegClass);
61963 }
61964 if (Subtarget.hasBWI()) {
61965 if (VT == MVT::v32i1 || VT == MVT::i32)
61966 return std::make_pair(0U, &X86::VK32WMRegClass);
61967 if (VT == MVT::v64i1 || VT == MVT::i64)
61968 return std::make_pair(0U, &X86::VK64WMRegClass);
61969 }
61970 break;
61971 }
61972 } else if (Constraint.size() == 2 && Constraint[0] == 'j') {
61973 switch (Constraint[1]) {
61974 default:
61975 break;
61976 case 'r':
61977 if (VT == MVT::i8 || VT == MVT::i1)
61978 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
61979 if (VT == MVT::i16)
61980 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
61981 if (VT == MVT::i32 || VT == MVT::f32)
61982 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
61983 if (VT != MVT::f80 && !VT.isVector())
61984 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
61985 break;
61986 case 'R':
61987 if (VT == MVT::i8 || VT == MVT::i1)
61988 return std::make_pair(0U, &X86::GR8RegClass);
61989 if (VT == MVT::i16)
61990 return std::make_pair(0U, &X86::GR16RegClass);
61991 if (VT == MVT::i32 || VT == MVT::f32)
61992 return std::make_pair(0U, &X86::GR32RegClass);
61993 if (VT != MVT::f80 && !VT.isVector())
61994 return std::make_pair(0U, &X86::GR64RegClass);
61995 break;
61996 }
61997 }
61998
61999 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
62000 return std::make_pair(0U, &X86::GR32RegClass);
62001
62002 // Use the default implementation in TargetLowering to convert the register
62003 // constraint into a member of a register class.
62004 std::pair<Register, const TargetRegisterClass*> Res;
62006
62007 // Not found as a standard register?
62008 if (!Res.second) {
62009 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
62010 // to/from f80.
62011 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
62012 // Map st(0) -> st(7) -> ST0
62013 if (Constraint.size() == 7 && Constraint[0] == '{' &&
62014 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
62015 Constraint[3] == '(' &&
62016 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
62017 Constraint[5] == ')' && Constraint[6] == '}') {
62018 // st(7) is not allocatable and thus not a member of RFP80. Return a
62019 // singleton class in cases where we have a reference to it.
62020 if (Constraint[4] == '7')
62021 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
62022 return std::make_pair(X86::FP0 + Constraint[4] - '0',
62023 &X86::RFP80RegClass);
62024 }
62025
62026 // GCC allows "st(0)" to be called just plain "st".
62027 if (StringRef("{st}").equals_insensitive(Constraint))
62028 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
62029 }
62030
62031 // flags -> EFLAGS
62032 if (StringRef("{flags}").equals_insensitive(Constraint))
62033 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
62034
62035 // dirflag -> DF
62036 // Only allow for clobber.
62037 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
62038 VT == MVT::Other)
62039 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
62040
62041 // fpsr -> FPSW
62042 // Only allow for clobber.
62043 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
62044 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
62045
62046 return Res;
62047 }
62048
62049 // Make sure it isn't a register that requires 64-bit mode.
62050 if (!Subtarget.is64Bit() &&
62051 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
62052 TRI->getEncodingValue(Res.first) >= 8) {
62053 // Register requires REX prefix, but we're in 32-bit mode.
62054 return std::make_pair(0, nullptr);
62055 }
62056
62057 // Make sure it isn't a register that requires AVX512.
62058 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
62059 TRI->getEncodingValue(Res.first) & 0x10) {
62060 // Register requires EVEX prefix.
62061 return std::make_pair(0, nullptr);
62062 }
62063
62064 // Otherwise, check to see if this is a register class of the wrong value
62065 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
62066 // turn into {ax},{dx}.
62067 // MVT::Other is used to specify clobber names.
62068 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
62069 return Res; // Correct type already, nothing to do.
62070
62071 // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
62072 // return "eax". This should even work for things like getting 64-bit integer
62073 // registers when given an f64 type.
62074 const TargetRegisterClass *Class = Res.second;
62075 // The generic code will match the first register class that contains the
62076 // given register. Thus, based on the ordering of the tablegened file,
62077 // the "plain" GR classes might not come first.
62078 // Therefore, use a helper method.
62079 if (isGRClass(*Class)) {
62080 unsigned Size = VT.getSizeInBits();
62081 if (Size == 1) Size = 8;
62082 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
62083 return std::make_pair(0, nullptr);
62084 Register DestReg = getX86SubSuperRegister(Res.first, Size);
62085 if (DestReg.isValid()) {
62086 bool is64Bit = Subtarget.is64Bit();
62087 const TargetRegisterClass *RC =
62088 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
62089 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
62090 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
62091 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
62092 if (Size == 64 && !is64Bit) {
62093 // Model GCC's behavior here and select a fixed pair of 32-bit
62094 // registers.
62095 switch (DestReg) {
62096 case X86::RAX:
62097 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
62098 case X86::RDX:
62099 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
62100 case X86::RCX:
62101 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
62102 case X86::RBX:
62103 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
62104 case X86::RSI:
62105 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
62106 case X86::RDI:
62107 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
62108 case X86::RBP:
62109 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
62110 default:
62111 return std::make_pair(0, nullptr);
62112 }
62113 }
62114 if (RC && RC->contains(DestReg))
62115 return std::make_pair(DestReg, RC);
62116 return Res;
62117 }
62118 // No register found/type mismatch.
62119 return std::make_pair(0, nullptr);
62120 } else if (isFRClass(*Class)) {
62121 // Handle references to XMM physical registers that got mapped into the
62122 // wrong class. This can happen with constraints like {xmm0} where the
62123 // target independent register mapper will just pick the first match it can
62124 // find, ignoring the required type.
62125
62126 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
62127 if (VT == MVT::f16)
62128 Res.second = &X86::FR16XRegClass;
62129 else if (VT == MVT::f32 || VT == MVT::i32)
62130 Res.second = &X86::FR32XRegClass;
62131 else if (VT == MVT::f64 || VT == MVT::i64)
62132 Res.second = &X86::FR64XRegClass;
62133 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
62134 Res.second = &X86::VR128XRegClass;
62135 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
62136 Res.second = &X86::VR256XRegClass;
62137 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
62138 Res.second = &X86::VR512RegClass;
62139 else {
62140 // Type mismatch and not a clobber: Return an error;
62141 Res.first = 0;
62142 Res.second = nullptr;
62143 }
62144 } else if (isVKClass(*Class)) {
62145 if (VT == MVT::v1i1 || VT == MVT::i1)
62146 Res.second = &X86::VK1RegClass;
62147 else if (VT == MVT::v8i1 || VT == MVT::i8)
62148 Res.second = &X86::VK8RegClass;
62149 else if (VT == MVT::v16i1 || VT == MVT::i16)
62150 Res.second = &X86::VK16RegClass;
62151 else if (VT == MVT::v32i1 || VT == MVT::i32)
62152 Res.second = &X86::VK32RegClass;
62153 else if (VT == MVT::v64i1 || VT == MVT::i64)
62154 Res.second = &X86::VK64RegClass;
62155 else {
62156 // Type mismatch and not a clobber: Return an error;
62157 Res.first = 0;
62158 Res.second = nullptr;
62159 }
62160 }
62161
62162 return Res;
62163}
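// Illustrative sketch (not part of the upstream source): the selection above
// resolves constraints such as "Yz" (the first SSE register, %xmm0):
//
//   static double via_xmm0(double v) {
//     asm("" : "+Yz"(v)); // pins v to %xmm0 (FR64 + XMM0 per the code above)
//     return v;
//   }
//
// Explicit register names such as "{st}" and "{flags}" (and, for clobbers
// only, "{dirflag}" and "{fpsr}") resolve to FP0, EFLAGS, DF and FPSW through
// the string matches above.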
62164
62165bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
62166 // Integer division on x86 is expensive. However, when aggressively optimizing
62167 // for code size, we prefer to use a div instruction, as it is usually smaller
62168 // than the alternative sequence.
62169 // The exception to this is vector division. Since x86 doesn't have vector
62170 // integer division, leaving the division as-is is a loss even in terms of
62171 // size, because it will have to be scalarized, while the alternative code
62172 // sequence can be performed in vector form.
62173 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
62174 return OptSize && !VT.isVector();
62175}
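// Illustrative note (not part of the upstream source): for minsize code
// (e.g. built with -Oz), a scalar division such as
//
//   unsigned div10(unsigned x) { return x / 10; }
//
// keeps the DIV instruction, while without minsize a division by a constant
// is expanded into a multiply-by-reciprocal and shift sequence that is larger
// but avoids the slow divide; vector divisions are excluded here, as the
// comment above explains.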
62176
62177void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
62178 if (!Subtarget.is64Bit())
62179 return;
62180
62181 // Update IsSplitCSR in X86MachineFunctionInfo.
62183 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
62184 AFI->setIsSplitCSR(true);
62185}
62186
62187void X86TargetLowering::insertCopiesSplitCSR(
62188 MachineBasicBlock *Entry,
62189 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
62190 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
62191 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
62192 if (!IStart)
62193 return;
62194
62195 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
62196 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
62197 MachineBasicBlock::iterator MBBI = Entry->begin();
62198 for (const MCPhysReg *I = IStart; *I; ++I) {
62199 const TargetRegisterClass *RC = nullptr;
62200 if (X86::GR64RegClass.contains(*I))
62201 RC = &X86::GR64RegClass;
62202 else
62203 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
62204
62205 Register NewVR = MRI->createVirtualRegister(RC);
62206 // Create copy from CSR to a virtual register.
62207 // FIXME: this currently does not emit CFI pseudo-instructions; it works
62208 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
62209 // nounwind. If we want to generalize this later, we may need to emit
62210 // CFI pseudo-instructions.
62211 assert(
62212 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
62213 "Function should be nounwind in insertCopiesSplitCSR!");
62214 Entry->addLiveIn(*I);
62215 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
62216 .addReg(*I);
62217
62218 // Insert the copy-back instructions right before the terminator.
62219 for (auto *Exit : Exits)
62220 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
62221 TII->get(TargetOpcode::COPY), *I)
62222 .addReg(NewVR);
62223 }
62224}
62225
62227 return Subtarget.is64Bit();
62228}
62229
62233 const TargetInstrInfo *TII) const {
62234 assert(MBBI->isCall() && MBBI->getCFIType() &&
62235 "Invalid call instruction for a KCFI check");
62236
62237 MachineFunction &MF = *MBB.getParent();
62238 // If the call target is a memory operand, unfold it and use R11 for the
62239 // call, so KCFI_CHECK won't have to recompute the address.
62240 switch (MBBI->getOpcode()) {
62241 case X86::CALL64m:
62242 case X86::CALL64m_NT:
62243 case X86::TAILJMPm64:
62244 case X86::TAILJMPm64_REX: {
62247 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
62248 /*UnfoldStore=*/false, NewMIs))
62249 report_fatal_error("Failed to unfold memory operand for a KCFI check");
62250 for (auto *NewMI : NewMIs)
62251 MBBI = MBB.insert(OrigCall, NewMI);
62252 assert(MBBI->isCall() &&
62253 "Unexpected instruction after memory operand unfolding");
62254 if (OrigCall->shouldUpdateAdditionalCallInfo())
62255 MF.moveAdditionalCallInfo(&*OrigCall, &*MBBI);
62256 MBBI->setCFIType(MF, OrigCall->getCFIType());
62257 OrigCall->eraseFromParent();
62258 break;
62259 }
62260 default:
62261 break;
62262 }
62263
62264 MachineOperand &Target = MBBI->getOperand(0);
62265 Register TargetReg;
62266 switch (MBBI->getOpcode()) {
62267 case X86::CALL64r:
62268 case X86::CALL64r_ImpCall:
62269 case X86::CALL64r_NT:
62270 case X86::TAILJMPr64:
62271 case X86::TAILJMPr64_REX:
62272 assert(Target.isReg() && "Unexpected target operand for an indirect call");
62273 Target.setIsRenamable(false);
62274 TargetReg = Target.getReg();
62275 break;
62276 case X86::CALL64pcrel32:
62277 case X86::TAILJMPd64:
62278 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
62279 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
62280 // 64-bit indirect thunk calls.
62281 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
62282 "Unexpected register for an indirect thunk call");
62283 TargetReg = X86::R11;
62284 break;
62285 default:
62286 llvm_unreachable("Unexpected CFI call opcode");
62287 break;
62288 }
62289
62290 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
62291 .addReg(TargetReg)
62292 .addImm(MBBI->getCFIType())
62293 .getInstr();
62294}
62295
62296/// Returns true if stack probing through a function call is requested.
62300
62301/// Returns true if stack probing through inline assembly is requested.
62303
62304 // No inline stack probe for Windows; it has its own mechanism.
62305 if (Subtarget.isOSWindows() || Subtarget.isUEFI() ||
62306 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62307 return false;
62308
62309 // If the function specifically requests inline stack probes, emit them.
62310 if (MF.getFunction().hasFnAttribute("probe-stack"))
62311 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
62312 "inline-asm";
62313
62314 return false;
62315}
62316
62317/// Returns the name of the symbol used to emit stack probes or the empty
62318/// string if not applicable.
62321 // Inline stack probes disable the stack probe call.
62322 if (hasInlineStackProbe(MF))
62323 return "";
62324
62325 // If the function specifically requests stack probes, emit them.
62326 if (MF.getFunction().hasFnAttribute("probe-stack"))
62327 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
62328
62329 // Generally, if we aren't on Windows, the platform ABI does not include
62330 // support for stack probes, so don't emit them.
62331 if ((!Subtarget.isOSWindows() && !Subtarget.isUEFI()) ||
62332 Subtarget.isTargetMachO() ||
62333 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62334 return "";
62335
62336 // We need a stack probe to conform to the Windows ABI. Choose the right
62337 // symbol.
62338 if (Subtarget.is64Bit())
62339 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
62340 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
62341}
62342
62343unsigned
62345 // The default stack probe size is 4096 if the function has no stackprobesize
62346 // attribute.
62347 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
62348 4096);
62349}
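// Illustrative note (not part of the upstream source): the three hooks above
// are driven by IR function attributes, e.g. "probe-stack"="inline-asm"
// selects inline probes (so no __chkstk-style call is emitted), any other
// "probe-stack" value names the probe symbol to call, and
// "stack-probe-size"="8192" raises the probing interval from the default
// 4096 bytes; without these attributes, probes are only emitted for
// Windows/UEFI (non-MachO) targets.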
62350
62352 if (ML && ML->isInnermost() &&
62353 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
62356}
unsigned const MachineRegisterInfo * MRI
#define Success
static SDValue Widen(SelectionDAG *CurDAG, SDValue N)
return SDValue()
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
#define NODE_NAME_CASE(node)
constexpr LLT F64
constexpr LLT S1
AMDGPU Register Bank Select
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
#define EXPAND(Op)
Function Alias Analysis Results
BitTracker BT
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_ATTRIBUTE_UNUSED
Definition Compiler.h:298
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isSigned(unsigned int Opcode)
Hexagon Common GEP
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static int matchShuffleAsBitRotate(ArrayRef< int > Mask, int NumSubElts)
Try to lower a vector shuffle as a bit rotation.
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
static Value * LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctlz of V before the specified instruction IP.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
Live Register Matrix
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget)
Dispatching routine to lower various 128-bit LoongArch vector shuffles.
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, unsigned Depth)
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL)
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget)
Dispatching routine to lower various 256-bit LoongArch vector shuffles.
static void computeZeroableShuffleElements(ArrayRef< int > Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero)
Compute whether each element of a shuffle is zeroable.
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Attempts to match vector shuffle as byte rotation.
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
Attempts to match a shuffle mask against the VBSLL, VBSRL, VSLLI and VSRLI instruction.
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each sub-lane.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc)
Return true if node is an ISD::AND or ISD::OR of two M68k::SETcc nodes each of which has no other use...
static bool hasNonFlagsUse(SDValue Op)
return true if Op has a use that doesn't just read flags.
static bool isCMOVPseudo(MachineInstr &MI)
static SDValue combineCarryThroughADD(SDValue CCR)
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG)
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
#define G(x, y, z)
Definition MD5.cpp:56
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
#define T1
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
static constexpr MCPhysReg SPReg
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc)
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
Try to map an integer comparison with size > XLEN to vector instructions before type legalization spl...
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
Contains matchers for matching SelectionDAG nodes and values.
static bool isSimple(Instruction *I)
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
unsigned OpIndex
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file defines the SmallVector class.
static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
static bool Enabled
Definition Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
static const char LUT[]
static llvm::Type * getVectorElementType(llvm::Type *Ty)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
static KnownBits computeKnownBitsForHorizontalOperation(const Operator *I, const APInt &DemandedElts, const SimplifyQuery &Q, unsigned Depth, const function_ref< KnownBits(const KnownBits &, const KnownBits &)> KnownBitsFunc)
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &DL, unsigned VectorWidth)
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static unsigned getSUBriOpcode(bool IsLP64)
static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If both input operands of a logic op are being cast from floating-point types or FP compares,...
static bool isNoopOrBroadcastShuffleMask(ArrayRef< int > Mask)
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask)
static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget)
Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer t...
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, SDValue X, SDValue Y, SelectionDAG &DAG, bool ZeroSecondOpOnly=false)
If this is an add or subtract where one operand is produced by a cmp+setcc, then try to convert it to...
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, SmallVectorImpl< SDValue > &SrcOps, SmallVectorImpl< APInt > *SrcMask=nullptr)
Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...)) style scalarized (associative) ...
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1, bool &IsAlwaysSignaling)
Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
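detectPMADDUBSW matches the (V)PMADDUBSW idiom. As a reminder of that instruction's per-lane behaviour (a scalar sketch, not the DAG matcher itself): each 16-bit result is the signed-saturating sum of two adjacent byte products, with unsigned bytes from the first source multiplied by signed bytes from the second.

#include <algorithm>
#include <cstdint>

// Scalar reference for one 16-bit lane of PMADDUBSW:
// 'a' bytes are treated as unsigned, 'b' bytes as signed.
static int16_t pmaddubsw_lane(uint8_t a0, uint8_t a1, int8_t b0, int8_t b1) {
  int32_t sum = int32_t(a0) * b0 + int32_t(a1) * b1;              // exact in 32 bits
  return static_cast<int16_t>(std::clamp(sum, -32768, 32767));    // signed saturation
}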
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC)
static bool useEGPRInlineAsm(const X86Subtarget &Subtarget)
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If a value is a scalar FP zero or a vector FP zero (potentially including undefined elements),...
static bool matchBinaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static SDValue combineSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isGRClass(const TargetRegisterClass &RC)
Check if RC is a general purpose register class.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl< SDValue > &Ops, SmallVectorImpl< int > &Mask, bool &IsUnary)
Calculates the shuffle mask corresponding to the target-specific opcode.
static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast operation that is extracted from a vector, try to vectorize the cast op followed ...
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG)
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1)
This is a helper function of LowerToHorizontalOp().
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG)
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, SelectionDAG &DAG, bool UseConcat=false)
Given the output values from getHalfShuffleMask(), create a half width shuffle of extracted vectors f...
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFPToSInt(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, int ShAmtIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle vector element shifts by a splat shift amount.
@ ConstantBit
@ NotConstantBit
@ NotShiftBit
static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc, bool NSW)
Given a buildvector constant, return a new vector constant with each element incremented or decrement...
static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode)
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane 32-bit floating point shuffles.
static MachineBasicBlock * emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII)
Utility function to emit xbegin specifying the start of an RTM region.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef< SDValue > Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
Given the initializing elements 'Elts' of a vector of type 'VT', see if the elements can be replaced ...
static bool scaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts, SmallVectorImpl< int > &ScaledMask)
static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LoadGlobalBaseReg=false, bool LocalDynamic=false)
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > BrMergingCcmpBias("x86-br-merging-ccmp-bias", cl::init(6), cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target " "supports conditional compare instructions."), cl::Hidden)
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low bits mask.
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef< SDValue > Ops, ArrayRef< int > Mask, ArrayRef< const SDNode * > SrcNodes, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting an PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.
static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert an i1 subvector into an i1 vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
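The scalar idea behind that expansion (a sketch assuming IEEE-754 doubles, not the DAG code): split the u64 into 32-bit halves, rebuild each half exactly as a double by OR-ing it into the mantissa of a suitably biased constant, subtract the bias, and add the two halves with a single final rounding.

#include <cstdint>
#include <cstring>

// Scalar sketch of the classic u64 -> f64 bias trick.
static double u64_to_f64(uint64_t x) {
  uint64_t lo_bits = 0x4330000000000000ULL | (x & 0xFFFFFFFFULL); // 2^52 + lo32
  uint64_t hi_bits = 0x4530000000000000ULL | (x >> 32);           // 2^84 + hi32*2^32
  double lo, hi;
  std::memcpy(&lo, &lo_bits, sizeof lo);
  std::memcpy(&hi, &hi_bits, sizeof hi);
  lo -= 0x1.0p52;   // exact: leaves (double)lo32
  hi -= 0x1.0p84;   // exact: leaves (double)hi32 * 2^32
  return hi + lo;   // one rounding of the full 64-bit value
}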
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as 128-bit shuffles.
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDNodeFlags Flags=SDNodeFlags())
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or ...
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128 bits into a vector wider than 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256 bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG)
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static APInt getBLENDIBlendMask(SDValue V)
Get the expanded blend mask from a BLENDI node.
static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, unsigned X86CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< unsigned > CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS, unsigned NumSignificantBitsRHS)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISDPAC...
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static SDValue getSHUFPDImmForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips the sign of an FP value.
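isFNEG recognises, among other forms, the sign-bit XOR that x86 uses for FP negation. A scalar illustration of that equivalence (a sketch, assuming IEEE-754 single precision):

#include <cstdint>
#include <cstring>

// Negating a float by XOR-ing its sign bit.
static float fneg_via_xor(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof bits);
  bits ^= 0x80000000u;            // flip only the sign bit
  float r;
  std::memcpy(&r, &bits, sizeof r);
  return r;
}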
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > OriginalMask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single - truncated - integer element, coming from a scalar_to_vector/buil...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd, bool &HasAllowContract)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, unsigned Amt=0)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) followed by unpack 256-bit.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, X86Subtarget const &Subtarget, SelectionDAG &DAG)
Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats representing a blend.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts, bool AllowUndefs)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
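The identity behind that fold (a standalone sanity check, not LLVM code): BLSMSK(X) is X ^ (X - 1), a mask of all bits up to and including the lowest set bit of X, so ANDN(Y, BLSMSK(X)) keeps exactly the bits of Y that X ^ -X keeps.

#include <cassert>
#include <cstdint>

int main() {
  // AND(Y, XOR(X, NEG(X))) == ANDN(Y, BLSMSK(X)) for unsigned X, Y.
  for (uint32_t x : {0u, 1u, 4u, 0x80000000u, 0xDEADBEEFu})
    for (uint32_t y : {0u, 0xFFFFFFFFu, 0x12345678u}) {
      uint32_t lhs    = y & (x ^ (0u - x));   // AND(Y, XOR(X, NEG(X)))
      uint32_t blsmsk = x ^ (x - 1u);         // BLSMSK(X)
      uint32_t rhs    = ~blsmsk & y;          // ANDN(Y, BLSMSK(X))
      assert(lhs == rhs);
    }
  return 0;
}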
static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true, bool AllowAVX512=true)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with chain that return their value into registers EDX:EAX.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, Register Reg)
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG)
static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef< int > Mask, int BroadcastableElement=0)
Test whether the specified input (0 or 1) is a broadcast/splat blended by the given mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
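On x86 this kind of byte sum is typically built from PSADBW against an all-zero vector (cf. createPSADBW above), since a sum of absolute differences with zero is just the sum of the bytes. A scalar sketch of one 64-bit PSADBW lane, for illustration only:

#include <cstdint>
#include <cstdlib>

// One 64-bit lane of PSADBW: sum of |a[i] - b[i]| over 8 byte pairs.
// With b == 0 this is simply the horizontal sum of the 8 bytes of a.
static uint64_t psadbw_lane(const uint8_t a[8], const uint8_t b[8]) {
  uint64_t sum = 0;
  for (int i = 0; i < 8; ++i)
    sum += static_cast<uint64_t>(std::abs(int(a[i]) - int(b[i])));
  return sum; // fits in 16 bits; the instruction zero-extends it into the qword
}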
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG)
static void growShuffleMask(ArrayRef< int > SrcMask, SmallVectorImpl< int > &DstMask, unsigned SrcSizeInBits, unsigned DstSizeInBits)
static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to compute the in-lane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isX86CCSigned(X86::CondCode X86CC)
Return true if the condition is a signed comparison operation.
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt=0)
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT, int Scale, int Offset, unsigned ExtOpc, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as an any/signed/zero extension.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a VSETCC 256/512-bit vector into two new 128/256 ones and then concatenate the result back.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isBlendOrUndef(ArrayRef< int > Mask)
Return true if every element in Mask is an in-place blend/select mask or is undef.
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
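Presumably this packs the four 2-bit lane selectors into the usual PSHUFD/SHUFPS-style immediate, where result element i is taken from source element imm[2*i+1:2*i]. A minimal sketch of that encoding, assuming an in-range 4-element mask (the real helper also has to cope with undef entries):

#include <cstdint>

// Pack a 4-element shuffle mask (values 0..3) into an 8-bit immediate,
// 2 bits per lane, lane 0 in the low bits -- the PSHUFD-style encoding.
static uint8_t shuffleImm4(const int mask[4]) {
  uint8_t imm = 0;
  for (int i = 0; i < 4; ++i)
    imm |= static_cast<uint8_t>((mask[i] & 0x3) << (2 * i));
  return imm;
}
// e.g. {3, 2, 1, 0} -> 0x1B, a full lane reversal.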
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit into a mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if opcode is an X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static cl::opt< bool > WidenShift("x86-widen-shift", cl::init(true), cl::desc("Replace narrow shifts with wider shifts."), cl::Hidden)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
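The scalar shape of that pattern: clamp to the destination type's signed range with smax/smin, then truncate. A sketch for i32 -> i8, as an illustration rather than the matcher itself:

#include <algorithm>
#include <cstdint>

// truncate(smin(smax(x, -128), 127)) : i32 -> i8 with signed saturation.
static int8_t ssat_trunc_i8(int32_t x) {
  return static_cast<int8_t>(std::clamp(x, -128, 127));
}
// e.g. ssat_trunc_i8(1000) == 127, ssat_trunc_i8(-1000) == -128.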
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Truncating Store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector and a zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask, bool ForceHorizOp)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext); zext(add_nuw(x, C)) --> add(zext(x), C_zext). Promoting a...
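As a quick sanity check of why that rewrite is sound (an illustration only, not code from this file): when the narrow add cannot wrap its signed/unsigned range, extending before or after the add produces the same value, so the constant can simply be pre-extended. The values below are chosen arbitrarily.

#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

int main() {
  // i8 operands whose sum stays in signed range, i.e. the add is 'nsw'.
  APInt X(8, 100), C(8, 20);
  assert((X + C).sext(32) == X.sext(32) + C.sext(32)); // sext(add_nsw) == add(sext, sext)
  // i8 operands whose sum stays in unsigned range, i.e. the add is 'nuw'.
  APInt Y(8, 200), D(8, 50);
  assert((Y + D).zext(32) == Y.zext(32) + D.zext(32)); // zext(add_nuw) == add(zext, zext)
  return 0;
}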
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
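For intuition, the round-half-away-from-zero rule matches std::round; the snippet below is purely an illustration of those semantics, not part of the lowering.

#include <cassert>
#include <cmath>

int main() {
  // Ties round away from zero, which is the FROUND behaviour described above.
  assert(std::round(2.5) == 3.0);
  assert(std::round(-2.5) == -3.0);
  assert(std::round(2.4) == 2.0);
  return 0;
}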
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of a ISD::CondCode to the X86-specific condition code,...
static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned, const X86Subtarget &Subtarget)
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
ShrinkMode
Different mul shrinking modes.
static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDNodeFlags Flags=SDNodeFlags())
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static unsigned getSHUFPDImm(ArrayRef< int > Mask)
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
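A minimal sketch of the property being tested (hypothetical helper name, not the implementation above): the top half of the mask must repeat the bottom half entry for entry.

#include "llvm/ADT/ArrayRef.h"
#include <cassert>
using namespace llvm;

// Hypothetical helper: true if Mask's top half repeats its bottom half.
static bool hasIdenticalHalvesSketch(ArrayRef<int> Mask) {
  unsigned Half = Mask.size() / 2;
  for (unsigned I = 0; I != Half; ++I)
    if (Mask[I] != Mask[I + Half])
      return false;
  return true;
}

int main() {
  assert(hasIdenticalHalvesSketch({0, 2, 0, 2}));
  assert(!hasIdenticalHalvesSketch({0, 2, 1, 3}));
  return 0;
}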
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG)
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a unary integer operation into 2 half-sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget, TargetLowering::DAGCombinerInfo &DCI)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
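A standalone sketch of that "complete permute" test, assuming the documented semantics (every source index 0..N-1 of the single input must appear somewhere in the mask); the helper name is hypothetical.

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include <cassert>
using namespace llvm;

// Hypothetical helper: true if every element index of the input is referenced.
static bool isCompletePermuteSketch(ArrayRef<int> Mask) {
  unsigned NumElts = Mask.size();
  APInt Referenced = APInt::getZero(NumElts);
  for (int M : Mask)
    if (0 <= M && M < (int)NumElts)
      Referenced.setBit(M);
  return Referenced.isAllOnes();
}

int main() {
  assert(isCompletePermuteSketch({3, 2, 1, 0}));  // a full reversal
  assert(!isCompletePermuteSketch({0, 0, 1, 2})); // element 3 never used
  return 0;
}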
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific DAG combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, unsigned RootOpc, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses, bool AllowSubAddOrAddSubContract)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific DAG combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half-sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
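In this file's shuffle-mask encoding, undef and forced-zero lanes are negative sentinels (SM_SentinelUndef and SM_SentinelZero). A sketch of the "any zero sentinel" scan, with the sentinel values assumed locally rather than taken from the header:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include <cassert>
using namespace llvm;

// Sentinel values assumed to mirror the shuffle-mask encoding used here.
enum { SentinelUndef = -1, SentinelZero = -2 };

// Hypothetical helper: does any lane demand a zero element?
static bool anyZeroSketch(ArrayRef<int> Mask) {
  return llvm::any_of(Mask, [](int M) { return M == SentinelZero; });
}

int main() {
  assert(anyZeroSketch({0, SentinelZero, 2, 3}));
  assert(!anyZeroSketch({0, SentinelUndef, 2, 3}));
  return 0;
}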
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z)). This undoes the inverse fold performed in InstCom...
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128-bits from a vector > 128 bits.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, unsigned RootOpcode, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using a natively supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
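A sketch of that sequential-or-undef check, assuming the documented semantics (within [Pos, Pos+Size) every defined entry must equal Low plus its offset times Step); the helper name is hypothetical.

#include "llvm/ADT/ArrayRef.h"
#include <cassert>
using namespace llvm;

// Hypothetical helper mirroring the documented behaviour.
static bool isSequentialOrUndefSketch(ArrayRef<int> Mask, unsigned Pos,
                                      unsigned Size, int Low, int Step = 1) {
  for (unsigned I = 0; I != Size; ++I) {
    int M = Mask[Pos + I];
    if (M >= 0 && M != Low + (int)I * Step)
      return false;
  }
  return true;
}

int main() {
  // Lanes 1..2 should select 5,6 (Low = 5); an undef lane is accepted.
  assert(isSequentialOrUndefSketch({0, 5, -1, 9}, 1, 2, 5));
  assert(!isSequentialOrUndefSketch({0, 5, 7, 9}, 1, 2, 5));
  return 0;
}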
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code?
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
auto IsFreeTruncation
static const unsigned FramePtr
The Input class is used to parse a YAML document into in-memory structs and vectors.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
static LLVM_ABI APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcast from an all-ones integer value.
Definition APFloat.cpp:6082
void clearSign()
Definition APFloat.h:1298
opStatus next(bool nextDown)
Definition APFloat.h:1254
void changeSign()
Definition APFloat.h:1297
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1406
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition APInt.cpp:520
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:206
void setBit(unsigned BitPosition)
Set the bit at the position given by "bitPosition" to 1.
Definition APInt.h:1330
APInt abs() const
Get the absolute value.
Definition APInt.h:1795
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
void setSignBit()
Set the sign bit to 1.
Definition APInt.h:1340
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:209
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition APInt.h:216
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:329
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition APInt.h:1249
bool eq(const APInt &RHS) const
Equality comparison.
Definition APInt.h:1079
int32_t exactLogBase2() const
Definition APInt.h:1783
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1396
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition APInt.h:834
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:435
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition APInt.h:1628
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1598
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:219
unsigned countTrailingZeros() const
Definition APInt.h:1647
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1531
void flipAllBits()
Toggle every bit to its opposite value.
Definition APInt.h:1452
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1615
LLVM_ABI void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition APInt.cpp:397
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition APInt.h:1435
unsigned logBase2() const
Definition APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
void setAllBits()
Set every bit to 1.
Definition APInt.h:1319
bool getBoolValue() const
Convert APInt to a boolean value.
Definition APInt.h:471
bool isMask(unsigned numBits) const
Definition APInt.h:488
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:405
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:334
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1150
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
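A short illustration of the extension and truncation helpers listed above; the values are chosen only for the example.

#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

int main() {
  APInt Neg(8, 0xF0);                        // bit pattern of -16 as an i8
  assert(Neg.sext(16) == APInt(16, 0xFFF0)); // sign extension keeps the value
  assert(Neg.zext(16) == APInt(16, 0x00F0)); // zero extension keeps the bits
  assert(Neg.zext(16).trunc(8) == Neg);      // truncation round-trips
  return 0;
}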
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1367
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition APInt.h:873
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
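The counting and query helpers listed above, exercised in one small example (constants chosen arbitrarily):

#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

int main() {
  APInt V(32, 0x00F0);
  assert(V.popcount() == 4);    // four bits set
  assert(V.countr_zero() == 4); // trailing zero bits
  assert(V.countl_zero() == 24);// leading zero bits
  assert(!V.isMask(8) && APInt(32, 0xFF).isMask(8));
  assert(APInt(32, 64).isPowerOf2() && APInt(32, 64).logBase2() == 6);
  return 0;
}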
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition APInt.h:341
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:200
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1388
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
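A small example of the bit extraction/insertion helpers above (constants chosen arbitrarily):

#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

int main() {
  APInt V(32, 0xABCD1234);
  assert(V.extractBits(16, 8) == APInt(16, 0xCD12)); // bits [8, 24)
  assert(V.extractBitsAsZExtValue(16, 8) == 0xCD12); // same range as uint64_t
  APInt W = V;
  W.insertBits(APInt(16, 0xFFFF), 8);                // splice back in at bit 8
  assert(W == APInt(32, 0xABFFFF34));
  return 0;
}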
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:432
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
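The mask-building factories listed above, exercised on a 16-bit width (illustration only):

#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

int main() {
  assert(APInt::getHighBitsSet(16, 4) == APInt(16, 0xF000)); // top 4 bits
  assert(APInt::getLowBitsSet(16, 4) == APInt(16, 0x000F));  // bottom 4 bits
  assert(APInt::getBitsSet(16, 4, 8) == APInt(16, 0x00F0));  // bits [4, 8)
  assert(APInt::getOneBitSet(16, 3) == APInt(16, 0x0008));   // single bit 3
  return 0;
}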
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1656
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition APInt.h:399
LLVM_ABI APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition APInt.cpp:973
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:183
iterator end() const
Definition ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:206
iterator begin() const
Definition ArrayRef.h:135
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
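A short usage sketch for the ArrayRef views above (data chosen arbitrarily):

#include "llvm/ADT/ArrayRef.h"
#include <cassert>
using namespace llvm;

int main() {
  int Data[] = {0, 1, 2, 3, 4, 5};
  ArrayRef<int> A(Data);
  assert(A.size() == 6 && !A.empty());
  assert(A.slice(2, 3).equals({2, 3, 4}));     // skip 2 elements, keep 3
  assert(A.drop_back(2).equals({0, 1, 2, 3})); // drop the last 2 elements
  return 0;
}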
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
An instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v). usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v). minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v). maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
This is an SDNode representing atomic operations.
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
size_type count() const
count - Returns the number of bits which are set.
Definition BitVector.h:181
bool any() const
any - Returns true if any bit is set.
Definition BitVector.h:189
bool none() const
none - Returns true if none of the bits are set.
Definition BitVector.h:207
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI bool isConstant() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ ICMP_NE
not equal
Definition InstrTypes.h:700
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:767
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static LLVM_ABI Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
LLVM_ABI Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:167
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:237
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:222
Tagged union holding either a T or a Error.
Definition Error.h:485
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition Function.cpp:774
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:270
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition Function.h:903
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
LLVM_ABI bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition Globals.cpp:436
ThreadLocalMode getThreadLocalMode() const
Module * getParent()
Get the module that this global value is contained inside of...
This class is used to form a handle around another node that is persistent and is updated across invo...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
Class to represent integer types.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
bool usesWindowsCFI() const
Definition MCAsmInfo.h:652
LLVM_ABI MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
LLVM_ABI MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool bitsGE(MVT VT) const
Return true if this has no less bits than VT.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
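A small illustration of the MVT queries above; the header path is assumed for recent LLVM releases (it has moved between versions).

#include "llvm/CodeGenTypes/MachineValueType.h"
#include <cassert>
using namespace llvm;

int main() {
  MVT VT = MVT::v8i16;
  assert(VT.isVector() && VT.is128BitVector());
  assert(VT.getVectorNumElements() == 8);
  assert(VT.getScalarSizeInBits() == 16);
  assert(VT.getHalfNumVectorElementsVT() == MVT::v4i16);
  assert(VT.changeVectorElementType(MVT::f16) == MVT::v8f16);
  return 0;
}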
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
succ_reverse_iterator succ_rbegin()
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
void moveAdditionalCallInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New call site info.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
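The MachineInstrBuilder helpers above are normally chained off BuildMI. A minimal sketch, not code from this file; the opcode and register class are placeholders passed in by the caller:

#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

// Emit "Dst = <AddOpc> Src, 8" in front of MBBI and return the new virtual register.
static Register emitAddImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                           const DebugLoc &DL, const TargetInstrInfo *TII,
                           const TargetRegisterClass *RC, Register Src,
                           unsigned AddOpc /* e.g. a reg+imm add opcode */) {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  Register Dst = MRI.createVirtualRegister(RC); // fresh virtual register
  BuildMI(MBB, MBBI, DL, TII->get(AddOpc), Dst)
      .addReg(Src)   // register use operand
      .addImm(8);    // immediate operand
  return Dst;
}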
Representation of each machine instruction.
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of a block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
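A hedged sketch of how the flags above combine with MachineFunction::getMachineMemOperand (signature listed earlier) to describe a plain 32-bit load from a fixed stack slot; the frame index and alignment are placeholders:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

static MachineMemOperand *describeStackLoad(MachineFunction &MF, int FI) {
  // Pointer info names the fixed stack object; MOLoad marks a read-only access of an i32.
  return MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
                                 MachineMemOperand::MOLoad, LLT::scalar(32),
                                 Align(4));
}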
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition Module.cpp:353
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
SDNode * getGluedUser() const
If this node has a glue value with a user, return the user (there is at most one).
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
SDNodeFlags getFlags() const
TypeSize getValueSizeInBits(unsigned ResNo) const
Returns MVT::getSizeInBits(getValueType(ResNo)).
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static LLVM_ABI bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any uses of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
Get the SDNode which holds the desired result.
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
Get the index which selects a specific result in the SDNode.
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
virtual bool isTargetStrictFPOpcode(unsigned Opcode) const
Returns true if a node with the given target-specific opcode has strict floating-point semantics.
Help to insert SDNodeFlags automatically in transforming.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
LLVM_ABI SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
LLVM_ABI SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getExtractSubvector(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Return the VT typed sub-vector of Vec at Idx.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getInsertSubvector(const SDLoc &DL, SDValue Vec, SDValue SubVec, unsigned Idx)
Insert SubVec at the Idx element of Vec.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
LLVM_ABI bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
LLVM_ABI SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI std::optional< unsigned > getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has a uniform shift amount that is less than the element bit-width of the shi...
LLVM_ABI SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node that starts a new call frame, in which InSize bytes are set up inside ...
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, SDNodeFlags Flags=SDNodeFlags())
LLVM_ABI std::optional< unsigned > getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has shift amounts that are all less than the element bit-width of the shift n...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getSplat(EVT VT, const SDLoc &DL, SDValue Op)
Returns a node representing a splat of one value into all lanes of the provided vector type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
LLVM_ABI SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
static LLVM_ABI bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
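A small sketch of how the SelectionDAG builders listed above are typically combined in a lowering or combine routine, building (add (shl X, 4), 1) next to an existing node N; it is illustrative, not a transform performed in this file:

#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

static SDValue buildShlAddOne(SelectionDAG &DAG, SDNode *N) {
  SDLoc DL(N);                          // reuse N's debug location
  EVT VT = N->getValueType(0);
  SDValue X = N->getOperand(0);
  SDValue Amt = DAG.getShiftAmountConstant(4, VT, DL);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, Amt);
  return DAG.getNode(ISD::ADD, DL, VT, Shl, DAG.getConstant(1, DL, VT));
}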
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
static int getSplatMaskIndex(ArrayRef< int > Mask)
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
size_type size() const
Definition SmallSet.h:170
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
pointer data()
Return a pointer to the vector's buffer, even if empty().
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
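A standalone sketch of the small containers listed above, de-duplicating values while preserving order, a pattern that recurs throughout this file; the element type is arbitrary:

#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"

using namespace llvm;

static void collectUnique(const SmallVectorImpl<int> &In,
                          SmallVectorImpl<int> &Out) {
  SmallSet<int, 8> Seen;                // optimized for a handful of elements
  for (int V : In)
    if (Seen.insert(V).second)          // .second is true only for values not seen before
      Out.push_back(V);
}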
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
static constexpr size_t npos
Definition StringRef.h:57
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition StringRef.h:172
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
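A standalone sketch of the StringRef/StringSwitch helpers listed above, mapping a register-name-like string to a small code; the names and values are purely illustrative:

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

using namespace llvm;

static int classifyName(StringRef Name) {
  if (Name.empty())
    return -1;
  if (Name.equals_insensitive("sp"))    // case-insensitive comparison
    return 99;
  return StringSwitch<int>(Name)
      .Case("eax", 0)
      .Case("ebx", 1)
      .Case("ecx", 2)
      .Default(-1);
}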
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
virtual bool hasAndNot(SDValue X) const
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.stacksave/llvm.stackrestore should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.stacksave/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
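The TargetLoweringBase hooks above are normally invoked from the constructor of a TargetLowering subclass, where the protected members and a Subtarget reference are in scope. A hedged fragment of that pattern; the specific actions chosen below are illustrative, not this file's actual configuration:

addRegisterClass(MVT::i32, &X86::GR32RegClass);        // i32 values live in GR32
setOperationAction(ISD::CTPOP,  MVT::i32, Expand);     // assume no native popcount here
setOperationAction(ISD::SELECT, MVT::i32, Custom);     // routed through LowerOperation
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i1, Promote);
setTargetDAGCombine(ISD::AND);                         // request PerformDAGCombine callbacks for AND
computeRegisterProperties(Subtarget.getRegisterInfo()); // finalize derived register properties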
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual unsigned combineRepeatedFPDivisors() const
Indicate whether this target prefers to combine FDIVs with the same divisor.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSBinFormatCOFF() const
Tests whether the OS uses the COFF binary format.
Definition Triple.h:774
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
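A small sketch of the IR Type queries listed above, choosing at least a 32-bit integer type for a hypothetical widening decision:

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"

using namespace llvm;

static Type *widenToAtLeastI32(Type *Ty, LLVMContext &Ctx) {
  if (Ty->isIntegerTy() && Ty->getScalarSizeInBits() < 32)
    return Type::getInt32Ty(Ctx);       // widen narrow integers
  return Ty;                            // leave everything else untouched
}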
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
use_iterator use_begin()
Definition Value.h:364
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
Register getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
bool hasBasePointer(const MachineFunction &MF) const
Register getPtrSizedFrameRegister(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
Register getPtrSizedStackRegister(const MachineFunction &MF) const
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
bool hasAnyFMA() const
bool hasSSE1() const
bool avoidMFence() const
Avoid use of mfence for fence seq_cst, and instead use lock or.
bool hasBitScanPassThrough() const
bool hasSSE42() const
const X86TargetLowering * getTargetLowering() const override
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
bool canUseCMOV() const
bool isTargetDarwin() const
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
const X86InstrInfo * getInstrInfo() const override
bool useAVX512Regs() const
bool hasSSE3() const
bool isCallingConvWin64(CallingConv::ID CC) const
bool hasAVX512() const
bool canExtendTo512DQ() const
bool hasSSE41() const
bool hasSSE2() const
bool hasSSSE3() const
bool hasInt256() const
const X86RegisterInfo * getRegisterInfo() const override
bool hasAVX() const
unsigned getPreferVectorWidth() const
const X86FrameLowering * getFrameLowering() const override
bool useBWIRegs() const
bool hasAVX2() const
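A minimal sketch of how lowering code consults the X86Subtarget predicates listed above to decide whether a given vector width is usable; the exact policy shown is illustrative only:

#include "X86Subtarget.h"

using namespace llvm;

static bool vectorWidthIsUsable(const X86Subtarget &Subtarget, MVT VT) {
  if (VT.is512BitVector())
    return Subtarget.useAVX512Regs();   // 512-bit ops need AVX-512 with wide registers enabled
  if (VT.is256BitVector())
    return Subtarget.hasAVX2();         // 256-bit integer ops generally want AVX2
  return Subtarget.hasSSE2();           // 128-bit baseline
}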
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const override
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns true if it is reasonable to merge stores to MemVT size.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y ---> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const override
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set in interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return preferred fold type, Abs if this is a vector, AddAnd if it's an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the target's addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isTargetCanonicalSelect(SDNode *N) const override
Return true if the given select/vselect should be considered canonical and not be transformed.
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return true if the target supports combining an 'and' with a mask followed by a compare of the result against zero into a single instruction.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
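The entries above are the X86 overrides of the generic TargetLowering hooks. As a rough illustration (a minimal sketch, not code from this file; the helper name is made up), such hooks are normally reached through the TargetLowering object attached to the SelectionDAG rather than by naming X86TargetLowering directly:

  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/CodeGen/TargetLowering.h"
  #include <cstdint>

  // Minimal sketch: query a target hook through the DAG's TargetLowering.
  static bool immIsLegalForICmp(const llvm::SelectionDAG &DAG, int64_t Imm) {
    const llvm::TargetLowering &TLI = DAG.getTargetLoweringInfo();
    return TLI.isLegalICmpImmediate(Imm);
  }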
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:201
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define INT64_MIN
Definition DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A.
Definition APInt.cpp:3009
@ COND_NE
Not equal.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ X86_ThisCall
Similar to X86_StdCall.
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
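As a small illustration (a sketch, not taken from this file; the helper name is illustrative), calling-convention IDs like these are compared against Function::getCallingConv() when lowering calls and returns:

  #include "llvm/IR/CallingConv.h"
  #include "llvm/IR/Function.h"

  // Minimal sketch: detect the Win32 stdcall convention on an IR function.
  static bool usesStdCall(const llvm::Function &F) {
    return F.getCallingConv() == llvm::CallingConv::X86_StdCall;
  }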
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:525
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition ISDOpcodes.h:140
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:167
@ GlobalAddress
Definition ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition ISDOpcodes.h:892
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition ISDOpcodes.h:151
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:706
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:117
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:809
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:663
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ STRICT_FMAXIMUM
Definition ISDOpcodes.h:463
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition ISDOpcodes.h:130
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition ISDOpcodes.h:881
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:477
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:701
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition ISDOpcodes.h:690
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition ISDOpcodes.h:903
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:927
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:713
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
bool isExtVecInRegOpcode(unsigned Opcode)
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
LLVM_ABI bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false, bool AllowTruncation=false)
Hook for matching ConstantSDNode predicate.
LLVM_ABI bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
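A minimal sketch (not from this file; the helper name is illustrative) of how the generic ISD opcodes and the ISD:: predicate helpers above are typically combined when inspecting a DAG node:

  #include "llvm/CodeGen/SelectionDAGNodes.h"

  // Sketch: is V a ZERO_EXTEND whose input is a plain (non-extending,
  // unindexed) load?
  static bool isZExtOfNormalLoad(llvm::SDValue V) {
    return V.getOpcode() == llvm::ISD::ZERO_EXTEND &&
           llvm::ISD::isNormalLoad(V.getOperand(0).getNode());
  }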
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinOpPred_match< LHS, RHS, is_bitwiselogic_op > m_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
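A minimal sketch of the IR-level PatternMatch combinators above (the helper name and the chosen pattern are illustrative, not taken from this file):

  #include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Value.h"

  // Sketch: recognise a bitwise 'not', i.e. (xor X, -1) with the operands in
  // either order, binding X on success.
  static bool matchNot(llvm::Value *V, llvm::Value *&X) {
    using namespace llvm::PatternMatch;
    return match(V, m_c_Xor(m_Value(X), m_AllOnes()));
  }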
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Opcode_match m_Opc(unsigned Opcode)
BinaryOpc_match< LHS, RHS > m_Srl(const LHS &L, const RHS &R)
auto m_SpecificVT(EVT RefVT, const Pattern &P)
Match a specific ValueType.
TernaryOpc_match< LHS, RHS, IDX > m_InsertSubvector(const LHS &Base, const RHS &Sub, const IDX &Idx)
UnaryOpc_match< Opnd > m_Abs(const Opnd &Op)
Or< Preds... > m_AnyOf(const Preds &...preds)
And< Preds... > m_AllOf(const Preds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_SetCC(const T0_P &LHS, const T1_P &RHS, const T2_P &CC)
UnaryOpc_match< Opnd > m_AnyExt(const Opnd &Op)
auto m_Node(unsigned Opcode, const OpndPreds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_VSelect(const T0_P &Cond, const T1_P &T, const T2_P &F)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
CondCode_match m_SpecificCondCode(ISD::CondCode CC)
Match a conditional code SDNode with a specific ISD::CondCode.
auto m_SpecificVectorElementVT(EVT RefVT, const Pattern &P)
Match a vector ValueType.
CondCode_match m_CondCode()
Match any conditional code SDNode.
ConstantInt_match m_ConstInt()
Match any integer constants or splat of an integer constant.
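A minimal sketch of the SelectionDAG pattern matchers above (the helper name is illustrative, and the m_Value() wildcard is assumed to be available from SDPatternMatch alongside the matchers listed here):

  #include "llvm/CodeGen/SDPatternMatch.h"

  // Sketch: match a logical shift right by a constant (or splatted constant)
  // amount.
  static bool isSrlByConstant(llvm::SDNode *N) {
    using namespace llvm::SDPatternMatch;
    return sd_match(N, m_Srl(m_Value(), m_ConstInt()));
  }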
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
Invariant opcodes: All instruction sets have these as their low opcodes.
@ X86
Windows x64, Windows Itanium (IA-64)
Definition MCAsmInfo.h:50
@ PTR32_UPTR
Definition X86.h:217
@ PTR64
Definition X86.h:218
@ PTR32_SPTR
Definition X86.h:216
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FMAX
Floating point max and min.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and a FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of a MMX vector and zero out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of a MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
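The X86ISD opcodes above are target-specific SelectionDAG nodes. A minimal sketch (helper name illustrative; note that X86ISelLowering.h is internal to the X86 backend, so this only builds inside it) of telling them apart from generic ISD opcodes:

  #include "X86ISelLowering.h"                 // X86 backend internal header
  #include "llvm/CodeGen/SelectionDAGNodes.h"

  // Sketch: does this value come from the X86-specific SETCC node rather than
  // the generic ISD::SETCC?
  static bool isX86SetCC(llvm::SDValue V) {
    return V.getOpcode() == llvm::X86ISD::SETCC;
  }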
Define some predicates that are used for node matching.
@ AddrNumOperands
Definition X86BaseInfo.h:36
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
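For example (a sketch; these helpers live in the X86 backend's own headers such as X86InstrInfo.h and X86BaseInfo.h, and the wrapper name is illustrative):

  #include "X86InstrInfo.h"   // X86 backend internal header

  // Sketch: invert an X86 condition code, e.g. COND_E (equal) -> COND_NE.
  static llvm::X86::CondCode invertCC(llvm::X86::CondCode CC) {
    return llvm::X86::GetOppositeBranchCondition(CC);
  }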
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
int getRoundingModeX86(unsigned RM)
Convert LLVM rounding mode to X86 rounding mode.
int getCCMPCondFlagsFromCondCode(CondCode CC)
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true of the given offset can be fit into displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
initializer< Ty > init(const Ty &Val)
constexpr double e
Definition MathExtras.h:47
@ User
could "use" a pointer
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:262
@ Offset
Definition DWP.cpp:477
@ Length
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1731
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
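A minimal sketch of the range-based STLExtras helpers (the helper name and data are illustrative):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"

  // Sketch: check that every element of a shuffle mask is non-negative
  // (i.e. no sentinel/undef entries), without writing begin()/end() pairs.
  static bool maskHasNoUndefs(llvm::ArrayRef<int> Mask) {
    return llvm::all_of(Mask, [](int M) { return M >= 0; });
  }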
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition Utils.cpp:1607
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:361
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool isIntOrFPConstant(SDValue V)
Return true if V is either a integer or FP constant.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
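A minimal sketch of one of these shuffle-mask decoders (the decoders are declared in the backend's MCTargetDesc/X86ShuffleDecode.h; the expected output assumes the usual BLEND semantics, where a set immediate bit selects the second source):

  #include "MCTargetDesc/X86ShuffleDecode.h"   // X86 backend internal header
  #include "llvm/ADT/SmallVector.h"

  // Sketch: decode a 4-lane BLEND immediate of 0b0101 into a shuffle mask.
  static void decodeExample() {
    llvm::SmallVector<int, 4> Mask;
    llvm::DecodeBLENDMask(/*NumElts=*/4, /*Imm=*/0b0101, Mask);
    // Expected mask: {4, 1, 6, 3}, lanes 0 and 2 come from the second source.
  }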
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
auto unique(Range &&R, Predicate P)
Definition STLExtras.h:2056
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition Utils.cpp:1589
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:314
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1757
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:348
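A minimal sketch of the MathExtras helpers above, a typical strength-reduction style check (the helper name is illustrative):

  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  #include <cstdint>

  // Sketch: convert a known power-of-two multiplier into a shift amount,
  // e.g. 4096 -> 12.
  static unsigned mulToShiftAmount(uint64_t C) {
    assert(llvm::isPowerOf2_64(C) && "expected a power of two");
    return llvm::Log2_64(C);
  }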
LLVM_ABI bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask, bool SrcIsMem)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, Register Reg)
Replace the address used in the instruction with the direct memory reference.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:186
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:754
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
LLVM_ABI bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:222
bool isAlpha(char C)
Checks if character C is a valid letter as classified by "C" locale.
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
LLVM_ABI void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI SDValue peekThroughTruncates(SDValue V)
Return the non-truncated source operand of V if it exists.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1719
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
@ Default
-O2, -Os
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
@ Success
The lock was released successfully.
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
Definition ModRef.h:34
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:1974
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
To bit_cast(const From &from) noexcept
Definition bit.h:90
void replace(R &&Range, const T &OldValue, const T &NewValue)
Provide wrappers to std::replace which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1840
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
LLVM_ABI bool isNullConstantOrUndef(SDValue V)
Returns true if V is a constant integer zero or an UNDEF node.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1934
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from an machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
bool isPhysRegUsedAfter(Register Reg, MachineBasicBlock::iterator MBI)
Check if physical register Reg is used after MBI.
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1815
constexpr unsigned BitWidth
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
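An illustrative sketch of the unpack decoding above; the argument values (a 128-bit unpcklps) are an assumption, not taken from this file:
  SmallVector<int, 8> Mask;
  DecodeUNPCKLMask(/*NumElts=*/4, /*ScalarBits=*/32, Mask);
  // Mask == {0, 4, 1, 5}: the low halves of the two sources are interleaved.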
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1941
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, Register Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction – th...
static uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
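A brief sketch of the range helpers listed above (is_contained and count_if); the vector contents are illustrative:
  SmallVector<unsigned, 4> Regs = {1, 2, 3, 8};
  bool HasEight = llvm::is_contained(Regs, 8u);                           // true
  auto NumSmall = llvm::count_if(Regs, [](unsigned R) { return R < 4; }); // 3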
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2088
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_CAST_MMX
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
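A minimal sketch of seq, here used to build a sequential mask; the bounds are illustrative:
  SmallVector<int, 8> SeqMask;
  for (int I : llvm::seq(0, 4)) // visits 0, 1, 2, 3; the End value is excluded
    SeqMask.push_back(I);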
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition STLExtras.h:1584
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:299
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, Register Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
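Small worked examples for the bit utilities above (bit_floor and popcount); the constants are illustrative:
  unsigned Floor = llvm::bit_floor(10u); // 8, the largest power of two <= 10
  int SetBits = llvm::popcount(0xF0u);   // 4 set bits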
@ SM_SentinelUndef
@ SM_SentinelZero
LLVM_ABI bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Attempt to narrow/widen the Mask shuffle mask to the NumDstElts target width.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
#define EQ(a, b)
Definition regexec.c:65
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:266
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static constexpr roundingMode rmTowardZero
Definition APFloat.h:308
static LLVM_ABI const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition APFloat.cpp:289
static LLVM_ABI const fltSemantics & IEEEquad() LLVM_READNONE
Definition APFloat.cpp:268
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition APFloat.cpp:324
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
Definition APFloat.cpp:267
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
static LLVM_ABI const fltSemantics & BFloat() LLVM_READNONE
Definition APFloat.cpp:265
opStatus
IEEE-754R 7: Default exception handling.
Definition APFloat.h:320
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition ValueTypes.h:217
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition ValueTypes.h:212
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
static LLVM_ABI KnownBits sadd_sat(const KnownBits &LHS, const KnownBits &RHS)
Compute knownbits resulting from llvm.sadd.sat(LHS, RHS)
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition KnownBits.h:186
static LLVM_ABI KnownBits mulhu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits from zero-extended multiply-hi.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:242
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned countMaxTrailingZeros() const
Returns the maximum number of trailing zero bits possible.
Definition KnownBits.h:274
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition KnownBits.h:289
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition KnownBits.h:86
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
bool isConstant() const
Returns true if we know the value of all bits.
Definition KnownBits.h:54
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition KnownBits.h:111
static LLVM_ABI KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:296
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition KnownBits.h:196
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:145
static LLVM_ABI KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition KnownBits.cpp:60
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition KnownBits.h:92
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
static LLVM_ABI std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
bool isAllOnes() const
Returns true if value is all one bits.
Definition KnownBits.h:83
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition KnownBits.h:60
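A compact sketch of the KnownBits interface documented above; the bit patterns are illustrative assumptions:
  KnownBits C = KnownBits::makeConstant(APInt(8, 0x0F));
  // C.isConstant() is true and C.getConstant() == 0x0F.
  KnownBits K(8);
  K.Zero.setHighBits(4);       // only the top nibble is known (to be zero)
  // K.countMinLeadingZeros() == 4, and K.isConstant() is false.
  KnownBits Wide = K.zext(16); // the extended bits become known zero
  // Wide.countMinLeadingZeros() == 12.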
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
bool hasNoSignedZeros() const
void setNoSignedWrap(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI bool recursivelyDeleteUnusedNodes(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
This structure is used to pass arguments to makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
X86AddressMode - This struct holds a generalized full x86 address mode.