X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
51#include "llvm/IR/IRBuilder.h"
53#include "llvm/IR/Intrinsics.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
60#include "llvm/Support/Debug.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
74 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
76 "Sets the preferable loop alignment for experiments (as log2 bytes) "
77 "for innermost loops only. If specified, this option overrides "
78 "alignment set by x86-experimental-pref-loop-alignment."),
80
82 "x86-br-merging-base-cost", cl::init(2),
84 "Sets the cost threshold for when multiple conditionals will be merged "
85 "into one branch versus split into multiple branches. Merging "
86 "conditionals saves branches at the cost of additional instructions. "
87 "This value sets the instruction cost limit, below which conditionals "
88 "will be merged, and above which conditionals will be split. Set to -1 "
89 "to never merge branches."),
91
93 "x86-br-merging-ccmp-bias", cl::init(6),
94 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
95 "supports conditional compare instructions."),
97
98static cl::opt<bool>
99 WidenShift("x86-widen-shift", cl::init(true),
100 cl::desc("Replace narrow shifts with wider shifts."),
101 cl::Hidden);
102
104 "x86-br-merging-likely-bias", cl::init(0),
105 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if it's known that a == b is "
108 "likely, then it is likely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to increase "
110 "the instruction cost threshold. Set to -1 to never merge likely "
111 "branches."),
112 cl::Hidden);
113
115 "x86-br-merging-unlikely-bias", cl::init(-1),
116 cl::desc(
117 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
118 "that all conditionals will be executed. For example for merging "
119 "the conditionals (a == b && c > d), if it's known that a == b is "
120 "unlikely, then it is unlikely that if the conditionals are split "
121 "both sides will be executed, so it may be desirable to decrease "
122 "the instruction cost threshold. Set to -1 to never merge unlikely "
123 "branches."),
124 cl::Hidden);
125
127 "mul-constant-optimization", cl::init(true),
128 cl::desc("Replace 'mul x, Const' with more effective instructions like "
129 "SHIFT, LEA, etc."),
130 cl::Hidden);
131
132 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
133                                      const X86Subtarget &STI)
134 : TargetLowering(TM), Subtarget(STI) {
135 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
136 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
137
138 // Set up the TargetLowering object.
139
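// A note on the LegalizeAction values used throughout this constructor:
// Legal means the target has a native pattern, Promote widens the operation
// to a larger type first, Expand lets the legalizer rewrite it in terms of
// other operations (or a libcall for some FP nodes), Custom routes it through
// this target's LowerOperation/ReplaceNodeResults hooks, and LibCall forces a
// runtime library call.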
140 // X86 is weird. It always uses i8 for shift amounts and setcc results.
142 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
144
145 // X86 instruction cache is coherent with its data cache so we can use the
146 // default expansion to a no-op.
148
149 // For 64-bit, since we have so many registers, use the ILP scheduler.
150 // For 32-bit, use the register pressure specific scheduling.
151 // For Atom, always use ILP scheduling.
152 if (Subtarget.isAtom())
154 else if (Subtarget.is64Bit())
156 else
158 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
159 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
160
161 // Bypass expensive divides and use cheaper ones.
162 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
163 if (Subtarget.hasSlowDivide32())
164 addBypassSlowDiv(32, 8);
165 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
166 addBypassSlowDiv(64, 32);
167 }
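// addBypassSlowDiv(32, 8) asks the BypassSlowDivision pass to guard 32-bit
// divides with a runtime width check and use the much cheaper 8-bit divide
// when both operands fit. Roughly (a sketch of the idea, not the exact IR):
//   if (((A | B) >> 8) == 0)
//     Q = (uint8_t)A / (uint8_t)B;   // short DIV
//   else
//     Q = A / B;                     // full-width DIV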
168
169 if (Subtarget.canUseCMPXCHG16B())
171 else if (Subtarget.canUseCMPXCHG8B())
173 else
175
176 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
177
179
180 // Set up the register classes.
181 addRegisterClass(MVT::i8, &X86::GR8RegClass);
182 addRegisterClass(MVT::i16, &X86::GR16RegClass);
183 addRegisterClass(MVT::i32, &X86::GR32RegClass);
184 if (Subtarget.is64Bit())
185 addRegisterClass(MVT::i64, &X86::GR64RegClass);
186
187 for (MVT VT : MVT::integer_valuetypes())
189
190 // We don't accept any truncstore of integer registers.
191 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
192 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
193 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
194 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
195 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
196 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
197
198 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
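// Marking a truncating store Expand makes the legalizer split it into an
// explicit FP_ROUND (or integer truncate) of the value followed by a plain
// store of the narrower type.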
199
200 // SETOEQ and SETUNE require checking two conditions.
201 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
204 }
205
206 // Integer absolute.
207 if (Subtarget.canUseCMOV()) {
208 setOperationAction(ISD::ABS , MVT::i16 , Custom);
209 setOperationAction(ISD::ABS , MVT::i32 , Custom);
210 if (Subtarget.is64Bit())
211 setOperationAction(ISD::ABS , MVT::i64 , Custom);
212 }
213
214 // Absolute difference.
215 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
216 setOperationAction(Op , MVT::i8 , Custom);
217 setOperationAction(Op , MVT::i16 , Custom);
218 setOperationAction(Op , MVT::i32 , Custom);
219 if (Subtarget.is64Bit())
220 setOperationAction(Op , MVT::i64 , Custom);
221 }
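// There is no scalar absolute-difference instruction, so the Custom lowering
// for ABDS/ABDU typically builds it from a subtract plus CMOV, in the spirit
// of abdu(a, b) == a > b ? a - b : b - a.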
222
223 // Signed saturation subtraction.
227 if (Subtarget.is64Bit())
229
230 // Funnel shifts.
231 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
232 // For slow shld targets we only lower for code size.
233 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
234
235 setOperationAction(ShiftOp , MVT::i8 , Custom);
236 setOperationAction(ShiftOp , MVT::i16 , Custom);
237 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
238 if (Subtarget.is64Bit())
239 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
240 }
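// Funnel shifts concatenate two values and shift across the boundary:
// fshl(Hi, Lo, Amt) == (Hi << Amt) | (Lo >> (BitWidth - Amt)), with Amt taken
// modulo the bit width, which is exactly what SHLD/SHRD compute. On targets
// where SHLD is slow, the Custom action only uses it when optimizing for
// size, as noted above.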
241
242 if (!Subtarget.useSoftFloat()) {
243 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
244 // operation.
249 // We have an algorithm for SSE2, and we turn this into a 64-bit
250 // FILD or VCVTUSI2SS/SD for other targets.
253 // We have an algorithm for SSE2->double, and we turn this into a
254 // 64-bit FILD followed by conditional FADD for other targets.
257
258 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
259 // this operation.
262 // SSE has no i16 to fp conversion, only i32. We promote in the handler
263 // to allow f80 to use i16 and f64 to use i16 with sse1 only
266 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
269 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
270 // are Legal, f80 is custom lowered.
273
274 // Promote i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
275 // this operation.
277 // FIXME: This doesn't generate invalid exception when it should. PR44019.
283 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
284 // are Legal, f80 is custom lowered.
287
288 // Handle FP_TO_UINT by promoting the destination to a larger signed
289 // conversion.
291 // FIXME: This doesn't generate invalid exception when it should. PR44019.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300
301 setOperationAction(ISD::LRINT, MVT::f32, Custom);
302 setOperationAction(ISD::LRINT, MVT::f64, Custom);
303 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
304 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
305
306 if (!Subtarget.is64Bit()) {
307 setOperationAction(ISD::LRINT, MVT::i64, Custom);
308 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
309 }
310 }
311
312 if (Subtarget.hasSSE2()) {
313 // Custom lowering for saturating float to int conversions.
314 // We handle promotion to larger result types manually.
315 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
318 }
321 if (Subtarget.is64Bit()) {
324 }
325 }
326 if (Subtarget.hasAVX10_2()) {
331 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
332 MVT::v4i64}) {
335 }
336 if (Subtarget.is64Bit()) {
339 }
340 }
341
342 // Handle address space casts between mixed sized pointers.
343 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
344 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
345
346 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
347 if (!Subtarget.hasSSE2()) {
348 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
349 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
352 if (Subtarget.is64Bit()) {
353 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
354 // Without SSE, i64->f64 goes through memory.
355 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
356 }
357 } else if (!Subtarget.is64Bit())
358 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
359
360 // Scalar integer divide and remainder are lowered to use operations that
361 // produce two results, to match the available instructions. This exposes
362 // the two-result form to trivial CSE, which is able to combine x/y and x%y
363 // into a single instruction.
364 //
365 // Scalar integer multiply-high is also lowered to use two-result
366 // operations, to match the available instructions. However, plain multiply
367 // (low) operations are left as Legal, as there are single-result
368 // instructions for this in x86. Using the two-result multiply instructions
369 // when both high and low results are needed must be arranged by dagcombine.
370 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
377 }
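// On x86 a single DIV/IDIV leaves the quotient in *AX and the remainder in
// *DX, so exposing the two-result SDIVREM/UDIVREM form lets CSE fold x/y and
// x%y into one divide; MULHS/MULHU similarly come from the widening MUL/IMUL,
// which writes its high half to *DX.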
378
379 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
380 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
381 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
382 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
383 setOperationAction(ISD::BR_CC, VT, Expand);
385 }
386 if (Subtarget.is64Bit())
391
392 setOperationAction(ISD::FREM , MVT::f32 , Expand);
393 setOperationAction(ISD::FREM , MVT::f64 , Expand);
394 setOperationAction(ISD::FREM , MVT::f80 , Expand);
395 setOperationAction(ISD::FREM , MVT::f128 , Expand);
396
397 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
399 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
400 setOperationAction(ISD::GET_FPENV_MEM , MVT::Other, Custom);
401 setOperationAction(ISD::SET_FPENV_MEM , MVT::Other, Custom);
402 setOperationAction(ISD::RESET_FPENV , MVT::Other, Custom);
403 }
404
405 // Promote the i8 variants and force them on up to i32 which has a shorter
406 // encoding.
407 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
409 // Promote i16 as well: tzcntw has a false dependency on Intel CPUs. For BSF,
410 // we emit a REP prefix to encode it as TZCNT on modern CPUs, so it makes
411 // sense to promote that too.
412 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
414
415 if (!Subtarget.hasBMI()) {
416 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
418 if (Subtarget.is64Bit()) {
419 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
421 }
422 }
423
424 if (Subtarget.hasLZCNT()) {
425 // When promoting the i8 variants, force them to i32 for a shorter
426 // encoding.
427 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
429 } else {
430 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
431 if (VT == MVT::i64 && !Subtarget.is64Bit())
432 continue;
435 }
436 }
437
438 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
439 ISD::STRICT_FP_TO_FP16}) {
440 // Special handling for half-precision floating point conversions.
441 // If we don't have F16C support, then lower half float conversions
442 // into library calls.
444 Op, MVT::f32,
445 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
446 // There's never any support for operations beyond MVT::f32.
447 setOperationAction(Op, MVT::f64, Expand);
448 setOperationAction(Op, MVT::f80, Expand);
449 setOperationAction(Op, MVT::f128, Expand);
450 }
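// Without F16C these conversions end up as calls to the soft-float helpers
// (the __extendhfsf2/__truncsfhf2 style routines in compiler-rt/libgcc);
// with F16C the Custom lowering can use VCVTPH2PS/VCVTPS2PH instead.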
451
452 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
453 setOperationAction(ISD::STRICT_FP_TO_BF16, VT, Expand);
454 setOperationAction(ISD::STRICT_BF16_TO_FP, VT, Expand);
455 }
456
457 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
458 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
459 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
460 setTruncStoreAction(VT, MVT::f16, Expand);
461 setTruncStoreAction(VT, MVT::bf16, Expand);
462
463 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
464 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
465 }
466
470 if (Subtarget.is64Bit())
472 if (Subtarget.hasPOPCNT()) {
473 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
474 // popcntw is longer to encode than popcntl and also has a false dependency
475 // on the dest that popcntl hasn't had since Cannon Lake.
476 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
477 } else {
482 }
483
484 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
485
486 if (!Subtarget.hasMOVBE())
488
489 // X86 wants to expand cmov itself.
490 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
495 }
496 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
497 if (VT == MVT::i64 && !Subtarget.is64Bit())
498 continue;
501 }
502
503 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
506
508 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
509 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
513
514 // Darwin ABI issue.
515 for (auto VT : { MVT::i32, MVT::i64 }) {
516 if (VT == MVT::i64 && !Subtarget.is64Bit())
517 continue;
524 }
525
526 // 64-bit shl, sra, srl (iff 32-bit x86)
527 for (auto VT : { MVT::i32, MVT::i64 }) {
528 if (VT == MVT::i64 && !Subtarget.is64Bit())
529 continue;
533 }
534
535 if (Subtarget.hasSSEPrefetch())
536 setOperationAction(ISD::PREFETCH , MVT::Other, Custom);
537
538 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
539
540 // Expand certain atomics
541 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
542 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
543 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
544 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
545 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
546 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
547 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
548 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
549 }
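// Custom handling of these atomic RMW nodes lets the backend pick the
// cheapest form: a bare LOCK-prefixed instruction when the old value is
// unused, LOCK XADD for add/sub when the result is needed, and a CMPXCHG
// loop as the general fallback.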
550
551 if (!Subtarget.is64Bit())
552 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
553
554 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
555 // All CPUs supporting AVX will atomically load/store aligned 128-bit
556 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
557 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
558 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
559 }
560
561 if (Subtarget.canUseCMPXCHG16B())
562 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
563
564 // FIXME - use subtarget debug flags
565 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
566 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
567 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
568 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
569 }
570
573
574 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
575 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
576
577 setOperationAction(ISD::TRAP, MVT::Other, Legal);
578 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
579 if (Subtarget.isTargetPS())
580 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
581 else
582 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
583
584 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
585 setOperationAction(ISD::VASTART , MVT::Other, Custom);
586 setOperationAction(ISD::VAEND , MVT::Other, Expand);
587 bool Is64Bit = Subtarget.is64Bit();
588 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
589 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
590
591 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
592 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
593
594 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
595
596 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
597 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
598 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
599
601
602 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
603 setOperationAction(ISD::FABS, VT, Action);
604 setOperationAction(ISD::FNEG, VT, Action);
606 setOperationAction(ISD::FREM, VT, Action);
607 setOperationAction(ISD::FMA, VT, Action);
608 setOperationAction(ISD::FMINNUM, VT, Action);
609 setOperationAction(ISD::FMAXNUM, VT, Action);
610 setOperationAction(ISD::FMINIMUM, VT, Action);
611 setOperationAction(ISD::FMAXIMUM, VT, Action);
612 setOperationAction(ISD::FMINIMUMNUM, VT, Action);
613 setOperationAction(ISD::FMAXIMUMNUM, VT, Action);
614 setOperationAction(ISD::FSIN, VT, Action);
615 setOperationAction(ISD::FCOS, VT, Action);
616 setOperationAction(ISD::FSINCOS, VT, Action);
617 setOperationAction(ISD::FTAN, VT, Action);
618 setOperationAction(ISD::FSQRT, VT, Action);
619 setOperationAction(ISD::FPOW, VT, Action);
620 setOperationAction(ISD::FPOWI, VT, Action);
621 setOperationAction(ISD::FLOG, VT, Action);
622 setOperationAction(ISD::FLOG2, VT, Action);
623 setOperationAction(ISD::FLOG10, VT, Action);
624 setOperationAction(ISD::FEXP, VT, Action);
625 setOperationAction(ISD::FEXP2, VT, Action);
626 setOperationAction(ISD::FEXP10, VT, Action);
627 setOperationAction(ISD::FCEIL, VT, Action);
628 setOperationAction(ISD::FFLOOR, VT, Action);
629 setOperationAction(ISD::FNEARBYINT, VT, Action);
630 setOperationAction(ISD::FRINT, VT, Action);
631 setOperationAction(ISD::BR_CC, VT, Action);
632 setOperationAction(ISD::SETCC, VT, Action);
635 setOperationAction(ISD::FROUND, VT, Action);
636 setOperationAction(ISD::FROUNDEVEN, VT, Action);
637 setOperationAction(ISD::FTRUNC, VT, Action);
638 setOperationAction(ISD::FLDEXP, VT, Action);
639 };
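// setF16Action is invoked below for f16 and each vXf16 vector type so this
// long list of math ops only has to be written once; the Action argument
// (Promote or Expand) decides whether they are widened to the f32 form or
// expanded/scalarized.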
640
641 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
642 // f16, f32 and f64 use SSE.
643 // Set up the FP register classes.
644 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
645 : &X86::FR16RegClass);
646 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
647 : &X86::FR32RegClass);
648 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
649 : &X86::FR64RegClass);
650
651 // Disable f32->f64 extload as we can only generate this in one instruction
652 // under optsize. So it's easier to pattern match (fpext (load)) for that
653 // case instead of needing to emit 2 instructions for extload in the
654 // non-optsize case.
655 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
656
657 for (auto VT : { MVT::f32, MVT::f64 }) {
658 // Use ANDPD to simulate FABS.
659 setOperationAction(ISD::FABS, VT, Custom);
660
661 // Use XORP to simulate FNEG.
662 setOperationAction(ISD::FNEG, VT, Custom);
663
664 // Use ANDPD and ORPD to simulate FCOPYSIGN.
666
667 // These might be better off as horizontal vector ops.
670
671 // We don't support sin/cos/fmod
672 setOperationAction(ISD::FSIN , VT, Expand);
673 setOperationAction(ISD::FCOS , VT, Expand);
674 setOperationAction(ISD::FSINCOS, VT, Expand);
675 }
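// The Custom FABS/FNEG/FCOPYSIGN lowering works on the bit pattern in an XMM
// register: FABS is an AND that clears the sign bit, FNEG is an XOR that
// flips it, and FCOPYSIGN combines the two masks with AND/ANDN/OR.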
676
677 // Half type will be promoted by default.
678 setF16Action(MVT::f16, Promote);
683 setOperationAction(ISD::FABS, MVT::f16, Custom);
684 setOperationAction(ISD::FNEG, MVT::f16, Custom);
687 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
688 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
689
720 setOperationAction(ISD::LRINT, MVT::f16, Expand);
721 setOperationAction(ISD::LLRINT, MVT::f16, Expand);
722
723 // Lower this to MOVMSK plus an AND.
726
727 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
728 (UseX87 || Is64Bit)) {
729 // Use SSE for f32, x87 for f64.
730 // Set up the FP register classes.
731 addRegisterClass(MVT::f32, &X86::FR32RegClass);
732 if (UseX87)
733 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
734
735 // Use ANDPS to simulate FABS.
736 setOperationAction(ISD::FABS , MVT::f32, Custom);
737
738 // Use XORP to simulate FNEG.
739 setOperationAction(ISD::FNEG , MVT::f32, Custom);
740
741 if (UseX87)
743
744 // Use ANDPS and ORPS to simulate FCOPYSIGN.
745 if (UseX87)
748
749 // We don't support sin/cos/fmod
750 setOperationAction(ISD::FSIN , MVT::f32, Expand);
751 setOperationAction(ISD::FCOS , MVT::f32, Expand);
752 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
753
754 if (UseX87) {
755 // Always expand sin/cos functions even though x87 has an instruction.
756 setOperationAction(ISD::FSIN, MVT::f64, Expand);
757 setOperationAction(ISD::FCOS, MVT::f64, Expand);
758 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
759 }
760 } else if (UseX87) {
761 // f32 and f64 in x87.
762 // Set up the FP register classes.
763 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
764 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
765
766 for (auto VT : { MVT::f32, MVT::f64 }) {
769
770 // Always expand sin/cos functions even though x87 has an instruction.
771 setOperationAction(ISD::FSIN , VT, Expand);
772 setOperationAction(ISD::FCOS , VT, Expand);
773 setOperationAction(ISD::FSINCOS, VT, Expand);
774 }
775 }
776
777 // Expand FP32 immediates into loads from the stack, save special cases.
778 if (isTypeLegal(MVT::f32)) {
779 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
780 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
781 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
782 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
783 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
784 } else // SSE immediates.
785 addLegalFPImmediate(APFloat(+0.0f)); // xorps
786 }
787 // Expand FP64 immediates into loads from the stack, save special cases.
788 if (isTypeLegal(MVT::f64)) {
789 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
790 addLegalFPImmediate(APFloat(+0.0)); // FLD0
791 addLegalFPImmediate(APFloat(+1.0)); // FLD1
792 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
793 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
794 } else // SSE immediates.
795 addLegalFPImmediate(APFloat(+0.0)); // xorpd
796 }
797 // Support fp16 0 immediate.
798 if (isTypeLegal(MVT::f16))
799 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
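// addLegalFPImmediate records constants the backend can materialize without
// a constant-pool load: +/-0.0 and +/-1.0 come from FLD0/FLD1 (plus FCHS) on
// x87, and 0.0 on SSE is simply an XORPS/XORPD of a register with itself.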
800
801 // Handle constrained floating-point operations on scalars.
814
815 // We don't support FMA.
818
819 // f80 always uses X87.
820 if (UseX87) {
821 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
824 {
826 addLegalFPImmediate(TmpFlt); // FLD0
827 TmpFlt.changeSign();
828 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
829
830 bool ignored;
831 APFloat TmpFlt2(+1.0);
833 &ignored);
834 addLegalFPImmediate(TmpFlt2); // FLD1
835 TmpFlt2.changeSign();
836 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
837 }
838
839 // Always expand sin/cos functions even though x87 has an instruction.
840 // clang-format off
841 setOperationAction(ISD::FSIN , MVT::f80, Expand);
842 setOperationAction(ISD::FCOS , MVT::f80, Expand);
843 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
844 setOperationAction(ISD::FTAN , MVT::f80, Expand);
845 setOperationAction(ISD::FASIN , MVT::f80, Expand);
846 setOperationAction(ISD::FACOS , MVT::f80, Expand);
847 setOperationAction(ISD::FATAN , MVT::f80, Expand);
848 setOperationAction(ISD::FATAN2 , MVT::f80, Expand);
849 setOperationAction(ISD::FSINH , MVT::f80, Expand);
850 setOperationAction(ISD::FCOSH , MVT::f80, Expand);
851 setOperationAction(ISD::FTANH , MVT::f80, Expand);
852 // clang-format on
853
854 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
855 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
856 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
857 setOperationAction(ISD::FRINT, MVT::f80, Expand);
858 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
859 setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand);
861 setOperationAction(ISD::LROUND, MVT::f80, LibCall);
862 setOperationAction(ISD::LLROUND, MVT::f80, LibCall);
863 setOperationAction(ISD::LRINT, MVT::f80, Custom);
864 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
865
866 // Handle constrained floating-point operations on scalars.
873 if (isTypeLegal(MVT::f16)) {
874 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
876 } else {
878 }
879 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
880 // as Custom.
882 }
883
884 // f128 uses xmm registers, but most operations require libcalls.
885 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
886 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
887 : &X86::VR128RegClass);
888
889 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
890
901
902 setOperationAction(ISD::FABS, MVT::f128, Custom);
903 setOperationAction(ISD::FNEG, MVT::f128, Custom);
905
906 // clang-format off
907 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
909 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
911 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
912 setOperationAction(ISD::FTAN, MVT::f128, LibCall);
914 // clang-format on
915 // No STRICT_FSINCOS
916 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
918
919 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
921 // We need to custom handle any FP_ROUND with an f128 input, but
922 // LegalizeDAG uses the result type to know when to run a custom handler.
923 // So we have to list all legal floating point result types here.
924 if (isTypeLegal(MVT::f32)) {
927 }
928 if (isTypeLegal(MVT::f64)) {
931 }
932 if (isTypeLegal(MVT::f80)) {
936 }
937
939
940 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
941 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
942 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
943 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
944 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
945 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
946 }
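// On x86-64, f128 values live in XMM registers per the psABI, but the
// arithmetic that is not custom-lowered here becomes calls into the soft-fp
// runtime (the __addtf3/__multf3 family).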
947
948 // Always use a library call for pow.
949 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
950 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
951 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
952 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
953
954 setOperationAction(ISD::FLOG, MVT::f80, Expand);
955 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
956 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
957 setOperationAction(ISD::FEXP, MVT::f80, Expand);
958 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
959 setOperationAction(ISD::FEXP10, MVT::f80, Expand);
960 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
961 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
962
963 // Some FP actions are always expanded for vector types.
964 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
965 MVT::v4f32, MVT::v8f32, MVT::v16f32,
966 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
967 // clang-format off
968 setOperationAction(ISD::FSIN, VT, Expand);
969 setOperationAction(ISD::FSINCOS, VT, Expand);
970 setOperationAction(ISD::FCOS, VT, Expand);
971 setOperationAction(ISD::FTAN, VT, Expand);
974 setOperationAction(ISD::FPOW, VT, Expand);
975 setOperationAction(ISD::FLOG, VT, Expand);
976 setOperationAction(ISD::FLOG2, VT, Expand);
977 setOperationAction(ISD::FLOG10, VT, Expand);
978 setOperationAction(ISD::FEXP, VT, Expand);
979 setOperationAction(ISD::FEXP2, VT, Expand);
980 setOperationAction(ISD::FEXP10, VT, Expand);
981 // clang-format on
982 }
983
984 // First set operation action for all vector types to either promote
985 // (for widening) or expand (for scalarization). Then we will selectively
986 // turn on ones that can be effectively codegen'd.
997 setOperationAction(ISD::FFLOOR, VT, Expand);
998 setOperationAction(ISD::FCEIL, VT, Expand);
999 setOperationAction(ISD::FTRUNC, VT, Expand);
1000 setOperationAction(ISD::FRINT, VT, Expand);
1001 setOperationAction(ISD::FNEARBYINT, VT, Expand);
1002 setOperationAction(ISD::FROUNDEVEN, VT, Expand);
1026 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1027 setTruncStoreAction(InnerVT, VT, Expand);
1028
1029 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1030 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1031
1032 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1033 // types; we have to deal with them whether we ask for Expansion or not.
1034 // Setting Expand causes its own optimisation problems though, so leave
1035 // them legal.
1036 if (VT.getVectorElementType() == MVT::i1)
1037 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1038
1039 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1040 // split/scalarized right now.
1041 if (VT.getVectorElementType() == MVT::f16 ||
1042 VT.getVectorElementType() == MVT::bf16)
1043 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1044 }
1045 }
1046
1047 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1048 // with -msoft-float, disable use of MMX as well.
1049 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1050 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1051 // No operations on x86mmx are supported; everything uses intrinsics.
1052 }
1053
1054 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1055 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1056 : &X86::VR128RegClass);
1057
1058 setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
1059 setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
1060 setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
1061 setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
1062
1063 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1064 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1072
1073 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1074 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1076
1082 }
1083
1084 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1085 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1086 : &X86::VR128RegClass);
1087
1088 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1089 // registers cannot be used even for integer operations.
1090 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1091 : &X86::VR128RegClass);
1092 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1093 : &X86::VR128RegClass);
1094 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1095 : &X86::VR128RegClass);
1096 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1097 : &X86::VR128RegClass);
1098 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1099 : &X86::VR128RegClass);
1100
1101 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1102 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1103 setOperationAction(ISD::FMINIMUM, VT, Custom);
1104 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1105 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1106 }
1107
1108 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1109 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1114 }
1115
1116 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1117 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1118 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1119
1120 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1121 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1122 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1123 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1124 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1125 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1126 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1127 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1128 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1129 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1132
1133 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1134 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1135 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1136
1137 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1139 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1141
1142 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1143 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1144
1145 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1146 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1147 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1148 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1149 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1150 }
1151
1162
1167
1168 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1174
1175 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1176 // setcc all the way to isel and prefer SETGT in some isel patterns.
1179 }
1180
1181 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1182 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1187
1188 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1194 }
1195
1196 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1200
1201 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1202 continue;
1203
1206 }
1207 setF16Action(MVT::v8f16, Expand);
1208 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1209 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1210 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1211 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1212 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1213 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1215
1216 // Custom lower v2i64 and v2f64 selects.
1223
1230
1231 // Custom legalize these to avoid over promotion or custom promotion.
1232 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1237 }
1238
1243
1246
1249
1250 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1255
1256 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1260
1261 // We want to legalize this to an f64 load rather than an i64 load on
1262 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1263 // store.
1264 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1265 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1266 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1267 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1268 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1269 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1270
1271 // Add 32-bit vector stores to help vectorization opportunities.
1272 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1273 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1274
1275 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1276 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1277 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1278 if (!Subtarget.hasAVX512())
1279 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1280
1284
1286
1303
1304 // In the customized shift lowering, the legal v4i32/v2i64 cases
1305 // in AVX2 will be recognized.
1306 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1310 if (VT == MVT::v2i64) continue;
1315 }
1316
1322 }
1323
1324 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1329
1330 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1332 }
1333 }
1334
1335 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1336 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1337 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1338 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1339
1340 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1343 }
1344
1345 // These might be better off as horizontal vector ops.
1350 }
1351
1352 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1353 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1354 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1356 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1358 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1360 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1362 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1364 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1366
1367 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1368 }
1369
1370 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1371 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1372 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1373 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1374 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1375 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1376 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1377 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1378
1382
1383 // FIXME: Do we need to handle scalar-to-vector here?
1384 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1385 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1386
1387 // We directly match byte blends in the backend as they match the VSELECT
1388 // condition form.
1390
1391 // SSE41 brings specific instructions for doing vector sign extend even in
1392 // cases where we don't have SRA.
1393 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1396 }
1397
1398 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1399 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1400 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1401 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1402 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1403 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1404 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1405 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1406 }
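// Making these extloads Legal lets a sign/zero-extending vector load be
// selected directly to PMOVSX*/PMOVZX*; for example, a v4i8->v4i32 sextload
// becomes a single PMOVSXBD from memory instead of a load plus a separate
// extend.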
1407
1408 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1409 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1410 // do the pre and post work in the vector domain.
1413 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1414 // so that DAG combine doesn't try to turn it into uint_to_fp.
1417 }
1418 }
1419
1420 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1422 }
1423
1424 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1425 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1426 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1429 }
1430
1431 // XOP can efficiently perform BITREVERSE with VPPERM.
1432 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1434 }
1435
1436 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1437 bool HasInt256 = Subtarget.hasInt256();
1438
1439 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1440 : &X86::VR256RegClass);
1441 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1442 : &X86::VR256RegClass);
1443 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1444 : &X86::VR256RegClass);
1445 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1446 : &X86::VR256RegClass);
1447 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1448 : &X86::VR256RegClass);
1449 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1450 : &X86::VR256RegClass);
1451 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1452 : &X86::VR256RegClass);
1453
1454 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1455 setOperationAction(ISD::FFLOOR, VT, Legal);
1457 setOperationAction(ISD::FCEIL, VT, Legal);
1459 setOperationAction(ISD::FTRUNC, VT, Legal);
1461 setOperationAction(ISD::FRINT, VT, Legal);
1463 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1465 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1467
1468 setOperationAction(ISD::FROUND, VT, Custom);
1469
1470 setOperationAction(ISD::FNEG, VT, Custom);
1471 setOperationAction(ISD::FABS, VT, Custom);
1473
1474 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1475 setOperationAction(ISD::FMINIMUM, VT, Custom);
1476 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1477 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1479 }
1480
1481 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1482 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1483
1484 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1485 // even though v8i16 is a legal type.
1486 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1487 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1488 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1489 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1493
1496 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1498 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1500
1512
1513 if (!Subtarget.hasAVX512())
1514 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1515
1516 // In the customized shift lowering, the legal v8i32/v4i64 cases
1517 // in AVX2 will be recognized.
1518 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1524 if (VT == MVT::v4i64) continue;
1529 }
1530
1531 // These types need custom splitting if their input is a 128-bit vector.
1536
1540 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1541 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1544
1545 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1549 }
1550
1555
1556 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1561
1562 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1563 // setcc all the way to isel and prefer SETGT in some isel patterns.
1566 }
1567
1568 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1569 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1574
1575 if (Subtarget.hasAnyFMA()) {
1576 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1577 MVT::v2f64, MVT::v4f64 }) {
1580 }
1581 }
1582
1583 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1584 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1585 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1586 }
1587
1588 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1589 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1590 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1591 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1592
1593 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1594 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1595 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1596 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1597 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1598 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1599 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1600 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1601
1602 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1603 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1604
1605 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1606 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1607 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1608 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1609 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1610
1611 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1612 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1613 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1614 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1615 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1616 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1617 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1618 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1623
1624 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1625 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1626 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1628 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1629 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1630 }
1631
1632 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1635 }
1636
1637 if (HasInt256) {
1638 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1639 // when we have a 256-bit-wide blend with immediate.
1642
1643 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1644 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1645 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1646 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1647 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1648 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1649 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1650 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1651 }
1652 }
1653
1654 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1655 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1656 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1657 setOperationAction(ISD::MSTORE, VT, Legal);
1658 }
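// Masked vector loads/stores map onto VMASKMOVPS/PD (and VPMASKMOVD/Q with
// AVX2); when VLX is available the EVEX forms take a k-register mask
// directly, which is why MLOAD is Legal there and Custom otherwise.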
1659
1660 // Extract subvector is special because the value type
1661 // (result) is 128-bit but the source is 256-bit wide.
1662 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1663 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1665 }
1666
1667 // Custom lower several nodes for 256-bit types.
1668 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1669 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1678 setOperationAction(ISD::STORE, VT, Custom);
1679 }
1680 setF16Action(MVT::v16f16, Expand);
1681 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1682 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1684 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1685 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1686 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1687 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1688
1689 if (HasInt256) {
1691
1692 // Custom legalize 2x32 to get a little better code.
1693 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1694 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1695
1696 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1697 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1698 setOperationAction(ISD::MGATHER, VT, Custom);
1699 }
1700 }
1701
1702 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1703 Subtarget.hasF16C()) {
1704 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1707 }
1708 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1709 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1711 }
1712 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1713 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1714 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1715 }
1716 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1717 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1718 }
1719
1720 // This block controls legalization of the mask vector sizes that are
1721 // available with AVX512. 512-bit vectors are in a separate block controlled
1722 // by useAVX512Regs.
1723 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1724 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1725 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1726 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1727 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1728 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1729
1733
1734 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1735 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1736 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1737 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1738 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1739 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1740 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1741 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1749
1750 // There is no byte sized k-register load or store without AVX512DQ.
1751 if (!Subtarget.hasDQI()) {
1752 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1753 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1754 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1755 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1756
1757 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1758 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1759 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1760 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1761 }
1762
1763 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1764 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1768 }
1769
1770 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1772
1773 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1777
1784 }
1785
1786 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1788 }
1789 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1790 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1791 setOperationAction(ISD::LRINT, VT, Legal);
1792 setOperationAction(ISD::LLRINT, VT, Legal);
1793 }
1794 }
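// AVX512DQ adds VCVTPS2QQ/VCVTPD2QQ, which convert to 64-bit integers using
// the current rounding mode (exactly the lrint/llrint semantics), so these
// become Legal once both DQ and VL are available.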
1795
1796 // This block controls legalization for 512-bit operations with 8/16/32/64-bit
1797 // elements. 512-bit operations can be disabled based on prefer-vector-width and
1798 // required-vector-width function attributes.
1799 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1800 bool HasBWI = Subtarget.hasBWI();
1801
1802 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1803 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1804 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1805 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1806 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1807 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1808 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1809
1810 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1811 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1812 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1813 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1814 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1815 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1816 if (HasBWI)
1817 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1818 }
1819
1820 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1821 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1822 setOperationAction(ISD::FMINIMUM, VT, Custom);
1823 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1824 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1825 setOperationAction(ISD::FNEG, VT, Custom);
1826 setOperationAction(ISD::FABS, VT, Custom);
1831 }
1832 setOperationAction(ISD::LRINT, MVT::v16f32,
1833 Subtarget.hasDQI() ? Legal : Custom);
1834 setOperationAction(ISD::LRINT, MVT::v8f64,
1835 Subtarget.hasDQI() ? Legal : Custom);
1836 if (Subtarget.hasDQI())
1837 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1838
1839 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1844 }
1845
1846 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1851 }
1852
1857 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1859
1871
1872 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1873 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1874 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1875 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1876 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1877 if (HasBWI)
1878 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1879
1880 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1881 // to 512-bit rather than use the AVX2 instructions so that we can use
1882 // k-masks.
1883 if (!Subtarget.hasVLX()) {
1884 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1885 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1886 setOperationAction(ISD::MLOAD, VT, Custom);
1887 setOperationAction(ISD::MSTORE, VT, Custom);
1888 }
1889 }
1890
1892 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1893 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1903
1904 if (HasBWI) {
1905 // Extends from v64i1 masks to 512-bit vectors.
1909 }
1910
1911 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1912 setOperationAction(ISD::FFLOOR, VT, Legal);
1914 setOperationAction(ISD::FCEIL, VT, Legal);
1916 setOperationAction(ISD::FTRUNC, VT, Legal);
1918 setOperationAction(ISD::FRINT, VT, Legal);
1920 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1922 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1924
1925 setOperationAction(ISD::FROUND, VT, Custom);
1926 }
1927
1928 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1931 }
1932
1933 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1934 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1935 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1936 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1937
1938 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1939 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1940 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1941 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1942
1943 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1944 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1945 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1946 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1947 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1948 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1949 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1950 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1951
1952 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1953 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1954
1955 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1965
1966 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1967 // setcc all the way to isel and prefer SETGT in some isel patterns.
1970 }
1971
1972 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1973 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1978
1979 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1986 }
1987
1988 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1989 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1990 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1992 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1993 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1994 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1995 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2000 }
2001
2002 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2003 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2004 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2005 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2006 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2007 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2008
2009 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2013 setOperationAction(Opc, MVT::v8i64, Custom);
2014
2015 if (Subtarget.hasDQI())
2016 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2017
2018 if (Subtarget.hasCDI()) {
2019 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2020 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2022 }
2023 } // Subtarget.hasCDI()
2024
2025 if (Subtarget.hasVPOPCNTDQ()) {
2026 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2028 }
2029
2030 // Extract subvector is special because the value type
2031 // (result) is 256-bit but the source is 512-bit wide.
2032 // 128-bit was made Legal under AVX1.
2033 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2034 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2036
2037 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2038 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2048 }
2049 setF16Action(MVT::v32f16, Expand);
2052 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2054 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2055 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2056 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2057
2058 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2059 setOperationAction(ISD::MLOAD, VT, Legal);
2060 setOperationAction(ISD::MSTORE, VT, Legal);
2061 setOperationAction(ISD::MGATHER, VT, Custom);
2062 setOperationAction(ISD::MSCATTER, VT, Custom);
2063 }
2064 if (HasBWI) {
2065 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2066 setOperationAction(ISD::MLOAD, VT, Legal);
2067 setOperationAction(ISD::MSTORE, VT, Legal);
2068 }
2069 } else {
2070 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2071 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2072 }
2073
2074 if (Subtarget.hasVBMI2()) {
2075 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2078 }
2079
2080 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2081 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2082 }
2083
2084 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2085 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2087 }// useAVX512Regs
2088
2089 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2090 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2091 MVT::v4i64}) {
2094 }
2095 }
2096
2097 // This block controls legalization for operations that don't have
2098 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2099 // narrower widths.
2100 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2101 // These operations are handled on non-VLX by artificially widening in
2102 // isel patterns.
2103
2107
2108 if (Subtarget.hasDQI()) {
2109 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2110 // v2f32 UINT_TO_FP is already custom under SSE2.
2113 "Unexpected operation action!");
2114 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2119 }
2120
2121 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2127 }
2128
2129 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2132 }
2133
2134 // Custom legalize 2x32 to get a little better code.
2135 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
2136 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2137
2138 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2139 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2140 setOperationAction(ISD::MSCATTER, VT, Custom);
2141
2142 if (Subtarget.hasDQI()) {
2146 setOperationAction(Opc, MVT::v2i64, Custom);
2147 setOperationAction(Opc, MVT::v4i64, Custom);
2148 }
2149 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2150 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2151 }
2152
2153 if (Subtarget.hasCDI()) {
2154 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2156 }
2157 } // Subtarget.hasCDI()
2158
2159 if (Subtarget.hasVPOPCNTDQ()) {
2160 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2162 }
2163
2164 // We can try to convert vectors to different sizes to leverage legal
2165 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2166 // then specialize to Legal below.
2167 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2168 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2169 MVT::v16i16, MVT::v8i8})
2171
2172 // Legal vpcompress depends on various AVX512 extensions.
2173 // Legal in AVX512F
2174 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2176
2177 // Legal in AVX512F + AVX512VL
2178 if (Subtarget.hasVLX())
2179 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2180 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2182
2183 // Legal in AVX512F + AVX512VBMI2
2184 if (Subtarget.hasVBMI2())
2185 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2187
2188 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2189 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2190 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2192 }
2193
2194 // This block controls legalization of v32i1/v64i1, which are available with
2195 // AVX512BW.
2196 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2197 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2198 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2199
2200 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2211 }
2212
2213 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2215
2216 // Extends from v32i1 masks to 256-bit vectors.
2220
2221 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2222 MVT::v16f16, MVT::v8f16}) {
2223 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2224 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2225 }
2226
2227 // These operations are handled on non-VLX by artificially widening in
2228 // isel patterns.
2229 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2230
2231 if (Subtarget.hasBITALG()) {
2232 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2234 }
2235 }
2236
2237 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2238 auto setGroup = [&] (MVT VT) {
2247 setOperationAction(ISD::FSQRT, VT, Legal);
2249
2250 setOperationAction(ISD::FFLOOR, VT, Legal);
2252 setOperationAction(ISD::FCEIL, VT, Legal);
2254 setOperationAction(ISD::FTRUNC, VT, Legal);
2256 setOperationAction(ISD::FRINT, VT, Legal);
2258 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2260 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
2262
2263 setOperationAction(ISD::FROUND, VT, Custom);
2264
2265 setOperationAction(ISD::LOAD, VT, Legal);
2266 setOperationAction(ISD::STORE, VT, Legal);
2267
2273
2274 setOperationAction(ISD::FNEG, VT, Custom);
2275 setOperationAction(ISD::FABS, VT, Custom);
2279
2283 };
2284
2285 // AVX512_FP16 scalar operations
2286 setGroup(MVT::f16);
2290 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2292 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2296 setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
2297 setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
2298 setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
2299 setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
2300 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2302 setOperationAction(ISD::LRINT, MVT::f16, Legal);
2303 setOperationAction(ISD::LLRINT, MVT::f16, Legal);
2304
2307
2308 if (Subtarget.useAVX512Regs()) {
2309 setGroup(MVT::v32f16);
2315 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2317 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2319 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
2322
2327 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2329 MVT::v32i16);
2330 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2332 MVT::v32i16);
2333 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2335 MVT::v32i16);
2336 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2338 MVT::v32i16);
2339
2343
2344 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2345 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2346
2347 setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
2348 setOperationAction(ISD::FMAXIMUM, MVT::v32f16, Custom);
2349 setOperationAction(ISD::FMINIMUMNUM, MVT::v32f16, Custom);
2350 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32f16, Custom);
2351 setOperationAction(ISD::LRINT, MVT::v32f16, Legal);
2352 setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
2353 }
2354
2359
2360 if (Subtarget.hasVLX()) {
2361 setGroup(MVT::v8f16);
2362 setGroup(MVT::v16f16);
2363
2374
2377 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
2379 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
2381
2382 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2385
2389
2390 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2391 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2392 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2393 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2394
2395 // Need to custom widen these to prevent scalarization.
2396 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2397 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2398
2399 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
2400 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Custom);
2401 setOperationAction(ISD::FMINIMUMNUM, MVT::v8f16, Custom);
2402 setOperationAction(ISD::FMAXIMUMNUM, MVT::v8f16, Custom);
2403
2404 setOperationAction(ISD::FMINIMUM, MVT::v16f16, Custom);
2405 setOperationAction(ISD::FMAXIMUM, MVT::v16f16, Custom);
2406 setOperationAction(ISD::FMINIMUMNUM, MVT::v16f16, Custom);
2407 setOperationAction(ISD::FMAXIMUMNUM, MVT::v16f16, Custom);
2408 setOperationAction(ISD::LRINT, MVT::v8f16, Legal);
2409 setOperationAction(ISD::LRINT, MVT::v16f16, Legal);
2410 }
2411 }
2412
2413 if (!Subtarget.useSoftFloat() &&
2414 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2415 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2416 : &X86::VR128RegClass);
2417 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2418 : &X86::VR256RegClass);
2419 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2420 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2421 // Set the operation action Custom to do the customization later.
2424 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2425 setF16Action(VT, Expand);
2426 if (!Subtarget.hasBF16())
2432 }
2433 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2434 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2435 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2436 }
2437 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2438 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2440 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2441 }
2442
2443 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2444 Subtarget.useAVX512Regs()) {
2445 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2446 setF16Action(MVT::v32bf16, Expand);
2447 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2448 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2449 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2451 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2455 }
2456
2457 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2458 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2459 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2460 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2461 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2462 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2463 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2464 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2465 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2466 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2467 setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
2468 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
2469 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2474 setOperationAction(ISD::FSQRT, VT, Legal);
2477 setOperationAction(ISD::FMINIMUM, VT, Custom);
2478 setOperationAction(ISD::FMAXIMUM, VT, Custom);
2479 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
2480 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
2481 }
2482 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2485 }
2486 }
2487
2488 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2489 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2490 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2491 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2492 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2493 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2494
2495 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2496 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2497 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2498 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2499 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2500
2501 if (Subtarget.hasBWI()) {
2502 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2503 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2504 }
2505
2506 if (Subtarget.hasFP16()) {
2507 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2516 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2525 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2530 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2531 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2533 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2535 }
2536 }
2537
2538 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2539 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2540 }
2541
2542 // We want to custom lower some of our intrinsics.
2546 if (!Subtarget.is64Bit()) {
2548 }
2549
2550 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2551 // handle type legalization for these operations here.
2552 //
2553 // FIXME: We really should do custom legalization for addition and
2554 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2555 // than generic legalization for 64-bit multiplication-with-overflow, though.
2556 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2557 if (VT == MVT::i64 && !Subtarget.is64Bit())
2558 continue;
2559 // Add/Sub/Mul with overflow operations are custom lowered.
2566
2567 // Support carry in as value rather than glue.
2573 }
2574
2575 // Combine sin / cos into _sincos_stret if it is available.
2576 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2577 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2578 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2579 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2580 }
2581
2582 if (Subtarget.isTargetWin64()) {
2583 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2584 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2585 setOperationAction(ISD::SREM, MVT::i128, Custom);
2586 setOperationAction(ISD::UREM, MVT::i128, Custom);
2595 }
2596
2597 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2598 // is. We should promote the value to 64 bits to solve this.
2599 // This is what the CRT headers do - `fmodf` is an inline header
2600 // function casting to f64 and calling `fmod`.
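  // Roughly: float fmodf(float x, float y) { return (float)fmod((double)x, (double)y); }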
2601 if (Subtarget.is32Bit() &&
2602 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2603 // clang-format off
2604 for (ISD::NodeType Op :
2605 {ISD::FACOS, ISD::STRICT_FACOS,
2606 ISD::FASIN, ISD::STRICT_FASIN,
2607 ISD::FATAN, ISD::STRICT_FATAN,
2608 ISD::FATAN2, ISD::STRICT_FATAN2,
2609 ISD::FCEIL, ISD::STRICT_FCEIL,
2610 ISD::FCOS, ISD::STRICT_FCOS,
2611 ISD::FCOSH, ISD::STRICT_FCOSH,
2612 ISD::FEXP, ISD::STRICT_FEXP,
2613 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2615 ISD::FLOG, ISD::STRICT_FLOG,
2616 ISD::FLOG10, ISD::STRICT_FLOG10,
2617 ISD::FPOW, ISD::STRICT_FPOW,
2618 ISD::FSIN, ISD::STRICT_FSIN,
2619 ISD::FSINH, ISD::STRICT_FSINH,
2620 ISD::FTAN, ISD::STRICT_FTAN,
2621 ISD::FTANH, ISD::STRICT_FTANH,
2622 // TODO: Add ISD::STRICT_FMODF too once implemented.
2623 ISD::FMODF})
2624 if (isOperationExpand(Op, MVT::f32))
2625 setOperationAction(Op, MVT::f32, Promote);
2626 // clang-format on
2627
2628 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2629 // it, but it's just a wrapper around ldexp.
2630 if (Subtarget.isOSWindows()) {
2631 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2632 if (isOperationExpand(Op, MVT::f32))
2633 setOperationAction(Op, MVT::f32, Promote);
2634 }
2635
2636 // We have target-specific dag combine patterns for the following nodes:
2644 ISD::BITCAST,
2647 ISD::SHL,
2648 ISD::SRA,
2649 ISD::SRL,
2650 ISD::OR,
2651 ISD::AND,
2657 ISD::ADD,
2658 ISD::FADD,
2659 ISD::FSUB,
2660 ISD::FNEG,
2661 ISD::FMA,
2663 ISD::FMINNUM,
2664 ISD::FMAXNUM,
2665 ISD::SUB,
2666 ISD::LOAD,
2667 ISD::LRINT,
2668 ISD::LLRINT,
2669 ISD::MLOAD,
2670 ISD::STORE,
2671 ISD::MSTORE,
2687 ISD::SETCC,
2688 ISD::MUL,
2689 ISD::XOR,
2690 ISD::MSCATTER,
2691 ISD::MGATHER,
2692 ISD::FP16_TO_FP,
2693 ISD::FP_EXTEND,
2700
2701 computeRegisterProperties(Subtarget.getRegisterInfo());
2702
2703 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2705 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2707 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2709
2710 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2711 // that needs to be benchmarked and balanced with the potential use of vector
2712 // load/store types (PR33329, PR33914).
2715
2716 // Default loop alignment, which can be overridden by -align-loops.
2718
2719 // An out-of-order CPU can speculatively execute past a predictable branch,
2720 // but a conditional move could be stalled by an expensive earlier operation.
2721 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2722 EnableExtLdPromotion = true;
2724
2726
2727 // Default to having -disable-strictnode-mutation on
2728 IsStrictFPEnabled = true;
2729}
2730
2731// This has so far only been implemented for 64-bit MachO.
2733 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2734}
2735
2737 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2738 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2739}
2740
2742 const SDLoc &DL) const {
2743 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2744 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2745 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2746 return SDValue(Node, 0);
2747}
2748
2751 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2752 !Subtarget.hasBWI())
2753 return TypeSplitVector;
2754
2755 // Since v8f16 is legal, widen anything over v4f16.
2756 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2757 VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
2758 VT.getVectorElementType() == MVT::f16)
2759 return TypeSplitVector;
2760
2761 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2762 VT.getVectorElementType() != MVT::i1)
2763 return TypeWidenVector;
2764
2766}
2767
2768FastISel *
2770 const TargetLibraryInfo *libInfo) const {
2771 return X86::createFastISel(funcInfo, libInfo);
2772}
2773
2774//===----------------------------------------------------------------------===//
2775// Other Lowering Hooks
2776//===----------------------------------------------------------------------===//
2777
2779 bool AssumeSingleUse) {
2780 if (!AssumeSingleUse && !Op.hasOneUse())
2781 return false;
2782 if (!ISD::isNormalLoad(Op.getNode()))
2783 return false;
2784
2785 // If this is an unaligned vector, make sure the target supports folding it.
2786 auto *Ld = cast<LoadSDNode>(Op.getNode());
2787 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2788 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2789 return false;
2790
2791 // TODO: If this is a non-temporal load and the target has an instruction
2792 // for it, it should not be folded. See "useNonTemporalLoad()".
2793
2794 return true;
2795}
2796
2798 const X86Subtarget &Subtarget,
2799 bool AssumeSingleUse) {
2800 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2801 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2802 return false;
2803
2804 // We can not replace a wide volatile load with a broadcast-from-memory,
2805 // because that would narrow the load, which isn't legal for volatiles.
2806 auto *Ld = cast<LoadSDNode>(Op.getNode());
2807 return !Ld->isVolatile() ||
2808 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2809}
2810
2812 if (!Op.hasOneUse())
2813 return false;
2814 // Peek through (oneuse) bitcast users
2815 SDNode *User = *Op->user_begin();
2816 while (User->getOpcode() == ISD::BITCAST) {
2817 if (!User->hasOneUse())
2818 return false;
2819 User = *User->user_begin();
2820 }
2821 return ISD::isNormalStore(User);
2822}
2823
2825 if (Op.hasOneUse()) {
2826 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2827 return (ISD::ZERO_EXTEND == Opcode);
2828 }
2829 return false;
2830}
2831
2832static bool isLogicOp(unsigned Opcode) {
2833 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2834 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2835}
2836
2837static bool isTargetShuffle(unsigned Opcode) {
2838 switch(Opcode) {
2839 default: return false;
2840 case X86ISD::BLENDI:
2841 case X86ISD::PSHUFB:
2842 case X86ISD::PSHUFD:
2843 case X86ISD::PSHUFHW:
2844 case X86ISD::PSHUFLW:
2845 case X86ISD::SHUFP:
2846 case X86ISD::INSERTPS:
2847 case X86ISD::EXTRQI:
2848 case X86ISD::INSERTQI:
2849 case X86ISD::VALIGN:
2850 case X86ISD::PALIGNR:
2851 case X86ISD::VSHLDQ:
2852 case X86ISD::VSRLDQ:
2853 case X86ISD::MOVLHPS:
2854 case X86ISD::MOVHLPS:
2855 case X86ISD::MOVSHDUP:
2856 case X86ISD::MOVSLDUP:
2857 case X86ISD::MOVDDUP:
2858 case X86ISD::MOVSS:
2859 case X86ISD::MOVSD:
2860 case X86ISD::MOVSH:
2861 case X86ISD::UNPCKL:
2862 case X86ISD::UNPCKH:
2863 case X86ISD::VBROADCAST:
2864 case X86ISD::VPERMILPI:
2865 case X86ISD::VPERMILPV:
2866 case X86ISD::VPERM2X128:
2867 case X86ISD::SHUF128:
2868 case X86ISD::VPERMIL2:
2869 case X86ISD::VPERMI:
2870 case X86ISD::VPPERM:
2871 case X86ISD::VPERMV:
2872 case X86ISD::VPERMV3:
2873 case X86ISD::VZEXT_MOVL:
2874 return true;
2875 }
2876}
2877
2878static bool isTargetShuffleVariableMask(unsigned Opcode) {
2879 switch (Opcode) {
2880 default: return false;
2881 // Target Shuffles.
2882 case X86ISD::PSHUFB:
2883 case X86ISD::VPERMILPV:
2884 case X86ISD::VPERMIL2:
2885 case X86ISD::VPPERM:
2886 case X86ISD::VPERMV:
2887 case X86ISD::VPERMV3:
2888 return true;
2889 // 'Faux' Target Shuffles.
2890 case ISD::OR:
2891 case ISD::AND:
2892 case X86ISD::ANDNP:
2893 return true;
2894 }
2895}
2896
2899 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2901 int ReturnAddrIndex = FuncInfo->getRAIndex();
2902
2903 if (ReturnAddrIndex == 0) {
2904 // Set up a frame object for the return address.
2905 unsigned SlotSize = RegInfo->getSlotSize();
2906 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2907 -(int64_t)SlotSize,
2908 false);
2909 FuncInfo->setRAIndex(ReturnAddrIndex);
2910 }
2911
2912 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2913}
2914
2916 bool HasSymbolicDisplacement) {
2917 // Offset should fit into a 32-bit immediate field.
2918 if (!isInt<32>(Offset))
2919 return false;
2920
2921 // If we don't have a symbolic displacement - we don't have any extra
2922 // restrictions.
2923 if (!HasSymbolicDisplacement)
2924 return true;
2925
2926 // We can fold large offsets in the large code model because we always use
2927 // 64-bit offsets.
2928 if (CM == CodeModel::Large)
2929 return true;
2930
2931 // For the kernel code model we know that all objects reside in the negative
2932 // half of the 32-bit address space, so we do not accept negative offsets,
2933 // which could move the address out of that range, but we do accept fairly large positive ones.
2934 if (CM == CodeModel::Kernel)
2935 return Offset >= 0;
2936
2937 // For the other non-large code models we assume that the last small object
2938 // ends 16MB before the 31-bit boundary. We also accept fairly large negative
2939 // constants, knowing that all objects are in the positive half of the address
2940 // space.
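  // e.g. with a symbolic displacement under the small code model, an offset of
  // 8 MiB is accepted below, while 32 MiB is rejected.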
2941 return Offset < 16 * 1024 * 1024;
2942}
2943
2944 /// Return true if the condition is a signed comparison operation.
2945static bool isX86CCSigned(X86::CondCode X86CC) {
2946 switch (X86CC) {
2947 default:
2948 llvm_unreachable("Invalid integer condition!");
2949 case X86::COND_E:
2950 case X86::COND_NE:
2951 case X86::COND_B:
2952 case X86::COND_A:
2953 case X86::COND_BE:
2954 case X86::COND_AE:
2955 return false;
2956 case X86::COND_G:
2957 case X86::COND_GE:
2958 case X86::COND_L:
2959 case X86::COND_LE:
2960 return true;
2961 }
2962}
2963
2965 switch (SetCCOpcode) {
2966 // clang-format off
2967 default: llvm_unreachable("Invalid integer condition!");
2968 case ISD::SETEQ: return X86::COND_E;
2969 case ISD::SETGT: return X86::COND_G;
2970 case ISD::SETGE: return X86::COND_GE;
2971 case ISD::SETLT: return X86::COND_L;
2972 case ISD::SETLE: return X86::COND_LE;
2973 case ISD::SETNE: return X86::COND_NE;
2974 case ISD::SETULT: return X86::COND_B;
2975 case ISD::SETUGT: return X86::COND_A;
2976 case ISD::SETULE: return X86::COND_BE;
2977 case ISD::SETUGE: return X86::COND_AE;
2978 // clang-format on
2979 }
2980}
2981
2982/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
2983/// condition code, returning the condition code and the LHS/RHS of the
2984/// comparison to make.
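/// For example, an integer (setgt X, -1) becomes a plain sign test returning
/// X86::COND_NS, while (setlt X, 1) is rewritten to compare against 0 and
/// returns X86::COND_LE.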
2986 bool isFP, SDValue &LHS, SDValue &RHS,
2987 SelectionDAG &DAG) {
2988 if (!isFP) {
2990 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2991 // X > -1 -> X == 0, jump !sign.
2992 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2993 return X86::COND_NS;
2994 }
2995 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2996 // X < 0 -> X == 0, jump on sign.
2997 return X86::COND_S;
2998 }
2999 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3000 // X >= 0 -> X == 0, jump on !sign.
3001 return X86::COND_NS;
3002 }
3003 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3004 // X < 1 -> X <= 0
3005 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3006 return X86::COND_LE;
3007 }
3008 }
3009
3010 return TranslateIntegerX86CC(SetCCOpcode);
3011 }
3012
3013 // First determine if it is required or is profitable to flip the operands.
3014
3015 // If LHS is a foldable load, but RHS is not, flip the condition.
3016 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3017 !ISD::isNON_EXTLoad(RHS.getNode())) {
3018 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3019 std::swap(LHS, RHS);
3020 }
3021
3022 switch (SetCCOpcode) {
3023 default: break;
3024 case ISD::SETOLT:
3025 case ISD::SETOLE:
3026 case ISD::SETUGT:
3027 case ISD::SETUGE:
3028 std::swap(LHS, RHS);
3029 break;
3030 }
3031
3032 // On a floating point condition, the flags are set as follows:
3033 // ZF PF CF op
3034 // 0 | 0 | 0 | X > Y
3035 // 0 | 0 | 1 | X < Y
3036 // 1 | 0 | 0 | X == Y
3037 // 1 | 1 | 1 | unordered
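  // e.g. (setogt X, Y) therefore maps to COND_A (CF==0 and ZF==0), and
  // (setuo X, Y) maps to COND_P.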
3038 switch (SetCCOpcode) {
3039 // clang-format off
3040 default: llvm_unreachable("Condcode should be pre-legalized away");
3041 case ISD::SETUEQ:
3042 case ISD::SETEQ: return X86::COND_E;
3043 case ISD::SETOLT: // flipped
3044 case ISD::SETOGT:
3045 case ISD::SETGT: return X86::COND_A;
3046 case ISD::SETOLE: // flipped
3047 case ISD::SETOGE:
3048 case ISD::SETGE: return X86::COND_AE;
3049 case ISD::SETUGT: // flipped
3050 case ISD::SETULT:
3051 case ISD::SETLT: return X86::COND_B;
3052 case ISD::SETUGE: // flipped
3053 case ISD::SETULE:
3054 case ISD::SETLE: return X86::COND_BE;
3055 case ISD::SETONE:
3056 case ISD::SETNE: return X86::COND_NE;
3057 case ISD::SETUO: return X86::COND_P;
3058 case ISD::SETO: return X86::COND_NP;
3059 case ISD::SETOEQ:
3060 case ISD::SETUNE: return X86::COND_INVALID;
3061 // clang-format on
3062 }
3063}
3064
3065/// Is there a floating point cmov for the specific X86 condition code?
3066 /// The current x86 ISA includes the following FP cmov instructions:
3067 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3068static bool hasFPCMov(unsigned X86CC) {
3069 switch (X86CC) {
3070 default:
3071 return false;
3072 case X86::COND_B:
3073 case X86::COND_BE:
3074 case X86::COND_E:
3075 case X86::COND_P:
3076 case X86::COND_A:
3077 case X86::COND_AE:
3078 case X86::COND_NE:
3079 case X86::COND_NP:
3080 return true;
3081 }
3082}
3083
3084static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3085 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3086 VT.is512BitVector();
3087}
3088
3090 const CallInst &I,
3091 MachineFunction &MF,
3092 unsigned Intrinsic) const {
3093 Info.flags = MachineMemOperand::MONone;
3094 Info.offset = 0;
3095
3097 if (!IntrData) {
3098 switch (Intrinsic) {
3099 case Intrinsic::x86_aesenc128kl:
3100 case Intrinsic::x86_aesdec128kl:
3101 Info.opc = ISD::INTRINSIC_W_CHAIN;
3102 Info.ptrVal = I.getArgOperand(1);
3103 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3104 Info.align = Align(1);
3105 Info.flags |= MachineMemOperand::MOLoad;
3106 return true;
3107 case Intrinsic::x86_aesenc256kl:
3108 case Intrinsic::x86_aesdec256kl:
3109 Info.opc = ISD::INTRINSIC_W_CHAIN;
3110 Info.ptrVal = I.getArgOperand(1);
3111 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3112 Info.align = Align(1);
3113 Info.flags |= MachineMemOperand::MOLoad;
3114 return true;
3115 case Intrinsic::x86_aesencwide128kl:
3116 case Intrinsic::x86_aesdecwide128kl:
3117 Info.opc = ISD::INTRINSIC_W_CHAIN;
3118 Info.ptrVal = I.getArgOperand(0);
3119 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3120 Info.align = Align(1);
3121 Info.flags |= MachineMemOperand::MOLoad;
3122 return true;
3123 case Intrinsic::x86_aesencwide256kl:
3124 case Intrinsic::x86_aesdecwide256kl:
3125 Info.opc = ISD::INTRINSIC_W_CHAIN;
3126 Info.ptrVal = I.getArgOperand(0);
3127 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3128 Info.align = Align(1);
3129 Info.flags |= MachineMemOperand::MOLoad;
3130 return true;
3131 case Intrinsic::x86_cmpccxadd32:
3132 case Intrinsic::x86_cmpccxadd64:
3133 case Intrinsic::x86_atomic_bts:
3134 case Intrinsic::x86_atomic_btc:
3135 case Intrinsic::x86_atomic_btr: {
3136 Info.opc = ISD::INTRINSIC_W_CHAIN;
3137 Info.ptrVal = I.getArgOperand(0);
3138 unsigned Size = I.getType()->getScalarSizeInBits();
3139 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3140 Info.align = Align(Size);
3143 return true;
3144 }
3145 case Intrinsic::x86_atomic_bts_rm:
3146 case Intrinsic::x86_atomic_btc_rm:
3147 case Intrinsic::x86_atomic_btr_rm: {
3148 Info.opc = ISD::INTRINSIC_W_CHAIN;
3149 Info.ptrVal = I.getArgOperand(0);
3150 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3151 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3152 Info.align = Align(Size);
3155 return true;
3156 }
3157 case Intrinsic::x86_aadd32:
3158 case Intrinsic::x86_aadd64:
3159 case Intrinsic::x86_aand32:
3160 case Intrinsic::x86_aand64:
3161 case Intrinsic::x86_aor32:
3162 case Intrinsic::x86_aor64:
3163 case Intrinsic::x86_axor32:
3164 case Intrinsic::x86_axor64:
3165 case Intrinsic::x86_atomic_add_cc:
3166 case Intrinsic::x86_atomic_sub_cc:
3167 case Intrinsic::x86_atomic_or_cc:
3168 case Intrinsic::x86_atomic_and_cc:
3169 case Intrinsic::x86_atomic_xor_cc: {
3170 Info.opc = ISD::INTRINSIC_W_CHAIN;
3171 Info.ptrVal = I.getArgOperand(0);
3172 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3173 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3174 Info.align = Align(Size);
3177 return true;
3178 }
3179 }
3180 return false;
3181 }
3182
3183 switch (IntrData->Type) {
3186 case TRUNCATE_TO_MEM_VI32: {
3187 Info.opc = ISD::INTRINSIC_VOID;
3188 Info.ptrVal = I.getArgOperand(0);
3189 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3191 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3192 ScalarVT = MVT::i8;
3193 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3194 ScalarVT = MVT::i16;
3195 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3196 ScalarVT = MVT::i32;
3197
3198 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3199 Info.align = Align(1);
3200 Info.flags |= MachineMemOperand::MOStore;
3201 break;
3202 }
3203 case GATHER:
3204 case GATHER_AVX2: {
3205 Info.opc = ISD::INTRINSIC_W_CHAIN;
3206 Info.ptrVal = nullptr;
3207 MVT DataVT = MVT::getVT(I.getType());
3208 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3209 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3210 IndexVT.getVectorNumElements());
3211 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3212 Info.align = Align(1);
3213 Info.flags |= MachineMemOperand::MOLoad;
3214 break;
3215 }
3216 case SCATTER: {
3217 Info.opc = ISD::INTRINSIC_VOID;
3218 Info.ptrVal = nullptr;
3219 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3220 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3221 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3222 IndexVT.getVectorNumElements());
3223 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3224 Info.align = Align(1);
3225 Info.flags |= MachineMemOperand::MOStore;
3226 break;
3227 }
3228 default:
3229 return false;
3230 }
3231
3232 return true;
3233}
3234
3235/// Returns true if the target can instruction select the
3236/// specified FP immediate natively. If false, the legalizer will
3237/// materialize the FP immediate as a load from a constant pool.
3239 bool ForCodeSize) const {
3240 for (const APFloat &FPImm : LegalFPImmediates)
3241 if (Imm.bitwiseIsEqual(FPImm))
3242 return true;
3243 return false;
3244}
3245
3247 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
3248 std::optional<unsigned> ByteOffset) const {
3249 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3250
3251 auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
3252 while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
3253 N = *N->user_begin();
3254 return N;
3255 };
3256
3257 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3258 // relocation targets a movq or addq instruction: don't let the load shrink.
3259 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3260 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3261 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3262 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3263
3264 // If this is (1) an AVX vector load with (2) multiple uses and either (3) all
3265 // of those uses are extracted directly into a store, so each extract + store
3266 // can be store-folded, or (4) some use is a legal full-width instruction, then
3267 // it's probably not worth splitting the load.
3268 EVT VT = Load->getValueType(0);
3269 if ((VT.is256BitVector() || VT.is512BitVector()) &&
3270 !SDValue(Load, 0).hasOneUse()) {
3271 bool FullWidthUse = false;
3272 bool AllExtractStores = true;
3273 for (SDUse &Use : Load->uses()) {
3274 // Skip uses of the chain value. Result 0 of the node is the load value.
3275 if (Use.getResNo() != 0)
3276 continue;
3277
3278 const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
3279
3280 // If this use is an extract + store, it's probably not worth splitting.
3281 if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3282 all_of(User->uses(), [&](const SDUse &U) {
3283 const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3284 return Inner->getOpcode() == ISD::STORE;
3285 }))
3286 continue;
3287
3288 AllExtractStores = false;
3289
3290 // If any use is a full-width legal/target bin op, then assume it's legal
3291 // and won't be split.
3292 if (isBinOp(User->getOpcode()) &&
3293 (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
3294 User->getOpcode() > ISD::BUILTIN_OP_END))
3295 FullWidthUse = true;
3296 }
3297
3298 if (AllExtractStores)
3299 return false;
3300
3301 // If we have a user that uses the full vector width, then the load is
3302 // only worth splitting if the offset isn't 0 (to avoid an
3303 // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
3304 if (FullWidthUse)
3305 return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
3306 }
3307
3308 return true;
3309}
3310
3311/// Returns true if it is beneficial to convert a load of a constant
3312/// to just the constant itself.
3314 Type *Ty) const {
3315 assert(Ty->isIntegerTy());
3316
3317 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3318 if (BitSize == 0 || BitSize > 64)
3319 return false;
3320 return true;
3321}
3322
3324 // If we are using XMM registers in the ABI and the condition of the select is
3325 // a floating-point compare and we have blendv or conditional move, then it is
3326 // cheaper to select instead of doing a cross-register move and creating a
3327 // load that depends on the compare result.
3328 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3329 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3330}
3331
3333 // TODO: It might be a win to ease or lift this restriction, but the generic
3334 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3335 if (VT.isVector() && Subtarget.hasAVX512())
3336 return false;
3337
3338 return true;
3339}
3340
3342 SDValue C) const {
3343 // TODO: We handle scalars using custom code, but generic combining could make
3344 // that unnecessary.
3345 APInt MulC;
3346 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3347 return false;
3348
3349 // Find the type this will be legalized to. Otherwise we might prematurely
3350 // convert this to shl+add/sub and then still have to type legalize those ops.
3351 // Another choice would be to defer the decision for illegal types until
3352 // after type legalization. But constant splat vectors of i64 can't make it
3353 // through type legalization on 32-bit targets so we would need to special
3354 // case vXi64.
3355 while (getTypeAction(Context, VT) != TypeLegal)
3356 VT = getTypeToTransformTo(Context, VT);
3357
3358 // If vector multiply is legal, assume that's faster than shl + add/sub.
3359 // Multiply is a complex op with higher latency and lower throughput in
3360 // most implementations, sub-vXi32 vector multiplies are always fast,
3361 // vXi32 mustn't have a slow PMULLD implementation, and anything larger (vXi64)
3362 // is always going to be slow.
3363 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3364 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3365 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3366 return false;
3367
3368 // shl+add, shl+sub, shl+add+neg
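  // e.g. MulC == 9 can lower as (X << 3) + X, and MulC == 7 as (X << 3) - X.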
3369 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3370 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3371}
3372
3374 unsigned Index) const {
3376 return false;
3377
3378 // Mask vectors support all subregister combinations and operations that
3379 // extract half of vector.
3380 if (ResVT.getVectorElementType() == MVT::i1)
3381 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3382 (Index == ResVT.getVectorNumElements()));
3383
3384 return (Index % ResVT.getVectorNumElements()) == 0;
3385}
3386
3388 unsigned Opc = VecOp.getOpcode();
3389
3390 // Assume target opcodes can't be scalarized.
3391 // TODO - do we have any exceptions?
3392 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3393 return false;
3394
3395 // If the vector op is not supported, try to convert to scalar.
3396 EVT VecVT = VecOp.getValueType();
3398 return true;
3399
3400 // If the vector op is supported, but the scalar op is not, the transform may
3401 // not be worthwhile.
3402 EVT ScalarVT = VecVT.getScalarType();
3403 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3404}
3405
3407 bool) const {
3408 // TODO: Allow vectors?
3409 if (VT.isVector())
3410 return false;
3411 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3412}
3413
3415 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3416 // i32/i64 or can rely on BSF passthrough value.
3417 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3418 Subtarget.hasBitScanPassThrough() ||
3419 (!Ty->isVectorTy() &&
3420 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3421}
3422
3424 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3425 // passthrough value.
3426 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3427 Subtarget.hasBitScanPassThrough();
3428}
3429
3431 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3432 // expensive than a straight movsd. On the other hand, it's important to
3433 // shrink long double fp constant since fldt is very slow.
3434 return !Subtarget.hasSSE2() || VT == MVT::f80;
3435}
3436
3438 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3439 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3440}
3441
3443 const SelectionDAG &DAG,
3444 const MachineMemOperand &MMO) const {
3445 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3446 BitcastVT.getVectorElementType() == MVT::i1)
3447 return false;
3448
3449 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3450 return false;
3451
3452 // If both types are legal vectors, it's always ok to convert them.
3453 if (LoadVT.isVector() && BitcastVT.isVector() &&
3454 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3455 return true;
3456
3457 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3458}
3459
3461 const MachineFunction &MF) const {
3462 // Do not merge to float value size (128 bits) if no implicit
3463 // float attribute is set.
3464 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3465
3466 if (NoFloat) {
3467 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3468 return (MemVT.getSizeInBits() <= MaxIntSize);
3469 }
3470 // Make sure we don't merge greater than our preferred vector
3471 // width.
3472 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3473 return false;
3474
3475 return true;
3476}
3477
3479 return Subtarget.hasFastLZCNT();
3480}
3481
3483 const Instruction &AndI) const {
3484 return true;
3485}
3486
3488 EVT VT = Y.getValueType();
3489
3490 if (VT.isVector())
3491 return false;
3492
3493 if (!Subtarget.hasBMI())
3494 return false;
3495
3496 // There are only 32-bit and 64-bit forms for 'andn'.
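  // e.g. an i32/i64 (X & Y) == Y test can then be selected as a single ANDN
  // that sets ZF, instead of an AND followed by a CMP.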
3497 if (VT != MVT::i32 && VT != MVT::i64)
3498 return false;
3499
3500 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3501}
3502
3504 EVT VT = Y.getValueType();
3505
3506 if (!VT.isVector())
3507 return hasAndNotCompare(Y);
3508
3509 // Vector.
3510
3511 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3512 return false;
3513
3514 if (VT == MVT::v4i32)
3515 return true;
3516
3517 return Subtarget.hasSSE2();
3518}
3519
3521 return X.getValueType().isScalarInteger(); // 'bt'
3522}
3523
3527 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3528 SelectionDAG &DAG) const {
3529 // Does baseline recommend not to perform the fold by default?
3531 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3532 return false;
3533 // For scalars this transform is always beneficial.
3534 if (X.getValueType().isScalarInteger())
3535 return true;
3536 // If all the shift amounts are identical, then the transform is beneficial even
3537 // with rudimentary SSE2 shifts.
3538 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3539 return true;
3540 // If we have AVX2 with its powerful shift operations, then it's also good.
3541 if (Subtarget.hasAVX2())
3542 return true;
3543 // Pre-AVX2 vector codegen for this pattern is best for the variant with 'shl'.
3544 return NewShiftOpcode == ISD::SHL;
3545}
3546
3548 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3549 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3550 if (!VT.isInteger())
3551 return ShiftOpc;
3552
3553 bool PreferRotate = false;
3554 if (VT.isVector()) {
3555 // For vectors, if we have rotate instruction support, then it's definitely
3556 // best. Otherwise it's not clear what's best, so just don't make changes.
3557 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3558 VT.getScalarType() == MVT::i64);
3559 } else {
3560 // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3561 // rotate unless we have a zext mask+shr.
3562 PreferRotate = Subtarget.hasBMI2();
3563 if (!PreferRotate) {
3564 unsigned MaskBits =
3565 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3566 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3567 }
3568 }
3569
3570 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3571 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3572
3573 if (PreferRotate && MayTransformRotate)
3574 return ISD::ROTL;
3575
3576 // If vector, we don't really get much benefit from swapping around constants.
3577 // Maybe we could check if the DAG has the flipped node already in the
3578 // future.
3579 if (VT.isVector())
3580 return ShiftOpc;
3581
3582 // See if it's beneficial to swap the shift type.
3583 if (ShiftOpc == ISD::SHL) {
3584 // If the current setup has an imm64 mask, then the inverse will have
3585 // at least an imm32 mask (or be a zext i32 -> i64).
3586 if (VT == MVT::i64)
3587 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3588 : ShiftOpc;
3589
3590 // We can only benefit if at least 7 bits are required for the mask. We
3591 // don't want to replace shl of 1,2,3 as they can be implemented
3592 // with lea/add.
3593 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3594 }
3595
3596 if (VT == MVT::i64)
3597 // Keep an exactly 32-bit imm64 mask; this is a zext i32 -> i64, which is
3598 // extremely efficient.
3599 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3600
3601 // Keep small shifts as shl so we can generate add/lea.
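  // e.g. a piece logically shifted right by 2 is better expressed with the shl
  // form (cheap via lea/add), while a shift by 12 keeps srl.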
3602 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3603 }
3604
3605 // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3606 // (PreferRotate will be set in the latter case).
3607 if (PreferRotate || !MayTransformRotate || VT.isVector())
3608 return ShiftOpc;
3609
3610 // Non-vector type and we have a zext mask with SRL.
3611 return ISD::SRL;
3612}
3613
3616 const Value *Lhs,
3617 const Value *Rhs) const {
3618 using namespace llvm::PatternMatch;
3619 int BaseCost = BrMergingBaseCostThresh.getValue();
3620 // With CCMP, branches can be merged in a more efficient way.
3621 if (BaseCost >= 0 && Subtarget.hasCCMP())
3622 BaseCost += BrMergingCcmpBias;
3623 // a == b && a == c is a fast pattern on x86.
3624 if (BaseCost >= 0 && Opc == Instruction::And &&
3627 BaseCost += 1;
3628 return {BaseCost, BrMergingLikelyBias.getValue(),
3629 BrMergingUnlikelyBias.getValue()};
3630}
3631
3633 return N->getOpcode() != ISD::FP_EXTEND;
3634}
3635
3637 const SDNode *N, CombineLevel Level) const {
3638 assert(((N->getOpcode() == ISD::SHL &&
3639 N->getOperand(0).getOpcode() == ISD::SRL) ||
3640 (N->getOpcode() == ISD::SRL &&
3641 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3642 "Expected shift-shift mask");
3643 // TODO: Should we always create i64 masks? Or only folded immediates?
3644 EVT VT = N->getValueType(0);
3645 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3646 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3647 // Only fold if the shift values are equal - so it folds to AND.
3648 // TODO - we should fold if either is a non-uniform vector but we don't do
3649 // the fold for non-splats yet.
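    // e.g. for i32, (srl (shl X, 8), 8) folds to (and X, 0x00FFFFFF).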
3650 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3651 }
3653}
3654
3656 EVT VT = Y.getValueType();
3657
3658 // For vectors, we don't have a preference, but we probably want a mask.
3659 if (VT.isVector())
3660 return false;
3661
3662 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3663 if (VT == MVT::i64 && !Subtarget.is64Bit())
3664 return false;
3665
3666 return true;
3667}
3668
3671 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3673 !Subtarget.isOSWindows())
3676 ExpansionFactor);
3677}
3678
3680 // Any legal vector type can be splatted more efficiently than
3681 // loading/spilling from memory.
3682 return isTypeLegal(VT);
3683}
3684
3686 MVT VT = MVT::getIntegerVT(NumBits);
3687 if (isTypeLegal(VT))
3688 return VT;
3689
3690 // PMOVMSKB can handle this.
3691 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3692 return MVT::v16i8;
3693
3694 // VPMOVMSKB can handle this.
3695 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3696 return MVT::v32i8;
3697
3698 // TODO: Allow 64-bit type for 32-bit target.
3699 // TODO: 512-bit types should be allowed, but make sure that those
3700 // cases are handled in combineVectorSizedSetCCEquality().
3701
3703}
3704
3705/// Val is the undef sentinel value or equal to the specified value.
3706static bool isUndefOrEqual(int Val, int CmpVal) {
3707 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3708}
3709
3710/// Return true if every element in Mask is the undef sentinel value or equal to
3711/// the specified value.
3712static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3713 return llvm::all_of(Mask, [CmpVal](int M) {
3714 return (M == SM_SentinelUndef) || (M == CmpVal);
3715 });
3716}
3717
3718/// Return true if every element in Mask, beginning from position Pos and ending
3719/// in Pos+Size is the undef sentinel value or equal to the specified value.
3720static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3721 unsigned Size) {
3722 return llvm::all_of(Mask.slice(Pos, Size),
3723 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3724}
3725
3726/// Val is either the undef or zero sentinel value.
3727static bool isUndefOrZero(int Val) {
3728 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3729}
3730
3731/// Return true if every element in Mask, beginning from position Pos and ending
3732/// in Pos+Size is the undef sentinel value.
3733static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3734 return llvm::all_of(Mask.slice(Pos, Size),
3735 [](int M) { return M == SM_SentinelUndef; });
3736}
3737
3738/// Return true if the mask creates a vector whose lower half is undefined.
3740 unsigned NumElts = Mask.size();
3741 return isUndefInRange(Mask, 0, NumElts / 2);
3742}
3743
3744/// Return true if the mask creates a vector whose upper half is undefined.
3746 unsigned NumElts = Mask.size();
3747 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3748}
3749
3750 /// Return true if Val falls within the specified range [Low, Hi).
3751static bool isInRange(int Val, int Low, int Hi) {
3752 return (Val >= Low && Val < Hi);
3753}
3754
3755/// Return true if the value of any element in Mask falls within the specified
3756 /// range [Low, Hi).
3757static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3758 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3759}
3760
3761/// Return true if the value of any element in Mask is the zero sentinel value.
3762static bool isAnyZero(ArrayRef<int> Mask) {
3763 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3764}
3765
3766/// Return true if Val is undef or if its value falls within the
3767 /// specified range [Low, Hi).
3768static bool isUndefOrInRange(int Val, int Low, int Hi) {
3769 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3770}
3771
3772/// Return true if every element in Mask is undef or if its value
3773 /// falls within the specified range [Low, Hi).
3774static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3775 return llvm::all_of(
3776 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3777}
3778
3779/// Return true if Val is undef, zero or if its value falls within the
3780 /// specified range [Low, Hi).
3781static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3782 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3783}
3784
3785/// Return true if every element in Mask is undef, zero or if its value
3786 /// falls within the specified range [Low, Hi).
3787static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3788 return llvm::all_of(
3789 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3790}
3791
3792 /// Return true if every element in Mask is an in-place blend/select mask or is
3793/// undef.
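/// e.g. with 4 elements, <0, 5, 2, 7> selects each lane in place from one of
/// the two inputs and is accepted, while <1, 5, 2, 7> is not.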
3795 unsigned NumElts = Mask.size();
3796 for (auto [I, M] : enumerate(Mask))
3797 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3798 return false;
3799 return true;
3800}
3801
3802/// Return true if every element in Mask, beginning
3803/// from position Pos and ending in Pos + Size, falls within the specified
3804/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
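/// e.g. Mask <4, -1, 6, 7> with Pos = 0, Size = 4 and Low = 4 returns true.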
3805static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3806 unsigned Size, int Low, int Step = 1) {
3807 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3808 if (!isUndefOrEqual(Mask[i], Low))
3809 return false;
3810 return true;
3811}
3812
3813/// Return true if every element in Mask, beginning
3814/// from position Pos and ending in Pos+Size, falls within the specified
3815 /// sequential range [Low, Low+Size), or is undef or is zero.
3817 unsigned Size, int Low,
3818 int Step = 1) {
3819 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3820 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3821 return false;
3822 return true;
3823}
3824
3825/// Return true if every element in Mask, beginning
3826/// from position Pos and ending in Pos+Size is undef or is zero.
3827static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3828 unsigned Size) {
3829 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3830}
3831
3832/// Return true if every element of a single input is referenced by the shuffle
3833/// mask. i.e. it just permutes them all.
3835 unsigned NumElts = Mask.size();
3836 APInt DemandedElts = APInt::getZero(NumElts);
3837 for (int M : Mask)
3838 if (isInRange(M, 0, NumElts))
3839 DemandedElts.setBit(M);
3840 return DemandedElts.isAllOnes();
3841}
3842
3843/// Helper function to test whether a shuffle mask could be
3844/// simplified by widening the elements being shuffled.
3845///
3846/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3847/// leaves it in an unspecified state.
3848///
3849/// NOTE: This must handle normal vector shuffle masks and *target* vector
3850/// shuffle masks. The latter have the special property of a '-2' representing
3851/// a zero-ed lane of a vector.
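///
/// e.g. the v8 mask <0, 1, 6, 7, -1, -1, 4, 5> widens to the v4 mask
/// <0, 3, -1, 2>, while a misaligned pair such as <0, 2, ...> cannot be
/// widened.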
3853 SmallVectorImpl<int> &WidenedMask) {
3854 WidenedMask.assign(Mask.size() / 2, 0);
3855 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3856 int M0 = Mask[i];
3857 int M1 = Mask[i + 1];
3858
3859 // If both elements are undef, it's trivial.
3860 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3861 WidenedMask[i / 2] = SM_SentinelUndef;
3862 continue;
3863 }
3864
3865 // Check for an undef mask and a mask value properly aligned to fit with
3866 // a pair of values. If we find such a case, use the non-undef mask's value.
3867 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3868 WidenedMask[i / 2] = M1 / 2;
3869 continue;
3870 }
3871 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3872 WidenedMask[i / 2] = M0 / 2;
3873 continue;
3874 }
3875
3876 // When zeroing, we need to spread the zeroing across both lanes to widen.
3877 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3878 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3880 WidenedMask[i / 2] = SM_SentinelZero;
3881 continue;
3882 }
3883 return false;
3884 }
3885
3886 // Finally check if the two mask values are adjacent and aligned with
3887 // a pair.
3888 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3889 WidenedMask[i / 2] = M0 / 2;
3890 continue;
3891 }
3892
3893 // Otherwise we can't safely widen the elements used in this shuffle.
3894 return false;
3895 }
3896 assert(WidenedMask.size() == Mask.size() / 2 &&
3897 "Incorrect size of mask after widening the elements!");
3898
3899 return true;
3900}
3901
3903 const APInt &Zeroable,
3904 bool V2IsZero,
3905 SmallVectorImpl<int> &WidenedMask) {
3906 // Create an alternative mask with info about zeroable elements.
3907 // Here we do not set undef elements as zeroable.
3908 SmallVector<int, 64> ZeroableMask(Mask);
3909 if (V2IsZero) {
3910 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3911 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3912 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3913 ZeroableMask[i] = SM_SentinelZero;
3914 }
3915 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3916}
3917
3918static bool canWidenShuffleElements(ArrayRef<int> Mask) {
3919 SmallVector<int, 32> WidenedMask;
3920 return canWidenShuffleElements(Mask, WidenedMask);
3921}
3922
3923// Attempt to narrow/widen shuffle mask until it matches the target number of
3924// elements.
3925static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3926 SmallVectorImpl<int> &ScaledMask) {
3927 unsigned NumSrcElts = Mask.size();
3928 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3929 "Illegal shuffle scale factor");
3930
3931 // Narrowing is guaranteed to work.
3932 if (NumDstElts >= NumSrcElts) {
3933 int Scale = NumDstElts / NumSrcElts;
3934 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3935 return true;
3936 }
3937
3938 // We have to repeat the widening until we reach the target size, but we can
3939 // split out the first widening as it sets up ScaledMask for us.
3940 if (canWidenShuffleElements(Mask, ScaledMask)) {
3941 while (ScaledMask.size() > NumDstElts) {
3942 SmallVector<int, 16> WidenedMask;
3943 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3944 return false;
3945 ScaledMask = std::move(WidenedMask);
3946 }
3947 return true;
3948 }
3949
3950 return false;
3951}
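// Illustrative example (not from the original source): Mask = <0, 3> scaled
// up to 4 elements becomes <0, 1, 6, 7> (each index expands to an adjacent
// pair), and <0, 1, 6, 7> scaled down to 2 elements recovers <0, 3>.  A mask
// such as <1, 0, 2, 3> cannot be scaled down because the pair (1, 0) is not
// an even-aligned adjacent run, so the function returns false.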
3952
3953static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3954 SmallVector<int, 32> ScaledMask;
3955 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3956}
3957
3958// Helper to grow the shuffle mask for a larger value type.
3959// NOTE: This differs from scaleShuffleElements, which keeps the same total size.
3960static void growShuffleMask(ArrayRef<int> SrcMask,
3961 SmallVectorImpl<int> &DstMask,
3962 unsigned SrcSizeInBits, unsigned DstSizeInBits) {
3963 assert(DstMask.empty() && "Expected an empty shuffle mask");
3964 assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
3965 unsigned Scale = DstSizeInBits / SrcSizeInBits;
3966 unsigned NumSrcElts = SrcMask.size();
3967 DstMask.assign(SrcMask.begin(), SrcMask.end());
3968 for (int &M : DstMask) {
3969 if (M < 0)
3970 continue;
3971 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
3972 }
3973 DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
3974}
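// Worked example (illustrative, not from the original source): growing a
// 2-input, 4-element mask <2, 5, -1, 0> from 128 to 256 bits (Scale = 2).
// Indices into operand 0 (0..3) are unchanged, indices into operand 1 (4..7)
// are rebased to 8..11 since each operand now has 8 elements, and 4 undef
// elements are appended:
//   DstMask = <2, 9, -1, 0, -1, -1, -1, -1>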
3975
3976/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3977bool X86::isZeroNode(SDValue Elt) {
3978 return isNullConstant(Elt) || isNullFPConstant(Elt);
3979}
3980
3981// Build a vector of constants.
3982// Use an UNDEF node if MaskElt == -1.
3983// Split 64-bit constants in 32-bit mode.
3984static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
3985 const SDLoc &dl, bool IsMask = false) {
3986
3988 bool Split = false;
3989
3990 MVT ConstVecVT = VT;
3991 unsigned NumElts = VT.getVectorNumElements();
3992 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3993 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3994 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3995 Split = true;
3996 }
3997
3998 MVT EltVT = ConstVecVT.getVectorElementType();
3999 for (unsigned i = 0; i < NumElts; ++i) {
4000 bool IsUndef = Values[i] < 0 && IsMask;
4001 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4002 DAG.getConstant(Values[i], dl, EltVT);
4003 Ops.push_back(OpNode);
4004 if (Split)
4005 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4006 DAG.getConstant(0, dl, EltVT));
4007 }
4008 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4009 if (Split)
4010 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4011 return ConstsNode;
4012}
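// Illustrative example (not from the original source): on a 32-bit target
// (i64 not legal), getConstVector({1, -1}, MVT::v2i64, DAG, dl,
// /*IsMask=*/true) builds the v4i32 vector <1, 0, undef, undef> (each i64
// mask element split into two i32 halves, with -1 becoming undef) and then
// bitcasts the result back to v2i64.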
4013
4014static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
4015 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4016 assert(Bits.size() == Undefs.getBitWidth() &&
4017 "Unequal constant and undef arrays");
4019 bool Split = false;
4020
4021 MVT ConstVecVT = VT;
4022 unsigned NumElts = VT.getVectorNumElements();
4023 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4024 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4025 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4026 Split = true;
4027 }
4028
4029 MVT EltVT = ConstVecVT.getVectorElementType();
4030 MVT EltIntVT = EltVT.changeTypeToInteger();
4031 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4032 if (Undefs[i]) {
4033 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4034 continue;
4035 }
4036 const APInt &V = Bits[i];
4037 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4038 if (Split) {
4039 Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
4040 Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
4041 } else {
4042 Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
4043 }
4044 }
4045
4046 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4047 return DAG.getBitcast(VT, ConstsNode);
4048}
4049
4050static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
4051 SelectionDAG &DAG, const SDLoc &dl) {
4052 APInt Undefs = APInt::getZero(Bits.size());
4053 return getConstVector(Bits, Undefs, VT, DAG, dl);
4054}
4055
4056/// Returns a vector of specified type with all zero elements.
4057static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4058 SelectionDAG &DAG, const SDLoc &dl) {
4059 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4060 VT.getVectorElementType() == MVT::i1) &&
4061 "Unexpected vector type");
4062
4063 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4064 // type. This ensures they get CSE'd. But if the integer type is not
4065 // available, use a floating-point +0.0 instead.
4066 SDValue Vec;
4067 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4068 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4069 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4070 } else if (VT.isFloatingPoint() &&
4071 TLI.isTypeLegal(VT.getVectorElementType())) {
4072 Vec = DAG.getConstantFP(+0.0, dl, VT);
4073 } else if (VT.getVectorElementType() == MVT::i1) {
4074 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4075 "Unexpected vector type");
4076 Vec = DAG.getConstant(0, dl, VT);
4077 } else {
4078 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4079 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4080 }
4081 return DAG.getBitcast(VT, Vec);
4082}
4083
4084// Helper to determine if the ops are all extracted subvectors that come from a
4085// single source. If we allow commuting they don't have to be in order (Lo/Hi).
4086static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4087 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4088 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4089 LHS.getValueType() != RHS.getValueType() ||
4090 LHS.getOperand(0) != RHS.getOperand(0))
4091 return SDValue();
4092
4093 SDValue Src = LHS.getOperand(0);
4094 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4095 return SDValue();
4096
4097 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4098 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4099 RHS.getConstantOperandAPInt(1) == NumElts) ||
4100 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4101 LHS.getConstantOperandAPInt(1) == NumElts))
4102 return Src;
4103
4104 return SDValue();
4105}
4106
4107static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4108 const SDLoc &dl, unsigned vectorWidth) {
4109 EVT VT = Vec.getValueType();
4110 EVT ElVT = VT.getVectorElementType();
4111 unsigned ResultNumElts =
4112 (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
4113 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
4114
4115 assert(ResultVT.getSizeInBits() == vectorWidth &&
4116 "Illegal subvector extraction");
4117
4118 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4119 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4120 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4121
4122 // This is the index of the first element of the vectorWidth-bit chunk
4123 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4124 IdxVal &= ~(ElemsPerChunk - 1);
4125
4126 // If the input is a buildvector just emit a smaller one.
4127 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4128 return DAG.getBuildVector(ResultVT, dl,
4129 Vec->ops().slice(IdxVal, ElemsPerChunk));
4130
4131 // Check if we're extracting the upper undef of a widening pattern.
4132 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4133 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4134 isNullConstant(Vec.getOperand(2)))
4135 return DAG.getUNDEF(ResultVT);
4136
4137 return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
4138}
4139
4140/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4141/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4142/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4143/// instructions or a simple subregister reference. Idx is an index in the
4144/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4145/// lowering EXTRACT_VECTOR_ELT operations easier.
4146static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4147 SelectionDAG &DAG, const SDLoc &dl) {
4148 assert((Vec.getValueType().is256BitVector() ||
4149 Vec.getValueType().is512BitVector()) &&
4150 "Unexpected vector size!");
4151 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4152}
4153
4154/// Generate a DAG to grab 256-bits from a 512-bit vector.
4155static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4156 SelectionDAG &DAG, const SDLoc &dl) {
4157 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4158 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4159}
4160
4161static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4162 SelectionDAG &DAG, const SDLoc &dl,
4163 unsigned vectorWidth) {
4164 assert((vectorWidth == 128 || vectorWidth == 256) &&
4165 "Unsupported vector width");
4166 // Inserting an UNDEF subvector just returns Result unchanged.
4167 if (Vec.isUndef())
4168 return Result;
4169
4170 // Insert the relevant vectorWidth bits.
4171 EVT VT = Vec.getValueType();
4172 unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
4173 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4174
4175 // This is the index of the first element of the vectorWidth-bit chunk
4176 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4177 IdxVal &= ~(ElemsPerChunk - 1);
4178 return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
4179}
4180
4181/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4182/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4183/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4184/// simple superregister reference. Idx is an index in the 128 bits
4185/// we want. It need not be aligned to a 128-bit boundary. That makes
4186/// lowering INSERT_VECTOR_ELT operations easier.
4187static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4188 SelectionDAG &DAG, const SDLoc &dl) {
4189 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4190 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4191}
4192
4193/// Widen a vector to a larger size with the same scalar type, with the new
4194/// elements either zero or undef.
4195static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4196 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4197 const SDLoc &dl) {
4198 EVT VecVT = Vec.getValueType();
4200 VecVT.getScalarType() == VT.getScalarType() &&
4201 "Unsupported vector widening type");
4202 // If the upper 128-bits of a build vector are already undef/zero, then try to
4203 // widen from the lower 128-bits.
4204 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4205 unsigned NumSrcElts = VecVT.getVectorNumElements();
4206 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4207 if (all_of(Hi, [&](SDValue V) {
4208 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4209 }))
4210 Vec = extract128BitVector(Vec, 0, DAG, dl);
4211 }
4212 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4213 : DAG.getUNDEF(VT);
4214 return DAG.getInsertSubvector(dl, Res, Vec, 0);
4215}
4216
4217/// Widen a vector to a larger size with the same scalar type, with the new
4218/// elements either zero or undef.
4219static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4220 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4221 const SDLoc &dl, unsigned WideSizeInBits) {
4222 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4223 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4224 "Unsupported vector widening type");
4225 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4226 MVT SVT = Vec.getSimpleValueType().getScalarType();
4227 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4228 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4229}
4230
4231/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4232/// and bitcast with integer types.
4233static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4234 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4235 unsigned NumElts = VT.getVectorNumElements();
4236 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4237 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4238 return VT;
4239}
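// Illustrative example (not from the original source): v4i1 widens to v8i1
// when AVX512DQ is available (byte-granularity KMOV/KSHIFT) and to v16i1
// otherwise; v8i1 is kept as-is only with AVX512DQ, else it widens to v16i1;
// v16i1 and wider mask types are returned unchanged.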
4240
4241/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4242/// bitcast with integer types.
4243static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4244 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4245 const SDLoc &dl) {
4246 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4247 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4248}
4249
4250// Helper function to collect subvector ops that are concatenated together,
4251// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
4252// The subvectors in Ops are guaranteed to be the same type.
4253static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
4254 SelectionDAG &DAG) {
4255 assert(Ops.empty() && "Expected an empty ops vector");
4256
4257 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4258 Ops.append(N->op_begin(), N->op_end());
4259 return true;
4260 }
4261
4262 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4263 SDValue Src = N->getOperand(0);
4264 SDValue Sub = N->getOperand(1);
4265 const APInt &Idx = N->getConstantOperandAPInt(2);
4266 EVT VT = Src.getValueType();
4267 EVT SubVT = Sub.getValueType();
4268
4269 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4270 // insert_subvector(undef, x, lo)
4271 if (Idx == 0 && Src.isUndef()) {
4272 Ops.push_back(Sub);
4273 Ops.push_back(DAG.getUNDEF(SubVT));
4274 return true;
4275 }
4276 if (Idx == (VT.getVectorNumElements() / 2)) {
4277 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4278 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4279 Src.getOperand(1).getValueType() == SubVT &&
4280 isNullConstant(Src.getOperand(2))) {
4281 // Attempt to recurse into inner (matching) concats.
4282 SDValue Lo = Src.getOperand(1);
4283 SDValue Hi = Sub;
4284 SmallVector<SDValue, 2> LoOps, HiOps;
4285 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4286 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4287 LoOps.size() == HiOps.size()) {
4288 Ops.append(LoOps);
4289 Ops.append(HiOps);
4290 return true;
4291 }
4292 Ops.push_back(Lo);
4293 Ops.push_back(Hi);
4294 return true;
4295 }
4296 // insert_subvector(x, extract_subvector(x, lo), hi)
4297 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4298 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4299 Ops.append(2, Sub);
4300 return true;
4301 }
4302 // insert_subvector(undef, x, hi)
4303 if (Src.isUndef()) {
4304 Ops.push_back(DAG.getUNDEF(SubVT));
4305 Ops.push_back(Sub);
4306 return true;
4307 }
4308 }
4309 }
4310 }
4311
4312 if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4313 EVT VT = N->getValueType(0);
4314 SDValue Src = N->getOperand(0);
4315 uint64_t Idx = N->getConstantOperandVal(1);
4316
4317 // Collect all the subvectors from the source vector and slice off the
4318 // extraction.
4320 if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
4321 VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
4322 (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
4323 (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
4324 unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
4325 unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
4326 Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
4327 return true;
4328 }
4329 }
4330
4331 assert(Ops.empty() && "Expected an empty ops vector");
4332 return false;
4333}
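// Illustrative examples (not from the original source), assuming v4i32
// subvectors of a v8i32 value:
//   concat_vectors(X, Y)                                  -> Ops = {X, Y}
//   insert_subvector(undef, X, 0)                         -> Ops = {X, undef}
//   insert_subvector(insert_subvector(undef, X, 0), Y, 4) -> Ops = {X, Y}
// If the node cannot be decomposed without extra extracts, Ops is left empty
// and the function returns false.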
4334
4335// Helper to check if \p V can be split into subvectors and the upper subvectors
4336// are all undef, in which case return the lower subvector.
4337static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL,
4338 SelectionDAG &DAG) {
4339 SmallVector<SDValue> SubOps;
4340 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4341 return SDValue();
4342
4343 unsigned NumSubOps = SubOps.size();
4344 unsigned HalfNumSubOps = NumSubOps / 2;
4345 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4346
4347 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4348 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4349 return SDValue();
4350
4351 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4352 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4353 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4354}
4355
4356// Helper to check if we can access all the constituent subvectors without any
4357// extract ops.
4360 return collectConcatOps(V.getNode(), Ops, DAG);
4361}
4362
4363static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4364 const SDLoc &dl) {
4365 EVT VT = Op.getValueType();
4366 unsigned NumElems = VT.getVectorNumElements();
4367 unsigned SizeInBits = VT.getSizeInBits();
4368 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4369 "Can't split odd sized vector");
4370
4372 if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
4373 assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
4374 unsigned HalfOps = SubOps.size() / 2;
4375 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
4376 SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
4377 SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
4378 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
4379 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
4380 return std::make_pair(Lo, Hi);
4381 }
4382
4383 // If this is a splat value (with no-undefs) then use the lower subvector,
4384 // which should be a free extraction.
4385 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4386 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4387 return std::make_pair(Lo, Lo);
4388
4389 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4390 return std::make_pair(Lo, Hi);
4391}
4392
4393/// Break an operation into 2 half sized ops and then concatenate the results.
4394static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
4395 unsigned NumOps = Op.getNumOperands();
4396 EVT VT = Op.getValueType();
4397
4398 // Extract the LHS Lo/Hi vectors
4401 for (unsigned I = 0; I != NumOps; ++I) {
4402 SDValue SrcOp = Op.getOperand(I);
4403 if (!SrcOp.getValueType().isVector()) {
4404 LoOps[I] = HiOps[I] = SrcOp;
4405 continue;
4406 }
4407 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4408 }
4409
4410 EVT LoVT, HiVT;
4411 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4412 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4413 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4414 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4415}
4416
4417/// Break a unary integer operation into 2 half sized ops and then
4418/// concatenate the result back.
4419static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
4420 const SDLoc &dl) {
4421 // Make sure we only try to split 256/512-bit types to avoid creating
4422 // narrow vectors.
4423 [[maybe_unused]] EVT VT = Op.getValueType();
4424 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4425 Op.getOperand(0).getValueType().is512BitVector()) &&
4426 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4427 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4428 VT.getVectorNumElements() &&
4429 "Unexpected VTs!");
4430 return splitVectorOp(Op, DAG, dl);
4431}
4432
4433/// Break a binary integer operation into 2 half sized ops and then
4434/// concatenate the result back.
4435static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
4436 const SDLoc &dl) {
4437 // Assert that all the types match.
4438 [[maybe_unused]] EVT VT = Op.getValueType();
4439 assert(Op.getOperand(0).getValueType() == VT &&
4440 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4441 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4442 return splitVectorOp(Op, DAG, dl);
4443}
4444
4445// Helper for splitting operands of an operation to legal target size and
4446// apply a function on each part.
4447// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4448// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4449// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4450// The argument Builder is a function that will be applied on each split part:
4451// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
4452template <typename F>
4453static SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4454 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4455 F Builder, bool CheckBWI = true) {
4456 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4457 unsigned NumSubs = 1;
4458 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4459 (!CheckBWI && Subtarget.useAVX512Regs())) {
4460 if (VT.getSizeInBits() > 512) {
4461 NumSubs = VT.getSizeInBits() / 512;
4462 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4463 }
4464 } else if (Subtarget.hasAVX2()) {
4465 if (VT.getSizeInBits() > 256) {
4466 NumSubs = VT.getSizeInBits() / 256;
4467 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4468 }
4469 } else {
4470 if (VT.getSizeInBits() > 128) {
4471 NumSubs = VT.getSizeInBits() / 128;
4472 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4473 }
4474 }
4475
4476 if (NumSubs == 1)
4477 return Builder(DAG, DL, Ops);
4478
4480 for (unsigned i = 0; i != NumSubs; ++i) {
4482 for (SDValue Op : Ops) {
4483 EVT OpVT = Op.getValueType();
4484 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4485 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4486 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4487 }
4488 Subs.push_back(Builder(DAG, DL, SubOps));
4489 }
4490 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4491}
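// Usage sketch (illustrative, not from the original source): splitting a
// 512-bit byte-average into the widest legal pieces, where A and B are
// assumed to be v64i8 values.  On an AVX2-only target this emits two 256-bit
// X86ISD::AVG nodes and concatenates the halves; with AVX512BW a single
// 512-bit node is emitted.
//
//   SDValue Res = SplitOpsAndApply(
//       DAG, Subtarget, DL, MVT::v64i8, {A, B},
//       [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops) {
//         return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
//       });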
4492
4493// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4494// targets.
4495static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4496 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4497 const X86Subtarget &Subtarget) {
4498 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4499 MVT SVT = VT.getScalarType();
4500
4501 // If we have a 32/64 splatted constant, splat it to DstTy to
4502 // encourage a foldable broadcast'd operand.
4503 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4504 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4505 // AVX512 broadcasts 32/64-bit operands.
4506 // TODO: Support float once getAVX512Node is used by fp-ops.
4507 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4509 return SDValue();
4510 // If we're not widening, don't bother if we're not bitcasting.
4511 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4512 return SDValue();
4513 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4514 APInt SplatValue, SplatUndef;
4515 unsigned SplatBitSize;
4516 bool HasAnyUndefs;
4517 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4518 HasAnyUndefs, OpEltSizeInBits) &&
4519 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4520 return DAG.getConstant(SplatValue, DL, DstVT);
4521 }
4522 return SDValue();
4523 };
4524
4525 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4526
4527 MVT DstVT = VT;
4528 if (Widen)
4529 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4530
4531 // Canonicalize src operands.
4532 SmallVector<SDValue> SrcOps(Ops);
4533 for (SDValue &Op : SrcOps) {
4534 MVT OpVT = Op.getSimpleValueType();
4535 // Just pass through scalar operands.
4536 if (!OpVT.isVector())
4537 continue;
4538 assert(OpVT == VT && "Vector type mismatch");
4539
4540 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4541 Op = BroadcastOp;
4542 continue;
4543 }
4544
4545 // Just widen the subvector by inserting into an undef wide vector.
4546 if (Widen)
4547 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4548 }
4549
4550 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4551
4552 // Perform the 512-bit op then extract the bottom subvector.
4553 if (Widen)
4554 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4555 return Res;
4556}
4557
4558/// Insert i1-subvector to i1-vector.
4559static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4560 const X86Subtarget &Subtarget) {
4561
4562 SDLoc dl(Op);
4563 SDValue Vec = Op.getOperand(0);
4564 SDValue SubVec = Op.getOperand(1);
4565 SDValue Idx = Op.getOperand(2);
4566 unsigned IdxVal = Op.getConstantOperandVal(2);
4567
4568 // Inserting undef is a nop. We can just return the original vector.
4569 if (SubVec.isUndef())
4570 return Vec;
4571
4572 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4573 return Op;
4574
4575 MVT OpVT = Op.getSimpleValueType();
4576 unsigned NumElems = OpVT.getVectorNumElements();
4577 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4578
4579 // Extend to natively supported kshift.
4580 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4581
4582 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4583 // if necessary.
4584 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4585 // May need to promote to a legal type.
4586 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4587 DAG.getConstant(0, dl, WideOpVT),
4588 SubVec, Idx);
4589 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4590 }
4591
4592 MVT SubVecVT = SubVec.getSimpleValueType();
4593 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4594 assert(IdxVal + SubVecNumElems <= NumElems &&
4595 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4596 "Unexpected index value in INSERT_SUBVECTOR");
4597
4598 SDValue Undef = DAG.getUNDEF(WideOpVT);
4599
4600 if (IdxVal == 0) {
4601 // Zero lower bits of the Vec
4602 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4603 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4604 ZeroIdx);
4605 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4606 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4607 // Merge them together, SubVec should be zero extended.
4608 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4609 DAG.getConstant(0, dl, WideOpVT),
4610 SubVec, ZeroIdx);
4611 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4612 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4613 }
4614
4615 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4616 Undef, SubVec, ZeroIdx);
4617
4618 if (Vec.isUndef()) {
4619 assert(IdxVal != 0 && "Unexpected index");
4620 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4621 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4622 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4623 }
4624
4625 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4626 assert(IdxVal != 0 && "Unexpected index");
4627 // If upper elements of Vec are known undef, then just shift into place.
4628 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4629 [](SDValue V) { return V.isUndef(); })) {
4630 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4631 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4632 } else {
4633 NumElems = WideOpVT.getVectorNumElements();
4634 unsigned ShiftLeft = NumElems - SubVecNumElems;
4635 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4636 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4637 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4638 if (ShiftRight != 0)
4639 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4640 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4641 }
4642 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4643 }
4644
4645 // Simple case when we put subvector in the upper part
4646 if (IdxVal + SubVecNumElems == NumElems) {
4647 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4648 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4649 if (SubVecNumElems * 2 == NumElems) {
4650 // Special case, use legal zero extending insert_subvector. This allows
4651 // isel to optimize when bits are known zero.
4652 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4653 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4654 DAG.getConstant(0, dl, WideOpVT),
4655 Vec, ZeroIdx);
4656 } else {
4657 // Otherwise use explicit shifts to zero the bits.
4658 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4659 Undef, Vec, ZeroIdx);
4660 NumElems = WideOpVT.getVectorNumElements();
4661 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4662 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4663 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4664 }
4665 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4666 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4667 }
4668
4669 // Inserting into the middle is more complicated.
4670
4671 NumElems = WideOpVT.getVectorNumElements();
4672
4673 // Widen the vector if needed.
4674 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4675
4676 unsigned ShiftLeft = NumElems - SubVecNumElems;
4677 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4678
4679 // Do an optimization for the most frequently used types.
4680 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4681 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4682 Mask0.flipAllBits();
4683 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4684 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4685 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4686 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4687 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4688 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4689 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4690 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4691
4692 // Reduce to original width if needed.
4693 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4694 }
4695
4696 // Clear the upper bits of the subvector and move it to its insert position.
4697 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4698 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4699 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4700 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4701
4702 // Isolate the bits below the insertion point.
4703 unsigned LowShift = NumElems - IdxVal;
4704 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4705 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4706 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4707 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4708
4709 // Isolate the bits after the last inserted bit.
4710 unsigned HighShift = IdxVal + SubVecNumElems;
4711 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4712 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4713 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4714 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4715
4716 // Now OR all 3 pieces together.
4717 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4718 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4719
4720 // Reduce to original width if needed.
4721 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4722}
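// Worked example (illustrative, not from the original source): inserting a
// v2i1 SubVec into a general v8i1 Vec at index 2 on a target without DQI.
// The operation is widened to v16i1; SubVec is shifted left by 14 and then
// right by 12 so it occupies bits [2,3] with all other bits zero, the
// corresponding bits of Vec are cleared with an AND mask, the two values are
// ORed, and the low v8i1 is extracted again.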
4723
4724static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4725 const SDLoc &dl) {
4726 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4727 EVT SubVT = V1.getValueType();
4728 EVT SubSVT = SubVT.getScalarType();
4729 unsigned SubNumElts = SubVT.getVectorNumElements();
4730 unsigned SubVectorWidth = SubVT.getSizeInBits();
4731 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4732 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4733 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4734}
4735
4736/// Returns a vector of specified type with all bits set.
4737/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4738/// Then bitcast to their original type, ensuring they get CSE'd.
4739static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4740 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4741 "Expected a 128/256/512-bit vector type");
4742 unsigned NumElts = VT.getSizeInBits() / 32;
4743 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4744 return DAG.getBitcast(VT, Vec);
4745}
4746
4747static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4748 SDValue In, SelectionDAG &DAG) {
4749 EVT InVT = In.getValueType();
4750 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4751
4752 // Canonicalize Opcode to general extension version.
4753 switch (Opcode) {
4754 case ISD::ANY_EXTEND:
4755 case ISD::ANY_EXTEND_VECTOR_INREG:
4756 Opcode = ISD::ANY_EXTEND;
4757 break;
4758 case ISD::SIGN_EXTEND:
4759 case ISD::SIGN_EXTEND_VECTOR_INREG:
4760 Opcode = ISD::SIGN_EXTEND;
4761 break;
4762 case ISD::ZERO_EXTEND:
4763 case ISD::ZERO_EXTEND_VECTOR_INREG:
4764 Opcode = ISD::ZERO_EXTEND;
4765 break;
4766 default:
4767 llvm_unreachable("Unknown extension opcode");
4768 }
4769
4770 // For 256-bit vectors, we only need the lower (128-bit) input half.
4771 // For 512-bit vectors, we only need the lower input half or quarter.
4772 if (InVT.getSizeInBits() > 128) {
4773 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4774 "Expected VTs to be the same size!");
4775 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4776 In = extractSubVector(In, 0, DAG, DL,
4777 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4778 InVT = In.getValueType();
4779 }
4780
4781 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4782 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4783
4784 return DAG.getNode(Opcode, DL, VT, In);
4785}
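// Illustrative example (not from the original source): extending v32i8 to
// v16i16 with ISD::ZERO_EXTEND.  Only the low 128-bit half of the input is
// needed, so the input is first shrunk to v16i8; the element counts then
// match (16 -> 16) and a regular ISD::ZERO_EXTEND node is emitted.  If the
// element counts still differed, the opcode would be switched to the
// corresponding *_EXTEND_VECTOR_INREG form.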
4786
4787// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4788static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4789 SDValue Mask, SelectionDAG &DAG) {
4790 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4791 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4792 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4793}
4794
4795static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4796 bool Lo, bool Unary) {
4797 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4798 "Illegal vector type to unpack");
4799 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4800 int NumElts = VT.getVectorNumElements();
4801 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4802 for (int i = 0; i < NumElts; ++i) {
4803 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4804 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4805 Pos += (Unary ? 0 : NumElts * (i % 2));
4806 Pos += (Lo ? 0 : NumEltsInLane / 2);
4807 Mask.push_back(Pos);
4808 }
4809}
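// Illustrative example (not from the original source): for MVT::v8i16,
//   Lo, binary unpack -> <0, 8, 1, 9, 2, 10, 3, 11>   (punpcklwd)
//   Hi, binary unpack -> <4, 12, 5, 13, 6, 14, 7, 15> (punpckhwd)
//   Lo, unary  unpack -> <0, 0, 1, 1, 2, 2, 3, 3>
// For 256/512-bit types the same pattern repeats within each 128-bit lane.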
4810
4811/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4812/// imposed by AVX and specific to the unary pattern. Example:
4813/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4814/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4815static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4816 bool Lo) {
4817 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4818 int NumElts = VT.getVectorNumElements();
4819 for (int i = 0; i < NumElts; ++i) {
4820 int Pos = i / 2;
4821 Pos += (Lo ? 0 : NumElts / 2);
4822 Mask.push_back(Pos);
4823 }
4824}
4825
4826// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4827static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4828 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4831 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4832 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4833 int M = Mask[I];
4834 if (M < 0)
4835 continue;
4836 SDValue V = (M < NumElts) ? V1 : V2;
4837 if (V.isUndef())
4838 continue;
4839 Ops[I] = V.getOperand(M % NumElts);
4840 }
4841 return DAG.getBuildVector(VT, dl, Ops);
4842 }
4843
4844 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4845}
4846
4847/// Returns a vector_shuffle node for an unpackl operation.
4848static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4849 SDValue V1, SDValue V2) {
4851 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4852 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4853}
4854
4855/// Returns a vector_shuffle node for an unpackh operation.
4856static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4857 SDValue V1, SDValue V2) {
4859 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4860 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4861}
4862
4863/// Returns a node that packs the LHS + RHS nodes together at half width.
4864/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4865/// TODO: Add subvector splitting if/when we have a need for it.
4866static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4867 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4868 bool PackHiHalf = false) {
4869 MVT OpVT = LHS.getSimpleValueType();
4870 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4871 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4872 assert(OpVT == RHS.getSimpleValueType() &&
4873 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4874 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4875 "Unexpected PACK operand types");
4876 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4877 "Unexpected PACK result type");
4878
4879 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4880 if (EltSizeInBits == 32) {
4881 SmallVector<int> PackMask;
4882 int Offset = PackHiHalf ? 1 : 0;
4883 int NumElts = VT.getVectorNumElements();
4884 for (int I = 0; I != NumElts; I += 4) {
4885 PackMask.push_back(I + Offset);
4886 PackMask.push_back(I + Offset + 2);
4887 PackMask.push_back(I + Offset + NumElts);
4888 PackMask.push_back(I + Offset + NumElts + 2);
4889 }
4890 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4891 DAG.getBitcast(VT, RHS), PackMask);
4892 }
4893
4894 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4895 if (!PackHiHalf) {
4896 if (UsePackUS &&
4897 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4898 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4899 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4900
4901 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4902 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4903 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4904 }
4905
4906 // Fallback to sign/zero extending the requested half and pack.
4907 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4908 if (UsePackUS) {
4909 if (PackHiHalf) {
4910 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4911 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4912 } else {
4913 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4914 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4915 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4916 };
4917 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4918 };
4919
4920 if (!PackHiHalf) {
4921 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4922 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4923 }
4924 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4925 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4926 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4927}
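// Illustrative example (not from the original source): packing two v8i16
// values into one v16i8 (EltSizeInBits == 8, so PACKUS is always usable).
// For the low halves, if both inputs are already known to fit in their low
// 8 bits the PACKUS is emitted directly; otherwise each input is masked
// with 0x00FF first.  For the high halves, both inputs are logically
// shifted right by 8 before the PACKUS.  vXi64 -> vXi32 packing is instead
// expanded to a shuffle because there is no 64->32-bit PACK instruction.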
4928
4929/// Return a vector_shuffle of the specified vector and a zero or undef vector.
4930/// This produces a shuffle where the low element of V2 is swizzled into the
4931/// zero/undef vector, landing at element Idx.
4932/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4933static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4934 bool IsZero,
4935 const X86Subtarget &Subtarget,
4936 SelectionDAG &DAG) {
4937 MVT VT = V2.getSimpleValueType();
4938 SDValue V1 = IsZero
4939 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4940 int NumElems = VT.getVectorNumElements();
4941 SmallVector<int, 16> MaskVec(NumElems);
4942 for (int i = 0; i != NumElems; ++i)
4943 // If this is the insertion idx, put the low elt of V2 here.
4944 MaskVec[i] = (i == Idx) ? NumElems : i;
4945 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4946}
4947
4948static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4949 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4950 Ptr.getOpcode() == X86ISD::WrapperRIP)
4951 Ptr = Ptr.getOperand(0);
4952 return dyn_cast<ConstantPoolSDNode>(Ptr);
4953}
4954
4955// TODO: Add support for non-zero offsets.
4956static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4957 ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4958 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4959 return nullptr;
4960 return CNode->getConstVal();
4961}
4962
4963static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4964 if (!Load || !ISD::isNormalLoad(Load))
4965 return nullptr;
4966 return getTargetConstantFromBasePtr(Load->getBasePtr());
4967}
4968
4973
4974const Constant *
4975X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4976 assert(LD && "Unexpected null LoadSDNode");
4977 return getTargetConstantFromNode(LD);
4978}
4979
4981 // Do not fold (vselect not(C), X, 0s) to (vselect C, Os, X)
4982 SDValue Cond = N->getOperand(0);
4983 SDValue RHS = N->getOperand(2);
4984 EVT CondVT = Cond.getValueType();
4985 return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
4986 CondVT.getVectorElementType() == MVT::i1 &&
4987 ISD::isBuildVectorAllZeros(RHS.getNode());
4988}
4989
4990// Extract raw constant bits from constant pools.
4991static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4992 APInt &UndefElts,
4993 SmallVectorImpl<APInt> &EltBits,
4994 bool AllowWholeUndefs = true,
4995 bool AllowPartialUndefs = false) {
4996 assert(EltBits.empty() && "Expected an empty EltBits vector");
4997
4999
5000 EVT VT = Op.getValueType();
5001 unsigned SizeInBits = VT.getSizeInBits();
5002 unsigned NumElts = SizeInBits / EltSizeInBits;
5003
5004 // Can't split constant.
5005 if ((SizeInBits % EltSizeInBits) != 0)
5006 return false;
5007
5008 // Bitcast a source array of element bits to the target size.
5009 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5010 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5011 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5012 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5013 "Constant bit sizes don't match");
5014
5015 // Don't split if we don't allow undef bits.
5016 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5017 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5018 return false;
5019
5020 // If we're already the right size, don't bother bitcasting.
5021 if (NumSrcElts == NumElts) {
5022 UndefElts = UndefSrcElts;
5023 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5024 return true;
5025 }
5026
5027 // Extract all the undef/constant element data and pack into single bitsets.
5028 APInt UndefBits(SizeInBits, 0);
5029 APInt MaskBits(SizeInBits, 0);
5030
5031 for (unsigned i = 0; i != NumSrcElts; ++i) {
5032 unsigned BitOffset = i * SrcEltSizeInBits;
5033 if (UndefSrcElts[i])
5034 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5035 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5036 }
5037
5038 // Split the undef/constant single bitset data into the target elements.
5039 UndefElts = APInt(NumElts, 0);
5040 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5041
5042 for (unsigned i = 0; i != NumElts; ++i) {
5043 unsigned BitOffset = i * EltSizeInBits;
5044 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5045
5046 // Only treat an element as UNDEF if all bits are UNDEF.
5047 if (UndefEltBits.isAllOnes()) {
5048 if (!AllowWholeUndefs)
5049 return false;
5050 UndefElts.setBit(i);
5051 continue;
5052 }
5053
5054 // If only some bits are UNDEF then treat them as zero (or bail if not
5055 // supported).
5056 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5057 return false;
5058
5059 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
5060 }
5061 return true;
5062 };
5063
5064 // Collect constant bits and insert into mask/undef bit masks.
5065 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5066 unsigned UndefBitIndex) {
5067 if (!Cst)
5068 return false;
5069 if (isa<UndefValue>(Cst)) {
5070 Undefs.setBit(UndefBitIndex);
5071 return true;
5072 }
5073 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5074 Mask = CInt->getValue();
5075 return true;
5076 }
5077 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5078 Mask = CFP->getValueAPF().bitcastToAPInt();
5079 return true;
5080 }
5081 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
5082 Type *Ty = CDS->getType();
5083 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
5084 Type *EltTy = CDS->getElementType();
5085 bool IsInteger = EltTy->isIntegerTy();
5086 bool IsFP =
5087 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
5088 if (!IsInteger && !IsFP)
5089 return false;
5090 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
5091 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
5092 if (IsInteger)
5093 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
5094 else
5095 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5096 I * EltBits);
5097 return true;
5098 }
5099 return false;
5100 };
5101
5102 // Handle UNDEFs.
5103 if (Op.isUndef()) {
5104 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5105 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5106 return CastBitData(UndefSrcElts, SrcEltBits);
5107 }
5108
5109 // Extract scalar constant bits.
5110 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5111 APInt UndefSrcElts = APInt::getZero(1);
5112 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5113 return CastBitData(UndefSrcElts, SrcEltBits);
5114 }
5115 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5116 APInt UndefSrcElts = APInt::getZero(1);
5117 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5118 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5119 return CastBitData(UndefSrcElts, SrcEltBits);
5120 }
5121
5122 // Extract constant bits from build vector.
5123 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5124 BitVector Undefs;
5125 SmallVector<APInt> SrcEltBits;
5126 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5127 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5128 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5129 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5130 if (Undefs[I])
5131 UndefSrcElts.setBit(I);
5132 return CastBitData(UndefSrcElts, SrcEltBits);
5133 }
5134 }
5135
5136 // Extract constant bits from constant pool vector.
5137 if (auto *Cst = getTargetConstantFromNode(Op)) {
5138 Type *CstTy = Cst->getType();
5139 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5140 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5141 return false;
5142
5143 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5144 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5145 if ((SizeInBits % SrcEltSizeInBits) != 0)
5146 return false;
5147
5148 APInt UndefSrcElts(NumSrcElts, 0);
5149 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5150 for (unsigned i = 0; i != NumSrcElts; ++i)
5151 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5152 UndefSrcElts, i))
5153 return false;
5154
5155 return CastBitData(UndefSrcElts, SrcEltBits);
5156 }
5157
5158 // Extract constant bits from a broadcasted constant pool scalar.
5159 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5160 EltSizeInBits <= VT.getScalarSizeInBits()) {
5161 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5162 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5163 return false;
5164
5165 SDValue Ptr = MemIntr->getBasePtr();
5166 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5167 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5168 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5169
5170 APInt UndefSrcElts(NumSrcElts, 0);
5171 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5172 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5173 if (UndefSrcElts[0])
5174 UndefSrcElts.setBits(0, NumSrcElts);
5175 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5176 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5177 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5178 return CastBitData(UndefSrcElts, SrcEltBits);
5179 }
5180 }
5181 }
5182
5183 // Extract constant bits from a subvector broadcast.
5184 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5185 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5186 SDValue Ptr = MemIntr->getBasePtr();
5187 // The source constant may be larger than the subvector broadcast,
5188 // ensure we extract the correct subvector constants.
5189 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5190 Type *CstTy = Cst->getType();
5191 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5192 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5193 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5194 (SizeInBits % SubVecSizeInBits) != 0)
5195 return false;
5196 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5197 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5198 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5199 APInt UndefSubElts(NumSubElts, 0);
5200 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5201 APInt(CstEltSizeInBits, 0));
5202 for (unsigned i = 0; i != NumSubElts; ++i) {
5203 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5204 UndefSubElts, i))
5205 return false;
5206 for (unsigned j = 1; j != NumSubVecs; ++j)
5207 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5208 }
5209 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5210 UndefSubElts);
5211 return CastBitData(UndefSubElts, SubEltBits);
5212 }
5213 }
5214
5215 // Extract a rematerialized scalar constant insertion.
5216 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5217 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5218 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5219 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5220 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5221
5222 APInt UndefSrcElts(NumSrcElts, 0);
5223 SmallVector<APInt, 64> SrcEltBits;
5224 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5225 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5226 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5227 return CastBitData(UndefSrcElts, SrcEltBits);
5228 }
5229
5230 // Insert constant bits from base and subvector sources.
5231 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5232 // If we bitcast to larger elements we might lose track of undefs - don't
5233 // allow any, to be safe.
5234 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5235 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5236
5237 APInt UndefSrcElts, UndefSubElts;
5238 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5239 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5240 UndefSubElts, EltSubBits,
5241 AllowWholeUndefs && AllowUndefs,
5242 AllowPartialUndefs && AllowUndefs) &&
5243 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5244 UndefSrcElts, EltSrcBits,
5245 AllowWholeUndefs && AllowUndefs,
5246 AllowPartialUndefs && AllowUndefs)) {
5247 unsigned BaseIdx = Op.getConstantOperandVal(2);
5248 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5249 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5250 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5251 return CastBitData(UndefSrcElts, EltSrcBits);
5252 }
5253 }
5254
5255 // Extract constant bits from a subvector's source.
5256 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5257 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
5258 EltBits, AllowWholeUndefs,
5259 AllowPartialUndefs)) {
5260 EVT SrcVT = Op.getOperand(0).getValueType();
5261 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
5262 unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
5263 unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
5264 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5265 assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
5266 (VT.getSizeInBits() % EltSizeInBits) == 0 &&
5267 (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
5268
5269 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5270 if ((BaseIdx + NumSubElts) != NumSrcElts)
5271 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5272 if (BaseIdx != 0)
5273 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5274 return true;
5275 }
5276
5277 // Extract constant bits from shuffle node sources.
5278 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5279 // TODO - support shuffle through bitcasts.
5280 if (EltSizeInBits != VT.getScalarSizeInBits())
5281 return false;
5282
5283 ArrayRef<int> Mask = SVN->getMask();
5284 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5285 llvm::any_of(Mask, [](int M) { return M < 0; }))
5286 return false;
5287
5288 APInt UndefElts0, UndefElts1;
5289 SmallVector<APInt, 32> EltBits0, EltBits1;
5290 if (isAnyInRange(Mask, 0, NumElts) &&
5291 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5292 UndefElts0, EltBits0, AllowWholeUndefs,
5293 AllowPartialUndefs))
5294 return false;
5295 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5296 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5297 UndefElts1, EltBits1, AllowWholeUndefs,
5298 AllowPartialUndefs))
5299 return false;
5300
5301 UndefElts = APInt::getZero(NumElts);
5302 for (int i = 0; i != (int)NumElts; ++i) {
5303 int M = Mask[i];
5304 if (M < 0) {
5305 UndefElts.setBit(i);
5306 EltBits.push_back(APInt::getZero(EltSizeInBits));
5307 } else if (M < (int)NumElts) {
5308 if (UndefElts0[M])
5309 UndefElts.setBit(i);
5310 EltBits.push_back(EltBits0[M]);
5311 } else {
5312 if (UndefElts1[M - NumElts])
5313 UndefElts.setBit(i);
5314 EltBits.push_back(EltBits1[M - NumElts]);
5315 }
5316 }
5317 return true;
5318 }
5319
5320 return false;
5321}
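// Worked example (illustrative, not from the original source): reading the
// v4i32 build vector <1, 2, undef, 4> with EltSizeInBits = 64 repacks the
// bits into two 64-bit elements:
//   EltBits[0] = 0x0000000200000001
//   EltBits[1] = 0x0000000400000000   (the undef i32 half is treated as 0)
// Because element 1 is only partially undef, this particular query succeeds
// only when AllowPartialUndefs is true.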
5322
5323namespace llvm {
5324namespace X86 {
5325bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5326 APInt UndefElts;
5327 SmallVector<APInt, 16> EltBits;
5328 if (getTargetConstantBitsFromNode(
5329 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5330 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5331 int SplatIndex = -1;
5332 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5333 if (UndefElts[i])
5334 continue;
5335 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5336 SplatIndex = -1;
5337 break;
5338 }
5339 SplatIndex = i;
5340 }
5341 if (0 <= SplatIndex) {
5342 SplatVal = EltBits[SplatIndex];
5343 return true;
5344 }
5345 }
5346
5347 return false;
5348}
5349
5350int getRoundingModeX86(unsigned RM) {
5351 switch (static_cast<::llvm::RoundingMode>(RM)) {
5352 // clang-format off
5353 case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest; break;
5354 case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward; break;
5355 case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward; break;
5356 case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero; break;
5357 default:
5358 return X86::rmInvalid; // Invalid rounding mode
5359 }
5360}
5361
5362} // namespace X86
5363} // namespace llvm
5364
5365static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5366 unsigned MaskEltSizeInBits,
5367 SmallVectorImpl<uint64_t> &RawMask,
5368 APInt &UndefElts) {
5369 // Extract the raw target constant bits.
5370 SmallVector<APInt, 64> EltBits;
5371 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5372 EltBits, /* AllowWholeUndefs */ true,
5373 /* AllowPartialUndefs */ false))
5374 return false;
5375
5376 // Insert the extracted elements into the mask.
5377 for (const APInt &Elt : EltBits)
5378 RawMask.push_back(Elt.getZExtValue());
5379
5380 return true;
5381}
5382
5383static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts,
5384 bool AllowUndefs) {
5385 APInt UndefElts;
5386 SmallVector<APInt, 64> EltBits;
5387   if (!getTargetConstantBitsFromNode(V, EltSizeInBits, UndefElts, EltBits,
5388 /*AllowWholeUndefs*/ AllowUndefs,
5389 /*AllowPartialUndefs*/ false))
5390 return false;
5391
5392 bool IsPow2OrUndef = true;
5393 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5394 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5395 return IsPow2OrUndef;
5396}
5397
5398// Helper to attempt to return a cheaper, bit-inverted version of \p V.
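// For example, if \p V is (xor X, -1) this returns X, and for a constant C
// it rewrites not(pcmpgt(C, X)) as pcmpgt(X, C - 1), since not(C > X) is
// equivalent to (X > C - 1) for non-minimum-signed C.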
5399 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5400   // TODO: don't always ignore oneuse constraints.
5401 V = peekThroughBitcasts(V);
5402 EVT VT = V.getValueType();
5403
5404 // Match not(xor X, -1) -> X.
5405 if (V.getOpcode() == ISD::XOR &&
5406 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5407 isAllOnesConstant(V.getOperand(1))))
5408 return V.getOperand(0);
5409
5410   // Match not(extract_subvector(not(X))) -> extract_subvector(X).
5411 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5412 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5413 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5414 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5415 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5416 V.getOperand(1));
5417 }
5418 }
5419
5420 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5421 if (V.getOpcode() == X86ISD::PCMPGT &&
5422 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5423 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5424 V.getOperand(0).hasOneUse()) {
5425 APInt UndefElts;
5426 SmallVector<APInt> EltBits;
5427 if (getTargetConstantBitsFromNode(V.getOperand(0),
5428 V.getScalarValueSizeInBits(), UndefElts,
5429 EltBits) &&
5430 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5431 // Don't fold min_signed_value -> (min_signed_value - 1)
5432 bool MinSigned = false;
5433 for (APInt &Elt : EltBits) {
5434 MinSigned |= Elt.isMinSignedValue();
5435 Elt -= 1;
5436 }
5437 if (!MinSigned) {
5438 SDLoc DL(V);
5439 MVT VT = V.getSimpleValueType();
5440 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5441 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5442 }
5443 }
5444 }
5445
5446 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5447   SmallVector<SDValue, 2> CatOps;
5448   if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5449 for (SDValue &CatOp : CatOps) {
5450 SDValue NotCat = IsNOT(CatOp, DAG);
5451 if (!NotCat)
5452 return SDValue();
5453 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5454 }
5455 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5456 }
5457
5458 // Match not(or(not(X),not(Y))) -> and(X, Y).
5459 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5460 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5461 // TODO: Handle cases with single NOT operand -> ANDNP
5462 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5463 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5464 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5465 DAG.getBitcast(VT, Op1));
5466 }
5467
5468 return SDValue();
5469}
5470
5471/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5472/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5473/// Note: This ignores saturation, so inputs must be checked first.
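/// For example, a single-stage binary v16i8 pack of two v8i16 inputs produces
/// the byte mask { 0,2,4,...,14, 16,18,...,30 }, i.e. the low (truncated) byte
/// of each 16-bit element of the first input followed by those of the second.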
5474 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5475                                   bool Unary, unsigned NumStages = 1) {
5476 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5477 unsigned NumElts = VT.getVectorNumElements();
5478 unsigned NumLanes = VT.getSizeInBits() / 128;
5479 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5480 unsigned Offset = Unary ? 0 : NumElts;
5481 unsigned Repetitions = 1u << (NumStages - 1);
5482 unsigned Increment = 1u << NumStages;
5483 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5484
5485 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5486 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5487 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5488 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5489 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5490 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5491 }
5492 }
5493}
5494
5495// Split the demanded elts of a PACKSS/PACKUS node between its operands.
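// For example, with a single 128-bit lane (v16i8 result), demanded result
// elements 0..7 map to LHS elements 0..7 and demanded result elements 8..15
// map to RHS elements 0..7.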
5496static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5497 APInt &DemandedLHS, APInt &DemandedRHS) {
5498 int NumLanes = VT.getSizeInBits() / 128;
5499 int NumElts = DemandedElts.getBitWidth();
5500 int NumInnerElts = NumElts / 2;
5501 int NumEltsPerLane = NumElts / NumLanes;
5502 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5503
5504 DemandedLHS = APInt::getZero(NumInnerElts);
5505 DemandedRHS = APInt::getZero(NumInnerElts);
5506
5507 // Map DemandedElts to the packed operands.
5508 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5509 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5510 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5511 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5512 if (DemandedElts[OuterIdx])
5513 DemandedLHS.setBit(InnerIdx);
5514 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5515 DemandedRHS.setBit(InnerIdx);
5516 }
5517 }
5518}
5519
5520// Split the demanded elts of a HADD/HSUB node between its operands.
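// For example, for a v4i32 HADD, demanding result element 0 demands LHS
// elements 0 and 1, while demanding result element 2 demands RHS elements
// 0 and 1.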
5521static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5522 APInt &DemandedLHS, APInt &DemandedRHS) {
5524 DemandedLHS, DemandedRHS);
5525 DemandedLHS |= DemandedLHS << 1;
5526 DemandedRHS |= DemandedRHS << 1;
5527}
5528
5529/// Calculates the shuffle mask corresponding to the target-specific opcode.
5530/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5531/// operands in \p Ops, and returns true.
5532/// Sets \p IsUnary to true if only one source is used. Note that this will set
5533/// IsUnary for shuffles which use a single input multiple times, and in those
5534/// cases it will adjust the mask to only have indices within that single input.
5535/// It is an error to call this with non-empty Mask/Ops vectors.
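/// For example, a v4i32 X86ISD::UNPCKL decodes to Mask = {0,4,1,5} with
/// Ops = {Op0, Op1}; if both operands are the same node the mask is remapped
/// to {0,0,1,1} so every index refers to the first input.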
5536static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5537                                  SmallVectorImpl<SDValue> &Ops,
5538                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
5539 if (!isTargetShuffle(N.getOpcode()))
5540 return false;
5541
5542 MVT VT = N.getSimpleValueType();
5543 unsigned NumElems = VT.getVectorNumElements();
5544 unsigned MaskEltSize = VT.getScalarSizeInBits();
5545   SmallVector<uint64_t, 32> RawMask;
5546   APInt RawUndefs;
5547 uint64_t ImmN;
5548
5549 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5550 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5551
5552 IsUnary = false;
5553 bool IsFakeUnary = false;
5554 switch (N.getOpcode()) {
5555 case X86ISD::BLENDI:
5556 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5557 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5558 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5559 DecodeBLENDMask(NumElems, ImmN, Mask);
5560 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5561 break;
5562 case X86ISD::SHUFP:
5563 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5564 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5565 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5566 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5567 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5568 break;
5569 case X86ISD::INSERTPS:
5570 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5571 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5572 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5573 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5574 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5575 break;
5576 case X86ISD::EXTRQI:
5577 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5578 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5579 isa<ConstantSDNode>(N.getOperand(2))) {
5580 int BitLen = N.getConstantOperandVal(1);
5581 int BitIdx = N.getConstantOperandVal(2);
5582 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5583 IsUnary = true;
5584 }
5585 break;
5586 case X86ISD::INSERTQI:
5587 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5588 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5589 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5590 isa<ConstantSDNode>(N.getOperand(3))) {
5591 int BitLen = N.getConstantOperandVal(2);
5592 int BitIdx = N.getConstantOperandVal(3);
5593 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5594 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5595 }
5596 break;
5597 case X86ISD::UNPCKH:
5598 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5599 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5600 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5601 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5602 break;
5603 case X86ISD::UNPCKL:
5604 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5605 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5606 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5607 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5608 break;
5609 case X86ISD::MOVHLPS:
5610 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5611 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5612 DecodeMOVHLPSMask(NumElems, Mask);
5613 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5614 break;
5615 case X86ISD::MOVLHPS:
5616 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5617 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5618 DecodeMOVLHPSMask(NumElems, Mask);
5619 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5620 break;
5621 case X86ISD::VALIGN:
5622 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5623 "Only 32-bit and 64-bit elements are supported!");
5624 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5625 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5626 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5627 DecodeVALIGNMask(NumElems, ImmN, Mask);
5628 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5629 Ops.push_back(N.getOperand(1));
5630 Ops.push_back(N.getOperand(0));
5631 break;
5632 case X86ISD::PALIGNR:
5633 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5634 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5635 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5636 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5637 DecodePALIGNRMask(NumElems, ImmN, Mask);
5638 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5639 Ops.push_back(N.getOperand(1));
5640 Ops.push_back(N.getOperand(0));
5641 break;
5642 case X86ISD::VSHLDQ:
5643 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5644 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5645 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5646 DecodePSLLDQMask(NumElems, ImmN, Mask);
5647 IsUnary = true;
5648 break;
5649 case X86ISD::VSRLDQ:
5650 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5651 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5652 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5653 DecodePSRLDQMask(NumElems, ImmN, Mask);
5654 IsUnary = true;
5655 break;
5656 case X86ISD::PSHUFD:
5657 case X86ISD::VPERMILPI:
5658 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5659 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5660 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5661 IsUnary = true;
5662 break;
5663 case X86ISD::PSHUFHW:
5664 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5665 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5666 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5667 IsUnary = true;
5668 break;
5669 case X86ISD::PSHUFLW:
5670 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5671 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5672 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5673 IsUnary = true;
5674 break;
5675 case X86ISD::VZEXT_MOVL:
5676 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5677 DecodeZeroMoveLowMask(NumElems, Mask);
5678 IsUnary = true;
5679 break;
5680 case X86ISD::VBROADCAST:
5681     // We only decode broadcasts of same-sized vectors; peeking through to
5682 // extracted subvectors is likely to cause hasOneUse issues with
5683 // SimplifyDemandedBits etc.
5684 if (N.getOperand(0).getValueType() == VT) {
5685 DecodeVectorBroadcast(NumElems, Mask);
5686 IsUnary = true;
5687 break;
5688 }
5689 return false;
5690 case X86ISD::VPERMILPV: {
5691 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5692 IsUnary = true;
5693 SDValue MaskNode = N.getOperand(1);
5694 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5695 RawUndefs)) {
5696 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5697 break;
5698 }
5699 return false;
5700 }
5701 case X86ISD::PSHUFB: {
5702 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5703 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5704 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5705 IsUnary = true;
5706 SDValue MaskNode = N.getOperand(1);
5707 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5708 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5709 break;
5710 }
5711 return false;
5712 }
5713 case X86ISD::VPERMI:
5714 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5715 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5716 DecodeVPERMMask(NumElems, ImmN, Mask);
5717 IsUnary = true;
5718 break;
5719 case X86ISD::MOVSS:
5720 case X86ISD::MOVSD:
5721 case X86ISD::MOVSH:
5722 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5723 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5724 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5725 break;
5726 case X86ISD::VPERM2X128:
5727 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5728 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5729 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5730 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5731 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5732 break;
5733 case X86ISD::SHUF128:
5734 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5735 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5736 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5737 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5738 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5739 break;
5740 case X86ISD::MOVSLDUP:
5741 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5742 DecodeMOVSLDUPMask(NumElems, Mask);
5743 IsUnary = true;
5744 break;
5745 case X86ISD::MOVSHDUP:
5746 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5747 DecodeMOVSHDUPMask(NumElems, Mask);
5748 IsUnary = true;
5749 break;
5750 case X86ISD::MOVDDUP:
5751 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5752 DecodeMOVDDUPMask(NumElems, Mask);
5753 IsUnary = true;
5754 break;
5755 case X86ISD::VPERMIL2: {
5756 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5757 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5758 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5759 SDValue MaskNode = N.getOperand(2);
5760 SDValue CtrlNode = N.getOperand(3);
5761 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5762 unsigned CtrlImm = CtrlOp->getZExtValue();
5763 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5764 RawUndefs)) {
5765 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5766 Mask);
5767 break;
5768 }
5769 }
5770 return false;
5771 }
5772 case X86ISD::VPPERM: {
5773 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5774 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5775 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5776 SDValue MaskNode = N.getOperand(2);
5777 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5778 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5779 break;
5780 }
5781 return false;
5782 }
5783 case X86ISD::VPERMV: {
5784 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5785 IsUnary = true;
5786 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5787 Ops.push_back(N.getOperand(1));
5788 SDValue MaskNode = N.getOperand(0);
5789 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5790 RawUndefs)) {
5791 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5792 break;
5793 }
5794 return false;
5795 }
5796 case X86ISD::VPERMV3: {
5797 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5798 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5799 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5800 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5801 Ops.push_back(N.getOperand(0));
5802 Ops.push_back(N.getOperand(2));
5803 SDValue MaskNode = N.getOperand(1);
5804 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5805 RawUndefs)) {
5806 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5807 break;
5808 }
5809 return false;
5810 }
5811 default:
5812 llvm_unreachable("unknown target shuffle node");
5813 }
5814
5815 // Empty mask indicates the decode failed.
5816 if (Mask.empty())
5817 return false;
5818
5819 // Check if we're getting a shuffle mask with zero'd elements.
5820 if (!AllowSentinelZero && isAnyZero(Mask))
5821 return false;
5822
5823 // If we have a fake unary shuffle, the shuffle mask is spread across two
5824 // inputs that are actually the same node. Re-map the mask to always point
5825 // into the first input.
5826 if (IsFakeUnary)
5827 for (int &M : Mask)
5828 if (M >= (int)Mask.size())
5829 M -= Mask.size();
5830
5831 // If we didn't already add operands in the opcode-specific code, default to
5832 // adding 1 or 2 operands starting at 0.
5833 if (Ops.empty()) {
5834 Ops.push_back(N.getOperand(0));
5835 if (!IsUnary || IsFakeUnary)
5836 Ops.push_back(N.getOperand(1));
5837 }
5838
5839 return true;
5840}
5841
5842 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
5843 static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5844                                  SmallVectorImpl<SDValue> &Ops,
5845                                  SmallVectorImpl<int> &Mask) {
5846 bool IsUnary;
5847 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5848}
5849
5850/// Compute whether each element of a shuffle is zeroable.
5851///
5852/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5853/// Either it is an undef element in the shuffle mask, the element of the input
5854/// referenced is undef, or the element of the input referenced is known to be
5855/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5856/// as many lanes with this technique as possible to simplify the remaining
5857/// shuffle.
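/// For example, with Mask = {0, -1, 4, 6} and V2 an all-zeros build vector,
/// element 1 is known undef and elements 2 and 3 are known zero.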
5858 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5859                                            SDValue V1, SDValue V2,
5860 APInt &KnownUndef, APInt &KnownZero) {
5861 int Size = Mask.size();
5862 KnownUndef = KnownZero = APInt::getZero(Size);
5863
5864 V1 = peekThroughBitcasts(V1);
5865 V2 = peekThroughBitcasts(V2);
5866
5867 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5868 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5869
5870 int VectorSizeInBits = V1.getValueSizeInBits();
5871 int ScalarSizeInBits = VectorSizeInBits / Size;
5872 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5873
5874 for (int i = 0; i < Size; ++i) {
5875 int M = Mask[i];
5876 // Handle the easy cases.
5877 if (M < 0) {
5878 KnownUndef.setBit(i);
5879 continue;
5880 }
5881 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5882 KnownZero.setBit(i);
5883 continue;
5884 }
5885
5886 // Determine shuffle input and normalize the mask.
5887 SDValue V = M < Size ? V1 : V2;
5888 M %= Size;
5889
5890 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5891 if (V.getOpcode() != ISD::BUILD_VECTOR)
5892 continue;
5893
5894     // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
5895 // the (larger) source element must be UNDEF/ZERO.
5896 if ((Size % V.getNumOperands()) == 0) {
5897 int Scale = Size / V->getNumOperands();
5898 SDValue Op = V.getOperand(M / Scale);
5899 if (Op.isUndef())
5900 KnownUndef.setBit(i);
5901 if (X86::isZeroNode(Op))
5902 KnownZero.setBit(i);
5903 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5904 APInt Val = Cst->getAPIntValue();
5905 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5906 if (Val == 0)
5907 KnownZero.setBit(i);
5908 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5909 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5910 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5911 if (Val == 0)
5912 KnownZero.setBit(i);
5913 }
5914 continue;
5915 }
5916
5917     // If the BUILD_VECTOR has more elements, then all the (smaller) source
5918 // elements must be UNDEF or ZERO.
5919 if ((V.getNumOperands() % Size) == 0) {
5920 int Scale = V->getNumOperands() / Size;
5921 bool AllUndef = true;
5922 bool AllZero = true;
5923 for (int j = 0; j < Scale; ++j) {
5924 SDValue Op = V.getOperand((M * Scale) + j);
5925 AllUndef &= Op.isUndef();
5926 AllZero &= X86::isZeroNode(Op);
5927 }
5928 if (AllUndef)
5929 KnownUndef.setBit(i);
5930 if (AllZero)
5931 KnownZero.setBit(i);
5932 continue;
5933 }
5934 }
5935}
5936
5937/// Decode a target shuffle mask and inputs and see if any values are
5938/// known to be undef or zero from their inputs.
5939/// Returns true if the target shuffle mask was decoded.
5940/// FIXME: Merge this with computeZeroableShuffleElements?
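/// For example, a PSHUFB control byte with its top bit set zeroes that output
/// byte; it decodes to SM_SentinelZero and is reported here as known zero.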
5941 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5942                                          SmallVectorImpl<SDValue> &Ops,
5943                                          APInt &KnownUndef, APInt &KnownZero) {
5944 bool IsUnary;
5945 if (!isTargetShuffle(N.getOpcode()))
5946 return false;
5947
5948 MVT VT = N.getSimpleValueType();
5949 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5950 return false;
5951
5952 int Size = Mask.size();
5953 SDValue V1 = Ops[0];
5954 SDValue V2 = IsUnary ? V1 : Ops[1];
5955 KnownUndef = KnownZero = APInt::getZero(Size);
5956
5957 V1 = peekThroughBitcasts(V1);
5958 V2 = peekThroughBitcasts(V2);
5959
5960 assert((VT.getSizeInBits() % Size) == 0 &&
5961 "Illegal split of shuffle value type");
5962 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5963
5964 // Extract known constant input data.
5965 APInt UndefSrcElts[2];
5966 SmallVector<APInt, 32> SrcEltBits[2];
5967 bool IsSrcConstant[2] = {
5968 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5969 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5970 /*AllowPartialUndefs*/ false),
5971 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5972 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5973 /*AllowPartialUndefs*/ false)};
5974
5975 for (int i = 0; i < Size; ++i) {
5976 int M = Mask[i];
5977
5978 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5979 if (M < 0) {
5980 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5981 if (SM_SentinelUndef == M)
5982 KnownUndef.setBit(i);
5983 if (SM_SentinelZero == M)
5984 KnownZero.setBit(i);
5985 continue;
5986 }
5987
5988 // Determine shuffle input and normalize the mask.
5989 unsigned SrcIdx = M / Size;
5990 SDValue V = M < Size ? V1 : V2;
5991 M %= Size;
5992
5993 // We are referencing an UNDEF input.
5994 if (V.isUndef()) {
5995 KnownUndef.setBit(i);
5996 continue;
5997 }
5998
5999 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
6000 // TODO: We currently only set UNDEF for integer types - floats use the same
6001 // registers as vectors and many of the scalar folded loads rely on the
6002 // SCALAR_TO_VECTOR pattern.
6003 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6004 (Size % V.getValueType().getVectorNumElements()) == 0) {
6005 int Scale = Size / V.getValueType().getVectorNumElements();
6006 int Idx = M / Scale;
6007 if (Idx != 0 && !VT.isFloatingPoint())
6008 KnownUndef.setBit(i);
6009 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6010 KnownZero.setBit(i);
6011 continue;
6012 }
6013
6014 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
6015 // base vectors.
6016 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
6017 SDValue Vec = V.getOperand(0);
6018 int NumVecElts = Vec.getValueType().getVectorNumElements();
6019 if (Vec.isUndef() && Size == NumVecElts) {
6020 int Idx = V.getConstantOperandVal(2);
6021 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
6022 if (M < Idx || (Idx + NumSubElts) <= M)
6023 KnownUndef.setBit(i);
6024 }
6025 continue;
6026 }
6027
6028 // Attempt to extract from the source's constant bits.
6029 if (IsSrcConstant[SrcIdx]) {
6030 if (UndefSrcElts[SrcIdx][M])
6031 KnownUndef.setBit(i);
6032 else if (SrcEltBits[SrcIdx][M] == 0)
6033 KnownZero.setBit(i);
6034 }
6035 }
6036
6037 assert(VT.getVectorNumElements() == (unsigned)Size &&
6038 "Different mask size from vector size!");
6039 return true;
6040}
6041
6042// Replace target shuffle mask elements with known undef/zero sentinels.
6043 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
6044                                               const APInt &KnownUndef,
6045 const APInt &KnownZero,
6046                                               bool ResolveKnownZeros = true) {
6047 unsigned NumElts = Mask.size();
6048 assert(KnownUndef.getBitWidth() == NumElts &&
6049 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
6050
6051 for (unsigned i = 0; i != NumElts; ++i) {
6052 if (KnownUndef[i])
6053 Mask[i] = SM_SentinelUndef;
6054 else if (ResolveKnownZeros && KnownZero[i])
6055 Mask[i] = SM_SentinelZero;
6056 }
6057}
6058
6059// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
6060 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
6061                                               APInt &KnownUndef,
6062 APInt &KnownZero) {
6063 unsigned NumElts = Mask.size();
6064 KnownUndef = KnownZero = APInt::getZero(NumElts);
6065
6066 for (unsigned i = 0; i != NumElts; ++i) {
6067 int M = Mask[i];
6068 if (SM_SentinelUndef == M)
6069 KnownUndef.setBit(i);
6070 if (SM_SentinelZero == M)
6071 KnownZero.setBit(i);
6072 }
6073}
6074
6075// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
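// For example, a constant 4-element condition selecting {true, false, true,
// false} between operands X and Y yields the blend mask {0, 5, 2, 7} over
// the operand pair {X, Y}.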
6076 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
6077                                          SDValue Cond, bool IsBLENDV = false) {
6078 EVT CondVT = Cond.getValueType();
6079 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
6080 unsigned NumElts = CondVT.getVectorNumElements();
6081
6082 APInt UndefElts;
6083 SmallVector<APInt, 32> EltBits;
6084 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
6085 /*AllowWholeUndefs*/ true,
6086 /*AllowPartialUndefs*/ false))
6087 return false;
6088
6089 Mask.resize(NumElts, SM_SentinelUndef);
6090
6091 for (int i = 0; i != (int)NumElts; ++i) {
6092 Mask[i] = i;
6093 // Arbitrarily choose from the 2nd operand if the select condition element
6094 // is undef.
6095 // TODO: Can we do better by matching patterns such as even/odd?
6096 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
6097 (IsBLENDV && EltBits[i].isNonNegative()))
6098 Mask[i] += NumElts;
6099 }
6100
6101 return true;
6102}
6103
6104// Forward declaration (for getFauxShuffleMask recursive check).
6105static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6106                                    SmallVectorImpl<SDValue> &Inputs,
6107                                    SmallVectorImpl<int> &Mask,
6108                                    const SelectionDAG &DAG, unsigned Depth,
6109 bool ResolveKnownElts);
6110
6111// Attempt to decode ops that could be represented as a shuffle mask.
6112// The decoded shuffle mask may contain a different number of elements to the
6113// destination value type.
6114// TODO: Merge into getTargetShuffleInputs()
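// For example, an X86ISD::VSHLI of v4i32 by 8 bits decodes to the 16-element
// byte mask {Z,0,1,2, Z,4,5,6, Z,8,9,10, Z,12,13,14} (Z = SM_SentinelZero),
// which has more elements than the v4i32 destination type.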
6115static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6116                                SmallVectorImpl<int> &Mask,
6117                                SmallVectorImpl<SDValue> &Ops,
6118                                const SelectionDAG &DAG, unsigned Depth,
6119 bool ResolveKnownElts) {
6120 Mask.clear();
6121 Ops.clear();
6122
6123 MVT VT = N.getSimpleValueType();
6124 unsigned NumElts = VT.getVectorNumElements();
6125 unsigned NumSizeInBits = VT.getSizeInBits();
6126 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6127 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6128 return false;
6129 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6130 unsigned NumSizeInBytes = NumSizeInBits / 8;
6131 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6132
6133 unsigned Opcode = N.getOpcode();
6134 switch (Opcode) {
6135 case ISD::VECTOR_SHUFFLE: {
6136 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6137 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6138 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6139 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6140 Ops.push_back(N.getOperand(0));
6141 Ops.push_back(N.getOperand(1));
6142 return true;
6143 }
6144 return false;
6145 }
6146 case ISD::AND:
6147 case X86ISD::ANDNP: {
6148 // Attempt to decode as a per-byte mask.
6149 APInt UndefElts;
6150 SmallVector<APInt, 32> EltBits;
6151 SDValue N0 = N.getOperand(0);
6152 SDValue N1 = N.getOperand(1);
6153 bool IsAndN = (X86ISD::ANDNP == Opcode);
6154 uint64_t ZeroMask = IsAndN ? 255 : 0;
6155 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6156 /*AllowWholeUndefs*/ false,
6157 /*AllowPartialUndefs*/ false))
6158 return false;
6159 // We can't assume an undef src element gives an undef dst - the other src
6160 // might be zero.
6161 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6162 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6163 const APInt &ByteBits = EltBits[i];
6164 if (ByteBits != 0 && ByteBits != 255)
6165 return false;
6166 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6167 }
6168 Ops.push_back(IsAndN ? N1 : N0);
6169 return true;
6170 }
6171 case ISD::OR: {
6172 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6173 // is a valid shuffle index.
6174 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6175 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6176 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6177 return false;
6178
6179 SmallVector<int, 64> SrcMask0, SrcMask1;
6180 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6181     APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
6182     APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
6183     if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6184 Depth + 1, true) ||
6185 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6186 Depth + 1, true))
6187 return false;
6188
6189 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6190 SmallVector<int, 64> Mask0, Mask1;
6191 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6192 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6193 for (int i = 0; i != (int)MaskSize; ++i) {
6194 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6195 // loops converting between OR and BLEND shuffles due to
6196 // canWidenShuffleElements merging away undef elements, meaning we
6197 // fail to recognise the OR as the undef element isn't known zero.
6198 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6199 Mask.push_back(SM_SentinelZero);
6200 else if (Mask1[i] == SM_SentinelZero)
6201 Mask.push_back(i);
6202 else if (Mask0[i] == SM_SentinelZero)
6203 Mask.push_back(i + MaskSize);
6204 else
6205 return false;
6206 }
6207 Ops.push_back(N.getOperand(0));
6208 Ops.push_back(N.getOperand(1));
6209 return true;
6210 }
6211 case ISD::CONCAT_VECTORS: {
6212 // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6213 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6214 if (NumBitsPerElt == 64) {
6215 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6216 for (unsigned M = 0; M != NumSubElts; ++M)
6217 Mask.push_back((I * NumElts) + M);
6218 Ops.push_back(N.getOperand(I));
6219 }
6220 return true;
6221 }
6222 return false;
6223 }
6224 case ISD::INSERT_SUBVECTOR: {
6225 SDValue Src = N.getOperand(0);
6226 SDValue Sub = N.getOperand(1);
6227 EVT SubVT = Sub.getValueType();
6228 unsigned NumSubElts = SubVT.getVectorNumElements();
6229 uint64_t InsertIdx = N.getConstantOperandVal(2);
6230 // Subvector isn't demanded - just return the base vector.
6231 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6232 Mask.resize(NumElts);
6233 std::iota(Mask.begin(), Mask.end(), 0);
6234 Ops.push_back(Src);
6235 return true;
6236 }
6237 // Handle CONCAT(SUB0, SUB1).
6238 // Limit to vXi64/splat cases to make the most of cross lane shuffles.
6239 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6240 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6241 Src.getOperand(0).isUndef() &&
6242 Src.getOperand(1).getValueType() == SubVT &&
6243 Src.getConstantOperandVal(2) == 0 &&
6244 (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6245 SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
6246 Mask.resize(NumElts);
6247 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6248 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6249 Ops.push_back(Src.getOperand(1));
6250 Ops.push_back(Sub);
6251 return true;
6252 }
6253 if (!N->isOnlyUserOf(Sub.getNode()))
6254 return false;
6255
6256 SmallVector<int, 64> SubMask;
6257 SmallVector<SDValue, 2> SubInputs;
6258     SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
6259     EVT SubSrcVT = SubSrc.getValueType();
6260 if (!SubSrcVT.isVector())
6261 return false;
6262
6263 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6264 if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6265 SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6266 uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
6267 SDValue SubSrcSrc = SubSrc.getOperand(0);
6268 unsigned NumSubSrcSrcElts =
6269 SubSrcSrc.getValueType().getVectorNumElements();
6270 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6271 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6272 "Subvector valuetype mismatch");
6273 InsertIdx *= (MaxElts / NumElts);
6274 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6275 NumSubElts *= (MaxElts / NumElts);
6276 bool SrcIsUndef = Src.isUndef();
6277 for (int i = 0; i != (int)MaxElts; ++i)
6278 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6279 for (int i = 0; i != (int)NumSubElts; ++i)
6280 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6281 if (!SrcIsUndef)
6282 Ops.push_back(Src);
6283 Ops.push_back(SubSrcSrc);
6284 return true;
6285 }
6286
6287 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6288 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6289 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6290 Depth + 1, ResolveKnownElts))
6291 return false;
6292
6293 // Subvector shuffle inputs must not be larger than the subvector.
6294 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6295 return SubVT.getFixedSizeInBits() <
6296 SubInput.getValueSizeInBits().getFixedValue();
6297 }))
6298 return false;
6299
6300 if (SubMask.size() != NumSubElts) {
6301 assert(((SubMask.size() % NumSubElts) == 0 ||
6302 (NumSubElts % SubMask.size()) == 0) &&
6303 "Illegal submask scale");
6304 if ((NumSubElts % SubMask.size()) == 0) {
6305 int Scale = NumSubElts / SubMask.size();
6306 SmallVector<int, 64> ScaledSubMask;
6307 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6308 SubMask = ScaledSubMask;
6309 } else {
6310 int Scale = SubMask.size() / NumSubElts;
6311 NumSubElts = SubMask.size();
6312 NumElts *= Scale;
6313 InsertIdx *= Scale;
6314 }
6315 }
6316 Ops.push_back(Src);
6317 Ops.append(SubInputs.begin(), SubInputs.end());
6318 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6319 Mask.append(NumElts, SM_SentinelZero);
6320 else
6321 for (int i = 0; i != (int)NumElts; ++i)
6322 Mask.push_back(i);
6323 for (int i = 0; i != (int)NumSubElts; ++i) {
6324 int M = SubMask[i];
6325 if (0 <= M) {
6326 int InputIdx = M / NumSubElts;
6327 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6328 }
6329 Mask[i + InsertIdx] = M;
6330 }
6331 return true;
6332 }
6333 case X86ISD::PINSRB:
6334 case X86ISD::PINSRW:
6335   case ISD::SCALAR_TO_VECTOR:
6336   case ISD::INSERT_VECTOR_ELT: {
6337     // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6338 // vector, for matching src/dst vector types.
6339 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6340
6341 unsigned DstIdx = 0;
6342 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6343 // Check we have an in-range constant insertion index.
6344 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6345 N.getConstantOperandAPInt(2).uge(NumElts))
6346 return false;
6347 DstIdx = N.getConstantOperandVal(2);
6348
6349 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6350 if (X86::isZeroNode(Scl)) {
6351 Ops.push_back(N.getOperand(0));
6352 for (unsigned i = 0; i != NumElts; ++i)
6353 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6354 return true;
6355 }
6356 }
6357
6358 // Peek through trunc/aext/zext/bitcast.
6359 // TODO: aext shouldn't require SM_SentinelZero padding.
6360 // TODO: handle shift of scalars.
6361 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6362 while (Scl.getOpcode() == ISD::TRUNCATE ||
6363 Scl.getOpcode() == ISD::ANY_EXTEND ||
6364 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6365 (Scl.getOpcode() == ISD::BITCAST &&
6368 Scl = Scl.getOperand(0);
6369 MinBitsPerElt =
6370 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6371 }
6372 if ((MinBitsPerElt % 8) != 0)
6373 return false;
6374
6375 // Attempt to find the source vector the scalar was extracted from.
6376 SDValue SrcExtract;
6377 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6378 Scl.getOpcode() == X86ISD::PEXTRW ||
6379 Scl.getOpcode() == X86ISD::PEXTRB) &&
6380 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6381 SrcExtract = Scl;
6382 }
6383 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6384 return false;
6385
6386 SDValue SrcVec = SrcExtract.getOperand(0);
6387 EVT SrcVT = SrcVec.getValueType();
6388 if (!SrcVT.getScalarType().isByteSized())
6389 return false;
6390 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6391 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6392 unsigned DstByte = DstIdx * NumBytesPerElt;
6393 MinBitsPerElt =
6394 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6395
6396 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6397 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6398 Ops.push_back(SrcVec);
6399 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6400 } else {
6401 Ops.push_back(SrcVec);
6402 Ops.push_back(N.getOperand(0));
6403 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6404 Mask.push_back(NumSizeInBytes + i);
6405 }
6406
6407 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6408 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6409 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6410 Mask[DstByte + i] = SrcByte + i;
6411 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6412 Mask[DstByte + i] = SM_SentinelZero;
6413 return true;
6414 }
6415 case X86ISD::PACKSS:
6416 case X86ISD::PACKUS: {
6417 SDValue N0 = N.getOperand(0);
6418 SDValue N1 = N.getOperand(1);
6419 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6420 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6421 "Unexpected input value type");
6422
6423 APInt EltsLHS, EltsRHS;
6424 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6425
6426 // If we know input saturation won't happen (or we don't care for particular
6427 // lanes), we can treat this as a truncation shuffle.
6428 bool Offset0 = false, Offset1 = false;
6429 if (Opcode == X86ISD::PACKSS) {
6430 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6431 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6432 (!(N1.isUndef() || EltsRHS.isZero()) &&
6433 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6434 return false;
6435 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6436 // PACKSS then it was likely being used for sign-extension for a
6437 // truncation, so just peek through and adjust the mask accordingly.
6438 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6439 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6440 Offset0 = true;
6441 N0 = N0.getOperand(0);
6442 }
6443 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6444 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6445 Offset1 = true;
6446 N1 = N1.getOperand(0);
6447 }
6448 } else {
6449 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6450 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6451 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6452 (!(N1.isUndef() || EltsRHS.isZero()) &&
6453 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6454 return false;
6455 }
6456
6457 bool IsUnary = (N0 == N1);
6458
6459 Ops.push_back(N0);
6460 if (!IsUnary)
6461 Ops.push_back(N1);
6462
6463 createPackShuffleMask(VT, Mask, IsUnary);
6464
6465 if (Offset0 || Offset1) {
6466 for (int &M : Mask)
6467 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6468 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6469 ++M;
6470 }
6471 return true;
6472 }
6473 case ISD::VSELECT:
6474 case X86ISD::BLENDV: {
6475 SDValue Cond = N.getOperand(0);
6476 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6477 Ops.push_back(N.getOperand(1));
6478 Ops.push_back(N.getOperand(2));
6479 return true;
6480 }
6481 return false;
6482 }
6483 case X86ISD::VTRUNC: {
6484 SDValue Src = N.getOperand(0);
6485 EVT SrcVT = Src.getValueType();
6486 if (SrcVT.getSizeInBits() != NumSizeInBits)
6487 return false;
6488 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6489 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6490 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6491 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6492 for (unsigned i = 0; i != NumSrcElts; ++i)
6493 Mask.push_back(i * Scale);
6494 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6495 Ops.push_back(Src);
6496 return true;
6497 }
6498 case ISD::SHL:
6499 case ISD::SRL: {
6500 APInt UndefElts;
6501 SmallVector<APInt, 32> EltBits;
6502 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6503 UndefElts, EltBits,
6504 /*AllowWholeUndefs*/ true,
6505 /*AllowPartialUndefs*/ false))
6506 return false;
6507
6508 // We can only decode 'whole byte' bit shifts as shuffles.
6509 for (unsigned I = 0; I != NumElts; ++I)
6510 if (DemandedElts[I] && !UndefElts[I] &&
6511 (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6512 return false;
6513
6514 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6515 Ops.push_back(N.getOperand(0));
6516
6517 for (unsigned I = 0; I != NumElts; ++I) {
6518 if (!DemandedElts[I] || UndefElts[I])
6519 continue;
6520 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6521 unsigned Lo = I * NumBytesPerElt;
6522 unsigned Hi = Lo + NumBytesPerElt;
6523 // Clear mask to all zeros and insert the shifted byte indices.
6524 std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6525 if (ISD::SHL == Opcode)
6526 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6527 else
6528 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6529 Lo + ByteShift);
6530 }
6531 return true;
6532 }
6533 case X86ISD::VSHLI:
6534 case X86ISD::VSRLI: {
6535 uint64_t ShiftVal = N.getConstantOperandVal(1);
6536 // Out of range bit shifts are guaranteed to be zero.
6537 if (NumBitsPerElt <= ShiftVal) {
6538 Mask.append(NumElts, SM_SentinelZero);
6539 return true;
6540 }
6541
6542 // We can only decode 'whole byte' bit shifts as shuffles.
6543 if ((ShiftVal % 8) != 0)
6544 break;
6545
6546 uint64_t ByteShift = ShiftVal / 8;
6547 Ops.push_back(N.getOperand(0));
6548
6549 // Clear mask to all zeros and insert the shifted byte indices.
6550 Mask.append(NumSizeInBytes, SM_SentinelZero);
6551
6552 if (X86ISD::VSHLI == Opcode) {
6553 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6554 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6555 Mask[i + j] = i + j - ByteShift;
6556 } else {
6557 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6558 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6559 Mask[i + j - ByteShift] = i + j;
6560 }
6561 return true;
6562 }
6563 case X86ISD::VROTLI:
6564 case X86ISD::VROTRI: {
6565 // We can only decode 'whole byte' bit rotates as shuffles.
6566 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6567 if ((RotateVal % 8) != 0)
6568 return false;
6569 Ops.push_back(N.getOperand(0));
6570 int Offset = RotateVal / 8;
6571 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6572 for (int i = 0; i != (int)NumElts; ++i) {
6573 int BaseIdx = i * NumBytesPerElt;
6574 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6575 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6576 }
6577 }
6578 return true;
6579 }
6580 case X86ISD::VBROADCAST: {
6581 SDValue Src = N.getOperand(0);
6582 if (!Src.getSimpleValueType().isVector()) {
6583 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6584 !isNullConstant(Src.getOperand(1)) ||
6585 Src.getOperand(0).getValueType().getScalarType() !=
6586 VT.getScalarType())
6587 return false;
6588 Src = Src.getOperand(0);
6589 }
6590 Ops.push_back(Src);
6591 Mask.append(NumElts, 0);
6592 return true;
6593 }
6594   case ISD::SIGN_EXTEND_VECTOR_INREG: {
6595     SDValue Src = N.getOperand(0);
6596 EVT SrcVT = Src.getValueType();
6597 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6598
6599 // Extended source must be a simple vector.
6600 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6601 (NumBitsPerSrcElt % 8) != 0)
6602 return false;
6603
6604 // We can only handle all-signbits extensions.
6605 APInt DemandedSrcElts =
6606 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6607 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6608 return false;
6609
6610 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6611 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6612 for (unsigned I = 0; I != NumElts; ++I)
6613 Mask.append(Scale, I);
6614 Ops.push_back(Src);
6615 return true;
6616 }
6617 case ISD::ZERO_EXTEND:
6618 case ISD::ANY_EXTEND:
6619   case ISD::ZERO_EXTEND_VECTOR_INREG:
6620   case ISD::ANY_EXTEND_VECTOR_INREG: {
6621     SDValue Src = N.getOperand(0);
6622 EVT SrcVT = Src.getValueType();
6623
6624 // Extended source must be a simple vector.
6625 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6626 (SrcVT.getScalarSizeInBits() % 8) != 0)
6627 return false;
6628
6629 bool IsAnyExtend =
6630 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6631 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6632 IsAnyExtend, Mask);
6633 Ops.push_back(Src);
6634 return true;
6635 }
6636 }
6637
6638 return false;
6639}
6640
6641/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
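/// For example, with Inputs = {A, A} and Mask = {0, 4, 1, 5}, the repeated
/// input is dropped and the mask becomes {0, 0, 1, 1} over Inputs = {A}.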
6642 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6643                                               SmallVectorImpl<int> &Mask) {
6644 int MaskWidth = Mask.size();
6645 SmallVector<SDValue, 16> UsedInputs;
6646 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6647 int lo = UsedInputs.size() * MaskWidth;
6648 int hi = lo + MaskWidth;
6649
6650 // Strip UNDEF input usage.
6651 if (Inputs[i].isUndef())
6652 for (int &M : Mask)
6653 if ((lo <= M) && (M < hi))
6654 M = SM_SentinelUndef;
6655
6656 // Check for unused inputs.
6657 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6658 for (int &M : Mask)
6659 if (lo <= M)
6660 M -= MaskWidth;
6661 continue;
6662 }
6663
6664 // Check for repeated inputs.
6665 bool IsRepeat = false;
6666 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6667 if (UsedInputs[j] != Inputs[i])
6668 continue;
6669 for (int &M : Mask)
6670 if (lo <= M)
6671 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6672 IsRepeat = true;
6673 break;
6674 }
6675 if (IsRepeat)
6676 continue;
6677
6678 UsedInputs.push_back(Inputs[i]);
6679 }
6680 Inputs = UsedInputs;
6681}
6682
6683/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6684/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6685/// Returns true if the target shuffle mask was decoded.
6686static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6687                                    SmallVectorImpl<SDValue> &Inputs,
6688                                    SmallVectorImpl<int> &Mask,
6689                                    APInt &KnownUndef, APInt &KnownZero,
6690 const SelectionDAG &DAG, unsigned Depth,
6691 bool ResolveKnownElts) {
6692   if (Depth >= SelectionDAG::MaxRecursionDepth)
6693     return false; // Limit search depth.
6694
6695 EVT VT = Op.getValueType();
6696 if (!VT.isSimple() || !VT.isVector())
6697 return false;
6698
6699 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6700 if (ResolveKnownElts)
6701 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6702 return true;
6703 }
6704 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6705 ResolveKnownElts)) {
6706 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6707 return true;
6708 }
6709 return false;
6710}
6711
6712static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6713                                    SmallVectorImpl<SDValue> &Inputs,
6714                                    SmallVectorImpl<int> &Mask,
6715                                    const SelectionDAG &DAG, unsigned Depth,
6716 bool ResolveKnownElts) {
6717 APInt KnownUndef, KnownZero;
6718 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6719 KnownZero, DAG, Depth, ResolveKnownElts);
6720}
6721
6722 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6723                                    SmallVectorImpl<int> &Mask,
6724                                    const SelectionDAG &DAG, unsigned Depth = 0,
6725 bool ResolveKnownElts = true) {
6726 EVT VT = Op.getValueType();
6727 if (!VT.isSimple() || !VT.isVector())
6728 return false;
6729
6730 unsigned NumElts = Op.getValueType().getVectorNumElements();
6731 APInt DemandedElts = APInt::getAllOnes(NumElts);
6732 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6733 ResolveKnownElts);
6734}
6735
6736// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6737static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6738 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6739 SelectionDAG &DAG) {
6740 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6741 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6742 "Unknown broadcast load type");
6743
6744   // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6745 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6746 return SDValue();
6747
6748   SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6749                                          TypeSize::getFixed(Offset), DL);
6750   SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6751 SDValue Ops[] = {Mem->getChain(), Ptr};
6752 SDValue BcstLd = DAG.getMemIntrinsicNode(
6753 Opcode, DL, Tys, Ops, MemVT,
6754       DAG.getMachineFunction().getMachineMemOperand(
6755           Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6756 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6757 return BcstLd;
6758}
6759
6760/// Returns the scalar element that will make up the i'th
6761/// element of the result of the vector shuffle.
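/// For example, for (shufflevector v4i32 A, v4i32 B, <6, 0, 1, 2>), Index 0
/// resolves to element 2 of B (recursing further if B is itself a shuffle).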
6762static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6763 SelectionDAG &DAG, unsigned Depth) {
6764   if (Depth >= SelectionDAG::MaxRecursionDepth)
6765     return SDValue(); // Limit search depth.
6766
6767 EVT VT = Op.getValueType();
6768 unsigned Opcode = Op.getOpcode();
6769 unsigned NumElems = VT.getVectorNumElements();
6770
6771 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6772 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6773 int Elt = SV->getMaskElt(Index);
6774
6775 if (Elt < 0)
6776 return DAG.getUNDEF(VT.getVectorElementType());
6777
6778 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6779 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6780 }
6781
6782 // Recurse into target specific vector shuffles to find scalars.
6783 if (isTargetShuffle(Opcode)) {
6784 MVT ShufVT = VT.getSimpleVT();
6785 MVT ShufSVT = ShufVT.getVectorElementType();
6786 int NumElems = (int)ShufVT.getVectorNumElements();
6787 SmallVector<int, 16> ShuffleMask;
6788     SmallVector<SDValue, 16> ShuffleOps;
6789     if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6790 return SDValue();
6791
6792 int Elt = ShuffleMask[Index];
6793 if (Elt == SM_SentinelZero)
6794 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6795 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6796 if (Elt == SM_SentinelUndef)
6797 return DAG.getUNDEF(ShufSVT);
6798
6799 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6800 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6801 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6802 }
6803
6804 // Recurse into insert_subvector base/sub vector to find scalars.
6805 if (Opcode == ISD::INSERT_SUBVECTOR) {
6806 SDValue Vec = Op.getOperand(0);
6807 SDValue Sub = Op.getOperand(1);
6808 uint64_t SubIdx = Op.getConstantOperandVal(2);
6809 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6810
6811 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6812 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6813 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6814 }
6815
6816 // Recurse into concat_vectors sub vector to find scalars.
6817 if (Opcode == ISD::CONCAT_VECTORS) {
6818 EVT SubVT = Op.getOperand(0).getValueType();
6819 unsigned NumSubElts = SubVT.getVectorNumElements();
6820 uint64_t SubIdx = Index / NumSubElts;
6821 uint64_t SubElt = Index % NumSubElts;
6822 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6823 }
6824
6825 // Recurse into extract_subvector src vector to find scalars.
6826 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6827 SDValue Src = Op.getOperand(0);
6828 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6829 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6830 }
6831
6832 // We only peek through bitcasts of the same vector width.
6833 if (Opcode == ISD::BITCAST) {
6834 SDValue Src = Op.getOperand(0);
6835 EVT SrcVT = Src.getValueType();
6836 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6837 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6838 return SDValue();
6839 }
6840
6841 // Actual nodes that may contain scalar elements
6842
6843 // For insert_vector_elt - either return the index matching scalar or recurse
6844 // into the base vector.
6845 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6846 isa<ConstantSDNode>(Op.getOperand(2))) {
6847 if (Op.getConstantOperandAPInt(2) == Index)
6848 return Op.getOperand(1);
6849 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6850 }
6851
6852 if (Opcode == ISD::SCALAR_TO_VECTOR)
6853 return (Index == 0) ? Op.getOperand(0)
6854 : DAG.getUNDEF(VT.getVectorElementType());
6855
6856 if (Opcode == ISD::BUILD_VECTOR)
6857 return Op.getOperand(Index);
6858
6859 return SDValue();
6860}
6861
6862// Use PINSRB/PINSRW/PINSRD to create a build vector.
6863 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6864                                         const APInt &NonZeroMask,
6865 unsigned NumNonZero, unsigned NumZero,
6866 SelectionDAG &DAG,
6867 const X86Subtarget &Subtarget) {
6868 MVT VT = Op.getSimpleValueType();
6869 unsigned NumElts = VT.getVectorNumElements();
6870 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6871 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6872 "Illegal vector insertion");
6873
6874 SDValue V;
6875 bool First = true;
6876
6877 for (unsigned i = 0; i < NumElts; ++i) {
6878 bool IsNonZero = NonZeroMask[i];
6879 if (!IsNonZero)
6880 continue;
6881
6882 // If the build vector contains zeros or our first insertion is not the
6883     // first index, then insert into a zero vector to break any register
6884     // dependency; else use SCALAR_TO_VECTOR.
6885 if (First) {
6886 First = false;
6887 if (NumZero || 0 != i)
6888 V = getZeroVector(VT, Subtarget, DAG, DL);
6889 else {
6890 assert(0 == i && "Expected insertion into zero-index");
6891 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6892 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6893 V = DAG.getBitcast(VT, V);
6894 continue;
6895 }
6896 }
6897 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6898 DAG.getVectorIdxConstant(i, DL));
6899 }
6900
6901 return V;
6902}
6903
6904/// Custom lower build_vector of v16i8.
6905 static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6906                                      const APInt &NonZeroMask,
6907 unsigned NumNonZero, unsigned NumZero,
6908 SelectionDAG &DAG,
6909 const X86Subtarget &Subtarget) {
6910 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6911 return SDValue();
6912
6913 // SSE4.1 - use PINSRB to insert each byte directly.
6914 if (Subtarget.hasSSE41())
6915 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6916 DAG, Subtarget);
6917
6918 SDValue V;
6919
6920 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6921   // If both of the two lowest 16-bit halves are non-zero, then convert to MOVD.
6922 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6923 !NonZeroMask.extractBits(2, 2).isZero()) {
6924 for (unsigned I = 0; I != 4; ++I) {
6925 if (!NonZeroMask[I])
6926 continue;
6927 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6928 if (I != 0)
6929 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6930 DAG.getConstant(I * 8, DL, MVT::i8));
6931 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6932 }
6933 assert(V && "Failed to fold v16i8 vector to zero");
6934 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6935 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6936 V = DAG.getBitcast(MVT::v8i16, V);
6937 }
6938 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6939 bool ThisIsNonZero = NonZeroMask[i];
6940 bool NextIsNonZero = NonZeroMask[i + 1];
6941 if (!ThisIsNonZero && !NextIsNonZero)
6942 continue;
6943
6944 SDValue Elt;
6945 if (ThisIsNonZero) {
6946 if (NumZero || NextIsNonZero)
6947 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6948 else
6949 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6950 }
6951
6952 if (NextIsNonZero) {
6953 SDValue NextElt = Op.getOperand(i + 1);
6954 if (i == 0 && NumZero)
6955 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6956 else
6957 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6958 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6959 DAG.getConstant(8, DL, MVT::i8));
6960 if (ThisIsNonZero)
6961 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6962 else
6963 Elt = NextElt;
6964 }
6965
6966 // If our first insertion is not the first index or zeros are needed, then
6967 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6968 // elements undefined).
6969 if (!V) {
6970 if (i != 0 || NumZero)
6971 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6972 else {
6973 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6974 V = DAG.getBitcast(MVT::v8i16, V);
6975 continue;
6976 }
6977 }
6978 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6979 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6980 DAG.getVectorIdxConstant(i / 2, DL));
6981 }
6982
6983 return DAG.getBitcast(MVT::v16i8, V);
6984}
6985
6986/// Custom lower build_vector of v8i16.
6987static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6988 const APInt &NonZeroMask,
6989 unsigned NumNonZero, unsigned NumZero,
6990 SelectionDAG &DAG,
6991 const X86Subtarget &Subtarget) {
6992 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6993 return SDValue();
6994
6995 // Use PINSRW to insert each byte directly.
6996 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6997 Subtarget);
6998}
6999
7000/// Custom lower build_vector of v4i32 or v4f32.
7001static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
7002 SelectionDAG &DAG,
7003 const X86Subtarget &Subtarget) {
7004 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
7005 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
7006 // Because we're creating a less complicated build vector here, we may enable
7007 // further folding of the MOVDDUP via shuffle transforms.
7008 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
7009 Op.getOperand(0) == Op.getOperand(2) &&
7010 Op.getOperand(1) == Op.getOperand(3) &&
7011 Op.getOperand(0) != Op.getOperand(1)) {
7012 MVT VT = Op.getSimpleValueType();
7013 MVT EltVT = VT.getVectorElementType();
7014 // Create a new build vector with the first 2 elements followed by undef
7015 // padding, bitcast to v2f64, duplicate, and bitcast back.
7016 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7017 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7018 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7019 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7020 return DAG.getBitcast(VT, Dup);
7021 }
7022
7023 // Find all zeroable elements.
7024 std::bitset<4> Zeroable, Undefs;
7025 for (int i = 0; i < 4; ++i) {
7026 SDValue Elt = Op.getOperand(i);
7027 Undefs[i] = Elt.isUndef();
7028 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7029 }
7030 assert(Zeroable.size() - Zeroable.count() > 1 &&
7031 "We expect at least two non-zero elements!");
7032
7033 // We only know how to deal with build_vector nodes where elements are either
7034 // zeroable or extract_vector_elt with constant index.
7035 SDValue FirstNonZero;
7036 unsigned FirstNonZeroIdx;
7037 for (unsigned i = 0; i < 4; ++i) {
7038 if (Zeroable[i])
7039 continue;
7040 SDValue Elt = Op.getOperand(i);
7041 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7042 !isa<ConstantSDNode>(Elt.getOperand(1)))
7043 return SDValue();
7044 // Make sure that this node is extracting from a 128-bit vector.
7045 MVT VT = Elt.getOperand(0).getSimpleValueType();
7046 if (!VT.is128BitVector())
7047 return SDValue();
7048 if (!FirstNonZero.getNode()) {
7049 FirstNonZero = Elt;
7050 FirstNonZeroIdx = i;
7051 }
7052 }
7053
7054 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7055 SDValue V1 = FirstNonZero.getOperand(0);
7056 MVT VT = V1.getSimpleValueType();
7057
7058 // See if this build_vector can be lowered as a blend with zero.
7059 SDValue Elt;
7060 unsigned EltMaskIdx, EltIdx;
7061 int Mask[4];
7062 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7063 if (Zeroable[EltIdx]) {
7064 // The zero vector will be on the right hand side.
7065 Mask[EltIdx] = EltIdx+4;
7066 continue;
7067 }
7068
7069 Elt = Op->getOperand(EltIdx);
7070 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
7071 EltMaskIdx = Elt.getConstantOperandVal(1);
7072 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7073 break;
7074 Mask[EltIdx] = EltIdx;
7075 }
7076
7077 if (EltIdx == 4) {
7078 // Let the shuffle legalizer deal with blend operations.
7079 SDValue VZeroOrUndef = (Zeroable == Undefs)
7080 ? DAG.getUNDEF(VT)
7081 : getZeroVector(VT, Subtarget, DAG, DL);
7082 if (V1.getSimpleValueType() != VT)
7083 V1 = DAG.getBitcast(VT, V1);
7084 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7085 }
7086
7087 // See if we can lower this build_vector to a INSERTPS.
7088 if (!Subtarget.hasSSE41())
7089 return SDValue();
7090
7091 SDValue V2 = Elt.getOperand(0);
7092 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7093 V1 = SDValue();
7094
7095 bool CanFold = true;
7096 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7097 if (Zeroable[i])
7098 continue;
7099
7100 SDValue Current = Op->getOperand(i);
7101 SDValue SrcVector = Current->getOperand(0);
7102 if (!V1.getNode())
7103 V1 = SrcVector;
7104 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7105 }
7106
7107 if (!CanFold)
7108 return SDValue();
7109
7110 assert(V1.getNode() && "Expected at least two non-zero elements!");
7111 if (V1.getSimpleValueType() != MVT::v4f32)
7112 V1 = DAG.getBitcast(MVT::v4f32, V1);
7113 if (V2.getSimpleValueType() != MVT::v4f32)
7114 V2 = DAG.getBitcast(MVT::v4f32, V2);
7115
7116 // Ok, we can emit an INSERTPS instruction.
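// INSERTPS immediate layout: bits [7:6] select the element read from V2
// (EltMaskIdx), bits [5:4] select the destination lane in V1 (EltIdx), and
// bits [3:0] form the zero mask that clears the corresponding result lanes.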
7117 unsigned ZMask = Zeroable.to_ulong();
7118
7119 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7120 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7121 SDValue Result =
7122 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7123 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
7124 return DAG.getBitcast(VT, Result);
7125}
7126
7127/// Return a vector logical shift node.
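/// X86ISD::VSHLDQ / X86ISD::VSRLDQ shift the whole 128-bit register by a byte
/// amount (PSLLDQ/PSRLDQ), which is why the source is bitcast to v16i8 and
/// NumBits is converted to a byte count below.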
7128static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7129 SelectionDAG &DAG, const TargetLowering &TLI,
7130 const SDLoc &dl) {
7131 assert(VT.is128BitVector() && "Unknown type for VShift");
7132 MVT ShVT = MVT::v16i8;
7133 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7134 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7135 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7136 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7137 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7138}
7139
7140static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
7141 SelectionDAG &DAG) {
7142
7143 // Check if the scalar load can be widened into a vector load. And if
7144 // the address is "base + cst" see if the cst can be "absorbed" into
7145 // the shuffle mask.
7146 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7147 SDValue Ptr = LD->getBasePtr();
7148 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7149 return SDValue();
7150 EVT PVT = LD->getValueType(0);
7151 if (PVT != MVT::i32 && PVT != MVT::f32)
7152 return SDValue();
7153
7154 int FI = -1;
7155 int64_t Offset = 0;
7156 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7157 FI = FINode->getIndex();
7158 Offset = 0;
7159 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7160 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7161 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7162 Offset = Ptr.getConstantOperandVal(1);
7163 Ptr = Ptr.getOperand(0);
7164 } else {
7165 return SDValue();
7166 }
7167
7168 // FIXME: 256-bit vector instructions don't require a strict alignment,
7169 // improve this code to support it better.
7170 Align RequiredAlign(VT.getSizeInBits() / 8);
7171 SDValue Chain = LD->getChain();
7172 // Make sure the stack object alignment is at least 16 or 32.
7173 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7174 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7175 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7176 if (MFI.isFixedObjectIndex(FI)) {
7177 // Can't change the alignment. FIXME: It's possible to compute
7178 // the exact stack offset and reference FI + adjust offset instead.
7179 // If someone *really* cares about this. That's the way to implement it.
7180 return SDValue();
7181 } else {
7182 MFI.setObjectAlignment(FI, RequiredAlign);
7183 }
7184 }
7185
7186 // (Offset % 16 or 32) must be a multiple of 4. The address is then
7187 // Ptr + (Offset & ~(RequiredAlign - 1)).
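// For example, a 4-byte load at FI+20 within a 16-byte-aligned slot becomes a
// full-width load at FI+16 followed by a splat of element (20 - 16) / 4 == 1.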
7188 if (Offset < 0)
7189 return SDValue();
7190 if ((Offset % RequiredAlign.value()) & 3)
7191 return SDValue();
7192 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7193 if (StartOffset) {
7194 SDLoc DL(Ptr);
7195 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7196 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7197 }
7198
7199 int EltNo = (Offset - StartOffset) >> 2;
7200 unsigned NumElems = VT.getVectorNumElements();
7201
7202 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7203 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7204 LD->getPointerInfo().getWithOffset(StartOffset));
7205
7206 SmallVector<int, 8> Mask(NumElems, EltNo);
7207
7208 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7209 }
7210
7211 return SDValue();
7212}
7213
7214// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
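// For example, (trunc (srl (load i64 p), 32)) resolves to the load of p with
// ByteOffset == 4.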
7215static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7216 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7217 auto *BaseLd = cast<LoadSDNode>(Elt);
7218 if (!BaseLd->isSimple())
7219 return false;
7220 Ld = BaseLd;
7221 ByteOffset = 0;
7222 return true;
7223 }
7224
7225 switch (Elt.getOpcode()) {
7226 case ISD::BITCAST:
7227 case ISD::TRUNCATE:
7228 case ISD::SCALAR_TO_VECTOR:
7229 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7230 case ISD::SRL:
7231 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7232 uint64_t Amt = AmtC->getZExtValue();
7233 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7234 ByteOffset += Amt / 8;
7235 return true;
7236 }
7237 }
7238 break;
7239 case ISD::EXTRACT_VECTOR_ELT:
7240 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7241 SDValue Src = Elt.getOperand(0);
7242 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7243 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7244 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7245 findEltLoadSrc(Src, Ld, ByteOffset)) {
7246 uint64_t Idx = IdxC->getZExtValue();
7247 ByteOffset += Idx * (SrcSizeInBits / 8);
7248 return true;
7249 }
7250 }
7251 break;
7252 }
7253
7254 return false;
7255}
7256
7257/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7258/// elements can be replaced by a single large load which has the same value as
7259/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7260///
7261/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7262static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
7263 const SDLoc &DL, SelectionDAG &DAG,
7264 const X86Subtarget &Subtarget,
7265 bool IsAfterLegalize) {
7266 if ((VT.getScalarSizeInBits() % 8) != 0)
7267 return SDValue();
7268
7269 unsigned NumElems = Elts.size();
7270
7271 int LastLoadedElt = -1;
7272 APInt LoadMask = APInt::getZero(NumElems);
7273 APInt ZeroMask = APInt::getZero(NumElems);
7274 APInt UndefMask = APInt::getZero(NumElems);
7275
7276 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7277 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7278
7279 // For each element in the initializer, see if we've found a load, zero or an
7280 // undef.
7281 for (unsigned i = 0; i < NumElems; ++i) {
7282 SDValue Elt = peekThroughBitcasts(Elts[i]);
7283 if (!Elt.getNode())
7284 return SDValue();
7285 if (Elt.isUndef()) {
7286 UndefMask.setBit(i);
7287 continue;
7288 }
7289 if (X86::isZeroNode(Elt)) {
7290 ZeroMask.setBit(i);
7291 continue;
7292 }
7293
7294 // Each loaded element must be the correct fractional portion of the
7295 // requested vector load.
7296 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7297 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7298 return SDValue();
7299
7300 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7301 return SDValue();
7302 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7303 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7304 return SDValue();
7305
7306 LoadMask.setBit(i);
7307 LastLoadedElt = i;
7308 }
7309 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7310 NumElems &&
7311 "Incomplete element masks");
7312
7313 // Handle Special Cases - all undef or undef/zero.
7314 if (UndefMask.popcount() == NumElems)
7315 return DAG.getUNDEF(VT);
7316 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7317 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7318 : DAG.getConstantFP(0.0, DL, VT);
7319
7320 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7321 int FirstLoadedElt = LoadMask.countr_zero();
7322 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7323 EVT EltBaseVT = EltBase.getValueType();
7324 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7325 "Register/Memory size mismatch");
7326 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7327 assert(LDBase && "Did not find base load for merging consecutive loads");
7328 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7329 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7330 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7331 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7332 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7333
7334 // TODO: Support offsetting the base load.
7335 if (ByteOffsets[FirstLoadedElt] != 0)
7336 return SDValue();
7337
7338 // Check to see if the element's load is consecutive to the base load
7339 // or offset from a previous (already checked) load.
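// If the element was extracted at a non-zero byte offset inside its load, we
// require that some earlier element (BaseIdx) already covers the same load at
// offset 0; otherwise the load must sit exactly
// BaseSizeInBytes * (EltIdx - FirstLoadedElt) bytes after the base load.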
7340 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7341 LoadSDNode *Ld = Loads[EltIdx];
7342 int64_t ByteOffset = ByteOffsets[EltIdx];
7343 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7344 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7345 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7346 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7347 }
7348 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7349 EltIdx - FirstLoadedElt);
7350 };
7351
7352 // Consecutive loads can contain UNDEFs but not ZERO elements.
7353 // Consecutive loads with UNDEF and ZERO elements require an
7354 // additional shuffle stage to clear the ZERO elements.
7355 bool IsConsecutiveLoad = true;
7356 bool IsConsecutiveLoadWithZeros = true;
7357 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7358 if (LoadMask[i]) {
7359 if (!CheckConsecutiveLoad(LDBase, i)) {
7360 IsConsecutiveLoad = false;
7361 IsConsecutiveLoadWithZeros = false;
7362 break;
7363 }
7364 } else if (ZeroMask[i]) {
7365 IsConsecutiveLoad = false;
7366 }
7367 }
7368
7369 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7370 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7371 assert(LDBase->isSimple() &&
7372 "Cannot merge volatile or atomic loads.");
7373 SDValue NewLd =
7374 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7375 LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
7376 for (auto *LD : Loads)
7377 if (LD)
7378 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7379 return NewLd;
7380 };
7381
7382 // Check if the base load is entirely dereferenceable.
7383 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7384 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7385
7386 // LOAD - all consecutive load/undefs (must start/end with a load or be
7387 // entirely dereferenceable). If we have found an entire vector of loads and
7388 // undefs, then return a large load of the entire vector width starting at the
7389 // base pointer. If the vector contains zeros, then attempt to shuffle those
7390 // elements.
7391 if (FirstLoadedElt == 0 &&
7392 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7393 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7394 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7395 return SDValue();
7396
7397 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7398 // will lower to regular temporal loads and use the cache.
7399 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7400 VT.is256BitVector() && !Subtarget.hasInt256())
7401 return SDValue();
7402
7403 if (NumElems == 1)
7404 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7405
7406 if (!ZeroMask)
7407 return CreateLoad(VT, LDBase);
7408
7409 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7410 // vector and a zero vector to clear out the zero elements.
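// For example, for <ld0, zero, ld2, ld3> with Scale == 1 the clear mask is
// <0, 5, 2, 3>, taking lane 1 from the zero vector operand.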
7411 if (!IsAfterLegalize && VT.isVector()) {
7412 unsigned NumMaskElts = VT.getVectorNumElements();
7413 if ((NumMaskElts % NumElems) == 0) {
7414 unsigned Scale = NumMaskElts / NumElems;
7415 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7416 for (unsigned i = 0; i < NumElems; ++i) {
7417 if (UndefMask[i])
7418 continue;
7419 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7420 for (unsigned j = 0; j != Scale; ++j)
7421 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7422 }
7423 SDValue V = CreateLoad(VT, LDBase);
7424 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7425 : DAG.getConstantFP(0.0, DL, VT);
7426 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7427 }
7428 }
7429 }
7430
7431 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7432 if (VT.is256BitVector() || VT.is512BitVector()) {
7433 unsigned HalfNumElems = NumElems / 2;
7434 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7435 EVT HalfVT =
7436 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7437 SDValue HalfLD =
7438 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7439 DAG, Subtarget, IsAfterLegalize);
7440 if (HalfLD)
7441 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7442 HalfLD, DAG.getVectorIdxConstant(0, DL));
7443 }
7444 }
7445
7446 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7447 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7448 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7449 LoadSizeInBits == 64) &&
7450 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7451 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7452 : MVT::getIntegerVT(LoadSizeInBits);
7453 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7454 // Allow v4f32 on SSE1 only targets.
7455 // FIXME: Add more isel patterns so we can just use VT directly.
7456 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7457 VecVT = MVT::v4f32;
7458 if (TLI.isTypeLegal(VecVT)) {
7459 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7460 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7461 SDValue ResNode = DAG.getMemIntrinsicNode(
7462 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7463 LDBase->getBaseAlign(), MachineMemOperand::MOLoad);
7464 for (auto *LD : Loads)
7465 if (LD)
7466 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7467 return DAG.getBitcast(VT, ResNode);
7468 }
7469 }
7470
7471 // BROADCAST - match the smallest possible repetition pattern, load that
7472 // scalar/subvector element and then broadcast to the entire vector.
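// For example, a v8i32 build vector of the form <a,b,a,b,a,b,a,b> repeats
// every two elements, so the 64-bit pair <a,b> can be loaded once and then
// broadcast to fill the whole vector.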
7473 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7474 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7475 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7476 unsigned RepeatSize = SubElems * BaseSizeInBits;
7477 unsigned ScalarSize = std::min(RepeatSize, 64u);
7478 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7479 continue;
7480
7481 // Don't attempt a 1:N subvector broadcast - it should be caught by
7482 // combineConcatVectorOps, else will cause infinite loops.
7483 if (RepeatSize > ScalarSize && SubElems == 1)
7484 continue;
7485
7486 bool Match = true;
7487 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7488 for (unsigned i = 0; i != NumElems && Match; ++i) {
7489 if (!LoadMask[i])
7490 continue;
7491 SDValue Elt = peekThroughBitcasts(Elts[i]);
7492 if (RepeatedLoads[i % SubElems].isUndef())
7493 RepeatedLoads[i % SubElems] = Elt;
7494 else
7495 Match &= (RepeatedLoads[i % SubElems] == Elt);
7496 }
7497
7498 // We must have loads at both ends of the repetition.
7499 Match &= !RepeatedLoads.front().isUndef();
7500 Match &= !RepeatedLoads.back().isUndef();
7501 if (!Match)
7502 continue;
7503
7504 EVT RepeatVT =
7505 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7506 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7507 : EVT::getFloatingPointVT(ScalarSize);
7508 if (RepeatSize > ScalarSize)
7509 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7510 RepeatSize / ScalarSize);
7511 EVT BroadcastVT =
7512 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7513 VT.getSizeInBits() / ScalarSize);
7514 if (TLI.isTypeLegal(BroadcastVT)) {
7515 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7516 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7517 SDValue Broadcast = RepeatLoad;
7518 if (RepeatSize > ScalarSize) {
7519 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7520 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7521 } else {
7522 if (!Subtarget.hasAVX2() &&
7523 !X86::mayFoldLoadIntoBroadcastFromMem(
7524 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7525 Subtarget,
7526 /*AssumeSingleUse=*/true))
7527 return SDValue();
7528 Broadcast =
7529 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7530 }
7531 return DAG.getBitcast(VT, Broadcast);
7532 }
7533 }
7534 }
7535 }
7536
7537 return SDValue();
7538}
7539
7540// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
7541// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7542// are consecutive, non-overlapping, and in the right order.
7543static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7544 SelectionDAG &DAG,
7545 const X86Subtarget &Subtarget,
7546 bool IsAfterLegalize) {
7547 SmallVector<SDValue, 64> Elts;
7548 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7549 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7550 Elts.push_back(Elt);
7551 continue;
7552 }
7553 return SDValue();
7554 }
7555 assert(Elts.size() == VT.getVectorNumElements());
7556 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7557 IsAfterLegalize);
7558}
7559
7560static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7561 const APInt &Undefs, LLVMContext &C) {
7562 unsigned ScalarSize = VT.getScalarSizeInBits();
7563 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7564
7565 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7566 if (VT.isFloatingPoint()) {
7567 if (ScalarSize == 16)
7568 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7569 if (ScalarSize == 32)
7570 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7571 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7572 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7573 }
7574 return Constant::getIntegerValue(Ty, Val);
7575 };
7576
7577 SmallVector<Constant *, 32> ConstantVec;
7578 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7579 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7580 : getConstantScalar(Bits[I]));
7581
7582 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7583}
7584
7585static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7586 unsigned SplatBitSize, LLVMContext &C) {
7587 unsigned ScalarSize = VT.getScalarSizeInBits();
7588
7589 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7590 if (VT.isFloatingPoint()) {
7591 if (ScalarSize == 16)
7592 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7593 if (ScalarSize == 32)
7594 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7595 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7596 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7597 }
7598 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7599 };
7600
7601 if (ScalarSize == SplatBitSize)
7602 return getConstantScalar(SplatValue);
7603
7604 unsigned NumElm = SplatBitSize / ScalarSize;
7605 SmallVector<Constant *, 32> ConstantVec;
7606 for (unsigned I = 0; I != NumElm; ++I) {
7607 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7608 ConstantVec.push_back(getConstantScalar(Val));
7609 }
7610 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7611}
7612
7613static bool isFoldableUseOfShuffle(SDNode *N) {
7614 for (auto *U : N->users()) {
7615 unsigned Opc = U->getOpcode();
7616 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7617 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7618 return false;
7619 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7620 return false;
7621 if (isTargetShuffle(Opc))
7622 return true;
7623 if (Opc == ISD::BITCAST) // Ignore bitcasts
7624 return isFoldableUseOfShuffle(U);
7625 if (N->hasOneUse()) {
7626 // TODO, there may be some general way to know if a SDNode can
7627 // be folded. We now only know whether an MI is foldable.
7628 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7629 return false;
7630 return true;
7631 }
7632 }
7633 return false;
7634}
7635
7636// If the node has a single use by a VSELECT then AVX512 targets may be able to
7637// fold as a predicated instruction.
7638static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) {
7639 unsigned SizeInBits = V.getValueSizeInBits();
7640 if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
7641 (SizeInBits >= 128 && Subtarget.hasVLX())) {
7642 if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
7643 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
7644 return true;
7645 }
7646 }
7647 return false;
7648}
7649
7650/// Attempt to use the vbroadcast instruction to generate a splat value
7651/// from a splat BUILD_VECTOR which uses:
7652/// a. A single scalar load, or a constant.
7653/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7654///
7655/// The VBROADCAST node is returned when a pattern is found,
7656/// or SDValue() otherwise.
7657static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7658 const SDLoc &dl,
7659 const X86Subtarget &Subtarget,
7660 SelectionDAG &DAG) {
7661 // VBROADCAST requires AVX.
7662 // TODO: Splats could be generated for non-AVX CPUs using SSE
7663 // instructions, but there's less potential gain for only 128-bit vectors.
7664 if (!Subtarget.hasAVX())
7665 return SDValue();
7666
7667 MVT VT = BVOp->getSimpleValueType(0);
7668 unsigned NumElts = VT.getVectorNumElements();
7669 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7670 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7671 "Unsupported vector type for broadcast.");
7672
7673 // See if the build vector is a repeating sequence of scalars (inc. splat).
7674 SDValue Ld;
7675 BitVector UndefElements;
7676 SmallVector<SDValue, 16> Sequence;
7677 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7678 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7679 if (Sequence.size() == 1)
7680 Ld = Sequence[0];
7681 }
7682
7683 // Attempt to use VBROADCASTM
7684 // From this pattern:
7685 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7686 // b. t1 = (build_vector t0 t0)
7687 //
7688 // Create (VBROADCASTM v2i1 X)
7689 if (!Sequence.empty() && Subtarget.hasCDI()) {
7690 // If not a splat, are the upper sequence values zeroable?
7691 unsigned SeqLen = Sequence.size();
7692 bool UpperZeroOrUndef =
7693 SeqLen == 1 ||
7694 llvm::all_of(ArrayRef(Sequence).drop_front(),
7695 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7696 SDValue Op0 = Sequence[0];
7697 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7698 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7699 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7700 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7701 ? Op0.getOperand(0)
7702 : Op0.getOperand(0).getOperand(0);
7703 MVT MaskVT = BOperand.getSimpleValueType();
7704 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7705 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7706 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7707 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7708 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7709 unsigned Scale = 512 / VT.getSizeInBits();
7710 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7711 }
7712 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7713 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7714 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7715 return DAG.getBitcast(VT, Bcst);
7716 }
7717 }
7718 }
7719
7720 unsigned NumUndefElts = UndefElements.count();
7721 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7722 APInt SplatValue, Undef;
7723 unsigned SplatBitSize;
7724 bool HasUndef;
7725 // Check if this is a repeated constant pattern suitable for broadcasting.
7726 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7727 SplatBitSize > VT.getScalarSizeInBits() &&
7728 SplatBitSize < VT.getSizeInBits()) {
7729 // Avoid replacing with broadcast when it's a use of a shuffle
7730 // instruction to preserve the present custom lowering of shuffles.
7731 if (isFoldableUseOfShuffle(BVOp))
7732 return SDValue();
7733 // replace BUILD_VECTOR with broadcast of the repeated constants.
7734 LLVMContext *Ctx = DAG.getContext();
7735 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7736 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7737 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7738 // Load the constant scalar/subvector and broadcast it.
7739 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7740 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7741 SDValue CP = DAG.getConstantPool(C, PVT);
7742 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7743
7744 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7745 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7746 SDValue Ops[] = {DAG.getEntryNode(), CP};
7747 MachinePointerInfo MPI =
7748 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7749 SDValue Brdcst =
7750 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7751 MPI, Alignment, MachineMemOperand::MOLoad);
7752 return DAG.getBitcast(VT, Brdcst);
7753 }
7754 if (SplatBitSize > 64) {
7755 // Load the vector of constants and broadcast it.
7756 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7757 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7758 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7759 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7760 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7761 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7762 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7763 MachinePointerInfo MPI =
7764 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7765 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7766 Ops, VVT, MPI, Alignment,
7767 MachineMemOperand::MOLoad);
7768 }
7769 }
7770
7771 // If we are moving a scalar into a vector (Ld must be set and all elements
7772 // but 1 are undef) and that operation is not obviously supported by
7773 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7774 // That's better than general shuffling and may eliminate a load to GPR and
7775 // move from scalar to vector register.
7776 if (!Ld || NumElts - NumUndefElts != 1)
7777 return SDValue();
7778 unsigned ScalarSize = Ld.getValueSizeInBits();
7779 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7780 return SDValue();
7781 }
7782
7783 bool ConstSplatVal =
7784 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7785 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7786
7787 // TODO: Handle broadcasts of non-constant sequences.
7788
7789 // Make sure that all of the users of a non-constant load are from the
7790 // BUILD_VECTOR node.
7791 // FIXME: Is the use count needed for non-constant, non-load case?
7792 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7793 return SDValue();
7794
7795 unsigned ScalarSize = Ld.getValueSizeInBits();
7796 bool IsGE256 = (VT.getSizeInBits() >= 256);
7797
7798 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7799 // instruction to save 8 or more bytes of constant pool data.
7800 // TODO: If multiple splats are generated to load the same constant,
7801 // it may be detrimental to overall size. There needs to be a way to detect
7802 // that condition to know if this is truly a size win.
7803 bool OptForSize = DAG.shouldOptForSize();
7804
7805 // Handle broadcasting a single constant scalar from the constant pool
7806 // into a vector.
7807 // On Sandybridge (no AVX2), it is still better to load a constant vector
7808 // from the constant pool and not to broadcast it from a scalar.
7809 // But override that restriction when optimizing for size.
7810 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7811 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7812 EVT CVT = Ld.getValueType();
7813 assert(!CVT.isVector() && "Must not broadcast a vector type");
7814
7815 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7816 // For size optimization, also splat v2f64 and v2i64, and for size opt
7817 // with AVX2, also splat i8 and i16.
7818 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7819 if (ScalarSize == 32 ||
7820 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7821 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7822 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7823 const Constant *C = nullptr;
7824 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7825 C = CI->getConstantIntValue();
7826 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7827 C = CF->getConstantFPValue();
7828
7829 assert(C && "Invalid constant type");
7830
7831 SDValue CP =
7832 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7833 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7834
7835 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7836 SDValue Ops[] = {DAG.getEntryNode(), CP};
7837 MachinePointerInfo MPI =
7838 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7839 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7840 MPI, Alignment, MachineMemOperand::MOLoad);
7841 }
7842 }
7843
7844 // Handle AVX2 in-register broadcasts.
7845 if (!IsLoad && Subtarget.hasInt256() &&
7846 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7847 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7848
7849 // The scalar source must be a normal load.
7850 if (!IsLoad)
7851 return SDValue();
7852
7853 // Make sure the non-chain result is only used by this build vector.
7854 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7855 return SDValue();
7856
7857 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7858 (Subtarget.hasVLX() && ScalarSize == 64)) {
7859 auto *LN = cast<LoadSDNode>(Ld);
7860 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7861 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7862 SDValue BCast =
7863 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7864 LN->getMemoryVT(), LN->getMemOperand());
7865 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7866 return BCast;
7867 }
7868
7869 // The integer check is needed for the 64-bit into 128-bit case, so it
7870 // doesn't match double since there is no vbroadcastsd xmm instruction.
7871 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7872 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7873 auto *LN = cast<LoadSDNode>(Ld);
7874 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7875 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7876 SDValue BCast =
7877 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7878 LN->getMemoryVT(), LN->getMemOperand());
7879 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7880 return BCast;
7881 }
7882
7883 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7884 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7885
7886 // Unsupported broadcast.
7887 return SDValue();
7888}
7889
7890/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7891/// underlying vector and index.
7892///
7893/// Modifies \p ExtractedFromVec to the real vector and returns the real
7894/// index.
7895static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7896 SDValue ExtIdx) {
7897 int Idx = ExtIdx->getAsZExtVal();
7898 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7899 return Idx;
7900
7901 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7902 // lowered this:
7903 // (extract_vector_elt (v8f32 %1), Constant<6>)
7904 // to:
7905 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7906 // (extract_subvector (v8f32 %0), Constant<4>),
7907 // undef)
7908 // Constant<0>)
7909 // In this case the vector is the extract_subvector expression and the index
7910 // is 2, as specified by the shuffle.
7911 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7912 SDValue ShuffleVec = SVOp->getOperand(0);
7913 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7914 assert(ShuffleVecVT.getVectorElementType() ==
7915 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7916
7917 int ShuffleIdx = SVOp->getMaskElt(Idx);
7918 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7919 ExtractedFromVec = ShuffleVec;
7920 return ShuffleIdx;
7921 }
7922 return Idx;
7923}
7924
7925static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7926 SelectionDAG &DAG) {
7927 MVT VT = Op.getSimpleValueType();
7928
7929 // Skip if insert_vec_elt is not supported.
7930 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7931 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7932 return SDValue();
7933
7934 unsigned NumElems = Op.getNumOperands();
7935 SDValue VecIn1;
7936 SDValue VecIn2;
7937 SmallVector<unsigned, 4> InsertIndices;
7938 SmallVector<int, 8> Mask(NumElems, -1);
7939
7940 for (unsigned i = 0; i != NumElems; ++i) {
7941 unsigned Opc = Op.getOperand(i).getOpcode();
7942
7943 if (Opc == ISD::UNDEF)
7944 continue;
7945
7946 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7947 // Quit if more than 1 element needs inserting.
7948 if (InsertIndices.size() > 1)
7949 return SDValue();
7950
7951 InsertIndices.push_back(i);
7952 continue;
7953 }
7954
7955 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7956 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7957
7958 // Quit if non-constant index.
7959 if (!isa<ConstantSDNode>(ExtIdx))
7960 return SDValue();
7961 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7962
7963 // Quit if extracted from vector of different type.
7964 if (ExtractedFromVec.getValueType() != VT)
7965 return SDValue();
7966
7967 if (!VecIn1.getNode())
7968 VecIn1 = ExtractedFromVec;
7969 else if (VecIn1 != ExtractedFromVec) {
7970 if (!VecIn2.getNode())
7971 VecIn2 = ExtractedFromVec;
7972 else if (VecIn2 != ExtractedFromVec)
7973 // Quit if more than 2 vectors to shuffle
7974 return SDValue();
7975 }
7976
7977 if (ExtractedFromVec == VecIn1)
7978 Mask[i] = Idx;
7979 else if (ExtractedFromVec == VecIn2)
7980 Mask[i] = Idx + NumElems;
7981 }
7982
7983 if (!VecIn1.getNode())
7984 return SDValue();
7985
7986 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7987 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7988
7989 for (unsigned Idx : InsertIndices)
7990 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7991 DAG.getVectorIdxConstant(Idx, DL));
7992
7993 return NV;
7994}
7995
7996// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7997static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7998 const X86Subtarget &Subtarget) {
7999 MVT VT = Op.getSimpleValueType();
8000 MVT IVT =
8001 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
8002 SmallVector<SDValue, 16> NewOps;
8003 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
8004 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
8005 Op.getOperand(I)));
8006 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
8007 return DAG.getBitcast(VT, Res);
8008}
8009
8010// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
8011static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
8012 SelectionDAG &DAG,
8013 const X86Subtarget &Subtarget) {
8014
8015 MVT VT = Op.getSimpleValueType();
8016 assert((VT.getVectorElementType() == MVT::i1) &&
8017 "Unexpected type in LowerBUILD_VECTORvXi1!");
8018 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8019 ISD::isBuildVectorAllOnes(Op.getNode()))
8020 return Op;
8021
8022 uint64_t Immediate = 0;
8023 SmallVector<unsigned, 16> NonConstIdx;
8024 bool IsSplat = true;
8025 bool HasConstElts = false;
8026 int SplatIdx = -1;
8027 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8028 SDValue In = Op.getOperand(idx);
8029 if (In.isUndef())
8030 continue;
8031 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
8032 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8033 HasConstElts = true;
8034 } else {
8035 NonConstIdx.push_back(idx);
8036 }
8037 if (SplatIdx < 0)
8038 SplatIdx = idx;
8039 else if (In != Op.getOperand(SplatIdx))
8040 IsSplat = false;
8041 }
8042
8043 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
8044 if (IsSplat) {
8045 // The build_vector allows the scalar element to be larger than the vector
8046 // element type. We need to mask it to use as a condition unless we know
8047 // the upper bits are zero.
8048 // FIXME: Use computeKnownBits instead of checking specific opcode?
8049 SDValue Cond = Op.getOperand(SplatIdx);
8050 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8051 if (Cond.getOpcode() != ISD::SETCC)
8052 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8053 DAG.getConstant(1, dl, MVT::i8));
8054
8055 // Perform the select in the scalar domain so we can use cmov.
8056 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8057 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8058 DAG.getAllOnesConstant(dl, MVT::i32),
8059 DAG.getConstant(0, dl, MVT::i32));
8060 Select = DAG.getBitcast(MVT::v32i1, Select);
8061 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8062 } else {
8063 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8064 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8065 DAG.getAllOnesConstant(dl, ImmVT),
8066 DAG.getConstant(0, dl, ImmVT));
8067 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8068 Select = DAG.getBitcast(VecVT, Select);
8069 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8070 DAG.getVectorIdxConstant(0, dl));
8071 }
8072 }
8073
8074 // insert elements one by one
8075 SDValue DstVec;
8076 if (HasConstElts) {
8077 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8078 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8079 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8080 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8081 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8082 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8083 } else {
8084 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8085 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8086 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8087 DstVec = DAG.getBitcast(VecVT, Imm);
8088 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8089 DAG.getVectorIdxConstant(0, dl));
8090 }
8091 } else
8092 DstVec = DAG.getUNDEF(VT);
8093
8094 for (unsigned InsertIdx : NonConstIdx) {
8095 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8096 Op.getOperand(InsertIdx),
8097 DAG.getVectorIdxConstant(InsertIdx, dl));
8098 }
8099 return DstVec;
8100}
8101
8102LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
8103 switch (Opcode) {
8104 case X86ISD::PACKSS:
8105 case X86ISD::PACKUS:
8106 case X86ISD::FHADD:
8107 case X86ISD::FHSUB:
8108 case X86ISD::HADD:
8109 case X86ISD::HSUB:
8110 return true;
8111 }
8112 return false;
8113}
8114
8115/// This is a helper function of LowerToHorizontalOp().
8116/// This function checks that the build_vector \p N in input implements a
8117/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8118/// may not match the layout of an x86 256-bit horizontal instruction.
8119/// In other words, if this returns true, then some extraction/insertion will
8120/// be required to produce a valid horizontal instruction.
8121///
8122/// Parameter \p Opcode defines the kind of horizontal operation to match.
8123/// For example, if \p Opcode is equal to ISD::ADD, then this function
8124/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8125/// is equal to ISD::SUB, then this function checks if this is a horizontal
8126/// arithmetic sub.
8127///
8128/// This function only analyzes elements of \p N whose indices are
8129/// in range [BaseIdx, LastIdx).
8130///
8131/// TODO: This function was originally used to match both real and fake partial
8132/// horizontal operations, but the index-matching logic is incorrect for that.
8133/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8134/// code because it is only used for partial h-op matching now?
8135static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8136 const SDLoc &DL, SelectionDAG &DAG,
8137 unsigned BaseIdx, unsigned LastIdx,
8138 SDValue &V0, SDValue &V1) {
8139 EVT VT = N->getValueType(0);
8140 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8141 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8142 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8143 "Invalid Vector in input!");
8144
8145 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8146 bool CanFold = true;
8147 unsigned ExpectedVExtractIdx = BaseIdx;
8148 unsigned NumElts = LastIdx - BaseIdx;
8149 V0 = DAG.getUNDEF(VT);
8150 V1 = DAG.getUNDEF(VT);
8151
8152 // Check if N implements a horizontal binop.
8153 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8154 SDValue Op = N->getOperand(i + BaseIdx);
8155
8156 // Skip UNDEFs.
8157 if (Op->isUndef()) {
8158 // Update the expected vector extract index.
8159 if (i * 2 == NumElts)
8160 ExpectedVExtractIdx = BaseIdx;
8161 ExpectedVExtractIdx += 2;
8162 continue;
8163 }
8164
8165 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8166
8167 if (!CanFold)
8168 break;
8169
8170 SDValue Op0 = Op.getOperand(0);
8171 SDValue Op1 = Op.getOperand(1);
8172
8173 // Try to match the following pattern:
8174 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8175 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8176 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8177 Op0.getOperand(0) == Op1.getOperand(0) &&
8178 isa<ConstantSDNode>(Op0.getOperand(1)) &&
8179 isa<ConstantSDNode>(Op1.getOperand(1)));
8180 if (!CanFold)
8181 break;
8182
8183 unsigned I0 = Op0.getConstantOperandVal(1);
8184 unsigned I1 = Op1.getConstantOperandVal(1);
8185
8186 if (i * 2 < NumElts) {
8187 if (V0.isUndef()) {
8188 V0 = Op0.getOperand(0);
8189 if (V0.getValueType() != VT)
8190 return false;
8191 }
8192 } else {
8193 if (V1.isUndef()) {
8194 V1 = Op0.getOperand(0);
8195 if (V1.getValueType() != VT)
8196 return false;
8197 }
8198 if (i * 2 == NumElts)
8199 ExpectedVExtractIdx = BaseIdx;
8200 }
8201
8202 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8203 if (I0 == ExpectedVExtractIdx)
8204 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8205 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8206 // Try to match the following dag sequence:
8207 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8208 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8209 } else
8210 CanFold = false;
8211
8212 ExpectedVExtractIdx += 2;
8213 }
8214
8215 return CanFold;
8216}
8217
8218/// Emit a sequence of two 128-bit horizontal add/sub followed by
8219/// a concat_vector.
8220///
8221/// This is a helper function of LowerToHorizontalOp().
8222/// This function expects two 256-bit vectors called V0 and V1.
8223/// At first, each vector is split into two separate 128-bit vectors.
8224/// Then, the resulting 128-bit vectors are used to implement two
8225/// horizontal binary operations.
8226///
8227/// The kind of horizontal binary operation is defined by \p X86Opcode.
8228///
8229/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8230/// the two new horizontal binop.
8231/// When Mode is set, the first horizontal binop dag node would take as input
8232/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8233/// horizontal binop dag node would take as input the lower 128-bit of V1
8234/// and the upper 128-bit of V1.
8235/// Example:
8236/// HADD V0_LO, V0_HI
8237/// HADD V1_LO, V1_HI
8238///
8239/// Otherwise, the first horizontal binop dag node takes as input the lower
8240/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8241/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8242/// Example:
8243/// HADD V0_LO, V1_LO
8244/// HADD V0_HI, V1_HI
8245///
8246/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8247/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8248/// the upper 128-bits of the result.
8249static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8250 const SDLoc &DL, SelectionDAG &DAG,
8251 unsigned X86Opcode, bool Mode,
8252 bool isUndefLO, bool isUndefHI) {
8253 MVT VT = V0.getSimpleValueType();
8254 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8255 "Invalid nodes in input!");
8256
8257 unsigned NumElts = VT.getVectorNumElements();
8258 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8259 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8260 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8261 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8262 MVT NewVT = V0_LO.getSimpleValueType();
8263
8264 SDValue LO = DAG.getUNDEF(NewVT);
8265 SDValue HI = DAG.getUNDEF(NewVT);
8266
8267 if (Mode) {
8268 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8269 if (!isUndefLO && !V0->isUndef())
8270 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8271 if (!isUndefHI && !V1->isUndef())
8272 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8273 } else {
8274 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8275 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8276 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8277
8278 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8279 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8280 }
8281
8282 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8283}
8284
8285/// Returns true iff \p BV builds a vector whose result is equivalent to
8286/// the result of an ADDSUB/SUBADD operation.
8287/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8288/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8289/// \p Opnd0 and \p Opnd1.
8291 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8292 SDValue &Opnd0, SDValue &Opnd1,
8293 unsigned &NumExtracts, bool &IsSubAdd,
8294 bool &HasAllowContract) {
8295 using namespace SDPatternMatch;
8296
8297 MVT VT = BV->getSimpleValueType(0);
8298 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8299 return false;
8300
8301 unsigned NumElts = VT.getVectorNumElements();
8302 SDValue InVec0 = DAG.getUNDEF(VT);
8303 SDValue InVec1 = DAG.getUNDEF(VT);
8304
8305 NumExtracts = 0;
8306 HasAllowContract = NumElts != 0;
8307
8308 // Odd-numbered elements in the input build vector are obtained from
8309 // adding/subtracting two integer/float elements.
8310 // Even-numbered elements in the input build vector are obtained from
8311 // subtracting/adding two integer/float elements.
8312 unsigned Opc[2] = {0, 0};
8313 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8314 SDValue Op = BV->getOperand(i);
8315
8316 // Skip 'undef' values.
8317 unsigned Opcode = Op.getOpcode();
8318 if (Opcode == ISD::UNDEF)
8319 continue;
8320
8321 // Early exit if we found an unexpected opcode.
8322 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8323 return false;
8324
8325 SDValue Op0 = Op.getOperand(0);
8326 SDValue Op1 = Op.getOperand(1);
8327
8328 // Try to match the following pattern:
8329 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8330 // Early exit if we cannot match that sequence.
8331 if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
8332 !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
8333 return false;
8334
8335 // We found a valid add/sub node, make sure its the same opcode as previous
8336 // elements for this parity.
8337 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8338 return false;
8339 Opc[i % 2] = Opcode;
8340
8341 // Update InVec0 and InVec1.
8342 if (InVec0.isUndef())
8343 InVec0 = Op0.getOperand(0);
8344 if (InVec1.isUndef())
8345 InVec1 = Op1.getOperand(0);
8346
8347 // Make sure that the operands of each add/sub node always
8348 // come from the same pair of vectors.
8349 if (InVec0 != Op0.getOperand(0)) {
8350 if (Opcode == ISD::FSUB)
8351 return false;
8352
8353 // FADD is commutable. Try to commute the operands
8354 // and then test again.
8355 std::swap(Op0, Op1);
8356 if (InVec0 != Op0.getOperand(0))
8357 return false;
8358 }
8359
8360 if (InVec1 != Op1.getOperand(0))
8361 return false;
8362
8363 // Increment the number of extractions done.
8364 ++NumExtracts;
8365 HasAllowContract &= Op->getFlags().hasAllowContract();
8366 }
8367
8368 // Ensure we have found an opcode for both parities and that they are
8369 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8370 // inputs are undef.
8371 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8372 InVec0.isUndef() || InVec1.isUndef())
8373 return false;
8374
8375 IsSubAdd = Opc[0] == ISD::FADD;
8376
8377 Opnd0 = InVec0;
8378 Opnd1 = InVec1;
8379 return true;
8380}
8381
8382/// Returns true if it is possible to fold MUL and an idiom that has already been
8383/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8384/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8385/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8386///
8387/// Prior to calling this function it should be known that there is some
8388/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8389/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8390/// before replacement of such SDNode with ADDSUB operation. Thus the number
8391/// of \p Opnd0 uses is expected to be equal to 2.
8392/// For example, this function may be called for the following IR:
8393/// %AB = fmul fast <2 x double> %A, %B
8394/// %Sub = fsub fast <2 x double> %AB, %C
8395/// %Add = fadd fast <2 x double> %AB, %C
8396/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8397/// <2 x i32> <i32 0, i32 3>
8398/// There is a def for %Addsub here, which potentially can be replaced by
8399/// X86ISD::ADDSUB operation:
8400/// %Addsub = X86ISD::ADDSUB %AB, %C
8401/// and such ADDSUB can further be replaced with FMADDSUB:
8402/// %Addsub = FMADDSUB %A, %B, %C.
8403///
8404/// The main reason why this method is called before the replacement of the
8405/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8406/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8407/// FMADDSUB is.
8408static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8409 SelectionDAG &DAG, SDValue &Opnd0,
8410 SDValue &Opnd1, SDValue &Opnd2,
8411 unsigned ExpectedUses,
8412 bool AllowSubAddOrAddSubContract) {
8413 if (Opnd0.getOpcode() != ISD::FMUL ||
8414 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8415 return false;
8416
8417 // FIXME: These checks must match the similar ones in
8418 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8419 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8420 // or MUL + ADDSUB to FMADDSUB.
8421 const TargetOptions &Options = DAG.getTarget().Options;
8422 bool AllowFusion =
8423 Options.AllowFPOpFusion == FPOpFusion::Fast ||
8424 (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
8425 if (!AllowFusion)
8426 return false;
8427
8428 Opnd2 = Opnd1;
8429 Opnd1 = Opnd0.getOperand(1);
8430 Opnd0 = Opnd0.getOperand(0);
8431
8432 return true;
8433}
8434
8435/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8436/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
8437/// X86ISD::FMSUBADD node.
8438static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8439                                       const SDLoc &DL,
8440 const X86Subtarget &Subtarget,
8441 SelectionDAG &DAG) {
8442 SDValue Opnd0, Opnd1;
8443 unsigned NumExtracts;
8444 bool IsSubAdd;
8445 bool HasAllowContract;
8446 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8447 HasAllowContract))
8448 return SDValue();
8449
8450 MVT VT = BV->getSimpleValueType(0);
8451
8452 // Try to generate X86ISD::FMADDSUB node here.
8453 SDValue Opnd2;
8454 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
8455 HasAllowContract)) {
8456 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8457 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8458 }
8459
8460 // We only support ADDSUB.
8461 if (IsSubAdd)
8462 return SDValue();
8463
8464 // There are no known X86 targets with 512-bit ADDSUB instructions!
8465 // Convert to blend(fsub,fadd).
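  // E.g. for v8f64 the mask built below is <0, 9, 2, 11, 4, 13, 6, 15>, which
  // takes the even result elements from the FSUB node and the odd result
  // elements from the FADD node, matching ADDSUB semantics.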
8466 if (VT.is512BitVector()) {
8467 SmallVector<int> Mask;
8468 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8469 Mask.push_back(I);
8470 Mask.push_back(I + E + 1);
8471 }
8472 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8473 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8474 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8475 }
8476
8477 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8478}
8479
8480static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8481                             unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8482 // Initialize outputs to known values.
8483 MVT VT = BV->getSimpleValueType(0);
8484 HOpcode = ISD::DELETED_NODE;
8485 V0 = DAG.getUNDEF(VT);
8486 V1 = DAG.getUNDEF(VT);
8487
8488 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8489 // half of the result is calculated independently from the 128-bit halves of
8490 // the inputs, so that makes the index-checking logic below more complicated.
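  // E.g. for a v8i32 HADD with source vectors A and B the expected pattern is:
  //   result = < A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3],
  //              A[4]+A[5], A[6]+A[7], B[4]+B[5], B[6]+B[7] >
  // i.e. the low 64 bits of each 128-bit result chunk come from V0 and the
  // high 64 bits come from V1, which is what the index checks below verify.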
8491 unsigned NumElts = VT.getVectorNumElements();
8492 unsigned GenericOpcode = ISD::DELETED_NODE;
8493 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8494 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8495 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8496 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8497 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8498 // Ignore undef elements.
8499 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8500 if (Op.isUndef())
8501 continue;
8502
8503 // If there's an opcode mismatch, we're done.
8504 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8505 return false;
8506
8507 // Initialize horizontal opcode.
8508 if (HOpcode == ISD::DELETED_NODE) {
8509 GenericOpcode = Op.getOpcode();
8510 switch (GenericOpcode) {
8511 // clang-format off
8512 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8513 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8514 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8515 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8516 default: return false;
8517 // clang-format on
8518 }
8519 }
8520
8521 SDValue Op0 = Op.getOperand(0);
8522 SDValue Op1 = Op.getOperand(1);
8523 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8524          Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8525          Op0.getOperand(0) != Op1.getOperand(0) ||
8526          !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8527          !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8528 return false;
8529
8530 // The source vector is chosen based on which 64-bit half of the
8531 // destination vector is being calculated.
8532 if (j < NumEltsIn64Bits) {
8533 if (V0.isUndef())
8534 V0 = Op0.getOperand(0);
8535 } else {
8536 if (V1.isUndef())
8537 V1 = Op0.getOperand(0);
8538 }
8539
8540 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8541 if (SourceVec != Op0.getOperand(0))
8542 return false;
8543
8544 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8545 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8546 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8547 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8548 (j % NumEltsIn64Bits) * 2;
8549 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8550 continue;
8551
8552 // If this is not a commutative op, this does not match.
8553 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8554 return false;
8555
8556 // Addition is commutative, so try swapping the extract indexes.
8557 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8558 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8559 continue;
8560
8561 // Extract indexes do not match horizontal requirement.
8562 return false;
8563 }
8564 }
8565 // We matched. Opcode and operands are returned by reference as arguments.
8566 return true;
8567}
8568
8569static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8570                                    const SDLoc &DL, SelectionDAG &DAG,
8571 unsigned HOpcode, SDValue V0, SDValue V1) {
8572 // If either input vector is not the same size as the build vector,
8573 // extract/insert the low bits to the correct size.
8574 // This is free (examples: zmm --> xmm, xmm --> ymm).
8575 MVT VT = BV->getSimpleValueType(0);
8576 unsigned Width = VT.getSizeInBits();
8577 if (V0.getValueSizeInBits() > Width)
8578 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8579 else if (V0.getValueSizeInBits() < Width)
8580 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8581
8582 if (V1.getValueSizeInBits() > Width)
8583 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8584 else if (V1.getValueSizeInBits() < Width)
8585 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8586
8587 unsigned NumElts = VT.getVectorNumElements();
8588 APInt DemandedElts = APInt::getAllOnes(NumElts);
8589 for (unsigned i = 0; i != NumElts; ++i)
8590 if (BV->getOperand(i).isUndef())
8591 DemandedElts.clearBit(i);
8592
8593 // If we don't need the upper xmm, then perform as a xmm hop.
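  // E.g. a v8f32 build vector whose upper four elements are all undef can be
  // performed as a v4f32 hop on the low 128 bits of V0/V1 and then widened
  // back to 256 bits with an undef upper half.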
8594 unsigned HalfNumElts = NumElts / 2;
8595 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8596 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8597 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8598 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8599 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8600 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8601 }
8602
8603 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8604}
8605
8606/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
8607static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8608                                   const X86Subtarget &Subtarget,
8609 SelectionDAG &DAG) {
8610 // We need at least 2 non-undef elements to make this worthwhile by default.
8611 unsigned NumNonUndefs =
8612 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8613 if (NumNonUndefs < 2)
8614 return SDValue();
8615
8616 // There are 4 sets of horizontal math operations distinguished by type:
8617 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8618 // subtarget feature. Try to match those "native" patterns first.
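  // Roughly: SSE3 added haddps/haddpd (128-bit FP), SSSE3 added phaddw/phaddd
  // (128-bit integer), AVX added the 256-bit FP forms, and AVX2 added the
  // 256-bit integer forms - which is what the feature checks below encode.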
8619 MVT VT = BV->getSimpleValueType(0);
8620 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8621 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8622 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8623 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8624 unsigned HOpcode;
8625 SDValue V0, V1;
8626 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8627 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8628 }
8629
8630 // Try harder to match 256-bit ops by using extract/concat.
8631 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8632 return SDValue();
8633
8634  // Count the number of UNDEF operands in the input build_vector.
8635 unsigned NumElts = VT.getVectorNumElements();
8636 unsigned Half = NumElts / 2;
8637 unsigned NumUndefsLO = 0;
8638 unsigned NumUndefsHI = 0;
8639 for (unsigned i = 0, e = Half; i != e; ++i)
8640 if (BV->getOperand(i)->isUndef())
8641 NumUndefsLO++;
8642
8643 for (unsigned i = Half, e = NumElts; i != e; ++i)
8644 if (BV->getOperand(i)->isUndef())
8645 NumUndefsHI++;
8646
8647 SDValue InVec0, InVec1;
8648 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8649 SDValue InVec2, InVec3;
8650 unsigned X86Opcode;
8651 bool CanFold = true;
8652
8653 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8654 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8655 InVec3) &&
8656 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8657 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8658 X86Opcode = X86ISD::HADD;
8659 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8660 InVec1) &&
8661 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8662 InVec3) &&
8663 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8664 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8665 X86Opcode = X86ISD::HSUB;
8666 else
8667 CanFold = false;
8668
8669 if (CanFold) {
8670 // Do not try to expand this build_vector into a pair of horizontal
8671 // add/sub if we can emit a pair of scalar add/sub.
8672 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8673 return SDValue();
8674
8675 // Convert this build_vector into a pair of horizontal binops followed by
8676 // a concat vector. We must adjust the outputs from the partial horizontal
8677 // matching calls above to account for undefined vector halves.
8678 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8679 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8680 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8681 bool isUndefLO = NumUndefsLO == Half;
8682 bool isUndefHI = NumUndefsHI == Half;
8683 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8684 isUndefHI);
8685 }
8686 }
8687
8688 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8689 VT == MVT::v16i16) {
8690 unsigned X86Opcode;
8691 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8692 InVec1))
8693 X86Opcode = X86ISD::HADD;
8694 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8695 InVec1))
8696 X86Opcode = X86ISD::HSUB;
8697 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8698 InVec1))
8699 X86Opcode = X86ISD::FHADD;
8700 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8701 InVec1))
8702 X86Opcode = X86ISD::FHSUB;
8703 else
8704 return SDValue();
8705
8706 // Don't try to expand this build_vector into a pair of horizontal add/sub
8707 // if we can simply emit a pair of scalar add/sub.
8708 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8709 return SDValue();
8710
8711 // Convert this build_vector into two horizontal add/sub followed by
8712 // a concat vector.
8713 bool isUndefLO = NumUndefsLO == Half;
8714 bool isUndefHI = NumUndefsHI == Half;
8715 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8716 isUndefLO, isUndefHI);
8717 }
8718
8719 return SDValue();
8720}
8721
8722static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8723 SelectionDAG &DAG);
8724
8725/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8726/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8727/// just apply the bit operation to the vectors.
8728/// NOTE: It's not in our interest to make a general purpose vectorizer
8729/// from this, but enough scalar bit operations are created from the later
8730/// legalization + scalarization stages to need basic support.
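/// For example (shift case), a build vector of the form:
///   (build_vector (srl X0, 5), (srl X1, 5), (srl X2, 5), (srl X3, 5))
/// becomes (srl (build_vector X0, X1, X2, X3), (build_vector 5, 5, 5, 5)).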
8731static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8732                                       const X86Subtarget &Subtarget,
8733 SelectionDAG &DAG) {
8734 MVT VT = Op->getSimpleValueType(0);
8735 unsigned NumElems = VT.getVectorNumElements();
8736 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8737
8738 // Check that all elements have the same opcode.
8739 // TODO: Should we allow UNDEFS and if so how many?
8740 unsigned Opcode = Op->getOperand(0).getOpcode();
8741 for (unsigned i = 1; i < NumElems; ++i)
8742 if (Opcode != Op->getOperand(i).getOpcode())
8743 return SDValue();
8744
8745 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8746 bool IsShift = false;
8747 switch (Opcode) {
8748 default:
8749 return SDValue();
8750 case ISD::SHL:
8751 case ISD::SRL:
8752 case ISD::SRA:
8753 IsShift = true;
8754 break;
8755 case ISD::AND:
8756 case ISD::XOR:
8757 case ISD::OR:
8758 // Don't do this if the buildvector is a splat - we'd replace one
8759 // constant with an entire vector.
8760 if (Op->getSplatValue())
8761 return SDValue();
8762 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8763 return SDValue();
8764 break;
8765 }
8766
8767 SmallVector<SDValue, 4> LHSElts, RHSElts;
8768 for (SDValue Elt : Op->ops()) {
8769 SDValue LHS = Elt.getOperand(0);
8770 SDValue RHS = Elt.getOperand(1);
8771
8772 // We expect the canonicalized RHS operand to be the constant.
8773    if (!isa<ConstantSDNode>(RHS))
8774      return SDValue();
8775
8776 // Extend shift amounts.
8777 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8778 if (!IsShift)
8779 return SDValue();
8780 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8781 }
8782
8783 LHSElts.push_back(LHS);
8784 RHSElts.push_back(RHS);
8785 }
8786
8787 // Limit to shifts by uniform immediates.
8788 // TODO: Only accept vXi8/vXi64 special cases?
8789 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8790 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8791 return SDValue();
8792
8793 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8794 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8795 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8796
8797 if (!IsShift)
8798 return Res;
8799
8800 // Immediately lower the shift to ensure the constant build vector doesn't
8801 // get converted to a constant pool before the shift is lowered.
8802 return LowerShift(Res, Subtarget, DAG);
8803}
8804
8805static bool isShuffleFoldableLoad(SDValue);
8806
8807/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
8808/// representing a blend.
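/// For example, a v4f64 (build_vector A, B, A, B) can be rewritten as
///   (vector_shuffle (splat_build_vector A), (splat_build_vector B),
///                   <0, 5, 2, 7>)
/// which can then match a blend of two splats/broadcasts.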
8809static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
8810                                       X86Subtarget const &Subtarget,
8811 SelectionDAG &DAG) {
8812 MVT VT = BVOp->getSimpleValueType(0u);
8813
8814 if (VT != MVT::v4f64)
8815 return SDValue();
8816
8817 // Collect unique operands.
8818 auto UniqueOps = SmallSet<SDValue, 16u>();
8819 for (SDValue Op : BVOp->ops()) {
8820 if (isIntOrFPConstant(Op) || Op.isUndef())
8821 return SDValue();
8822 UniqueOps.insert(Op);
8823 }
8824
8825 // Candidate BUILD_VECTOR must have 2 unique operands.
8826 if (UniqueOps.size() != 2u)
8827 return SDValue();
8828
8829 SDValue Op0 = BVOp->getOperand(0u);
8830 UniqueOps.erase(Op0);
8831 SDValue Op1 = *UniqueOps.begin();
8832
8833 if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
8834 isShuffleFoldableLoad(Op1)) {
8835 // Create shuffle mask.
8836 auto const NumElems = VT.getVectorNumElements();
8837 SmallVector<int, 16u> Mask(NumElems);
8838 for (auto I = 0u; I < NumElems; ++I) {
8839 SDValue Op = BVOp->getOperand(I);
8840 Mask[I] = Op == Op0 ? I : I + NumElems;
8841 }
8842 // Create shuffle of splats.
8843 SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
8844 SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
8845 return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
8846 }
8847
8848 return SDValue();
8849}
8850
8851/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8852/// functionality to do this, so it's all zeros, all ones, or some derivation
8853/// that is cheap to calculate.
8854static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8855                                         SelectionDAG &DAG,
8856 const X86Subtarget &Subtarget) {
8857 MVT VT = Op.getSimpleValueType();
8858
8859 // Vectors containing all zeros can be matched by pxor and xorps.
8860 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8861 return Op;
8862
8863 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8864 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8865 // vpcmpeqd on 256-bit vectors.
8866 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8867 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8868 return Op;
8869
8870 return getOnesVector(VT, DAG, DL);
8871 }
8872
8873 return SDValue();
8874}
8875
8876/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8877/// from a vector of source values and a vector of extraction indices.
8878/// The vectors might be manipulated to match the type of the permute op.
8879static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8880 const SDLoc &DL, SelectionDAG &DAG,
8881 const X86Subtarget &Subtarget) {
8882 MVT ShuffleVT = VT;
8883 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8884 unsigned NumElts = VT.getVectorNumElements();
8885 unsigned SizeInBits = VT.getSizeInBits();
8886
8887 // Adjust IndicesVec to match VT size.
8888 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8889 "Illegal variable permute mask size");
8890 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8891 // Narrow/widen the indices vector to the correct size.
8892 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8893 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8894 NumElts * VT.getScalarSizeInBits());
8895 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8896 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8897 SDLoc(IndicesVec), SizeInBits);
8898 // Zero-extend the index elements within the vector.
8899 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8900 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8901 IndicesVT, IndicesVec);
8902 }
8903 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8904
8905  // Handle a SrcVec that doesn't match the VT type.
8906 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8907 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8908 // Handle larger SrcVec by treating it as a larger permute.
8909 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8910 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8911 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8912 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8913 Subtarget, DAG, SDLoc(IndicesVec));
8914 SDValue NewSrcVec =
8915 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8916 if (NewSrcVec)
8917 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8918 return SDValue();
8919 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8920 // Widen smaller SrcVec to match VT.
8921 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8922 } else
8923 return SDValue();
8924 }
8925
8926 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8927 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8928 EVT SrcVT = Idx.getValueType();
8929 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8930 uint64_t IndexScale = 0;
8931 uint64_t IndexOffset = 0;
8932
8933 // If we're scaling a smaller permute op, then we need to repeat the
8934 // indices, scaling and offsetting them as well.
8935 // e.g. v4i32 -> v16i8 (Scale = 4)
8936 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8937 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8938 for (uint64_t i = 0; i != Scale; ++i) {
8939 IndexScale |= Scale << (i * NumDstBits);
8940 IndexOffset |= i << (i * NumDstBits);
8941 }
8942
8943 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8944 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8945 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8946 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8947 return Idx;
8948 };
8949
8950 unsigned Opcode = 0;
8951 switch (VT.SimpleTy) {
8952 default:
8953 break;
8954 case MVT::v16i8:
8955 if (Subtarget.hasSSSE3())
8956 Opcode = X86ISD::PSHUFB;
8957 break;
8958 case MVT::v8i16:
8959 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8960 Opcode = X86ISD::VPERMV;
8961 else if (Subtarget.hasSSSE3()) {
8962 Opcode = X86ISD::PSHUFB;
8963 ShuffleVT = MVT::v16i8;
8964 }
8965 break;
8966 case MVT::v4f32:
8967 case MVT::v4i32:
8968 if (Subtarget.hasAVX()) {
8969 Opcode = X86ISD::VPERMILPV;
8970 ShuffleVT = MVT::v4f32;
8971 } else if (Subtarget.hasSSSE3()) {
8972 Opcode = X86ISD::PSHUFB;
8973 ShuffleVT = MVT::v16i8;
8974 }
8975 break;
8976 case MVT::v2f64:
8977 case MVT::v2i64:
8978 if (Subtarget.hasAVX()) {
8979 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8980 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8981 Opcode = X86ISD::VPERMILPV;
8982 ShuffleVT = MVT::v2f64;
8983 } else if (Subtarget.hasSSE41()) {
8984 // SSE41 can compare v2i64 - select between indices 0 and 1.
8985 return DAG.getSelectCC(
8986 DL, IndicesVec,
8987 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8988 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8989 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8990          ISD::CondCode::SETEQ);
8991    }
8992 break;
8993 case MVT::v32i8:
8994 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8995 Opcode = X86ISD::VPERMV;
8996 else if (Subtarget.hasXOP()) {
8997 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8998 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8999 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
9000 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
9001 return DAG.getNode(
9002          ISD::CONCAT_VECTORS, DL, MVT::v32i8,
9003          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
9004 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
9005 } else if (Subtarget.hasAVX()) {
9006 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
9007 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
9008 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9009 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9010 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
9011                              ArrayRef<SDValue> Ops) {
9012        // Permute Lo and Hi and then select based on index range.
9013        // This works as PSHUFB uses bits[3:0] to permute elements and we don't
9014        // care about bit[7] as it's just an index vector.
9015 SDValue Idx = Ops[2];
9016 EVT VT = Idx.getValueType();
9017 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9018 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9019 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9020                               ISD::CondCode::SETGT);
9021      };
9022 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9023 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9024 PSHUFBBuilder);
9025 }
9026 break;
9027 case MVT::v16i16:
9028 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9029 Opcode = X86ISD::VPERMV;
9030 else if (Subtarget.hasAVX()) {
9031 // Scale to v32i8 and perform as v32i8.
9032 IndicesVec = ScaleIndices(IndicesVec, 2);
9033 return DAG.getBitcast(
9034          VT, createVariablePermute(
9035                  MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9036 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9037 }
9038 break;
9039 case MVT::v8f32:
9040 case MVT::v8i32:
9041 if (Subtarget.hasAVX2())
9042 Opcode = X86ISD::VPERMV;
9043 else if (Subtarget.hasAVX()) {
9044 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9045 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9046 {0, 1, 2, 3, 0, 1, 2, 3});
9047 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9048 {4, 5, 6, 7, 4, 5, 6, 7});
9049 if (Subtarget.hasXOP())
9050 return DAG.getBitcast(
9051 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9052 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9053 // Permute Lo and Hi and then select based on index range.
9054 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9055 SDValue Res = DAG.getSelectCC(
9056 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9057 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9058 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9059          ISD::CondCode::SETGT);
9060      return DAG.getBitcast(VT, Res);
9061 }
9062 break;
9063 case MVT::v4i64:
9064 case MVT::v4f64:
9065 if (Subtarget.hasAVX512()) {
9066 if (!Subtarget.hasVLX()) {
9067 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9068 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9069 SDLoc(SrcVec));
9070 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9071 DAG, SDLoc(IndicesVec));
9072 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9073 DAG, Subtarget);
9074 return extract256BitVector(Res, 0, DAG, DL);
9075 }
9076 Opcode = X86ISD::VPERMV;
9077 } else if (Subtarget.hasAVX()) {
9078 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9079 SDValue LoLo =
9080 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9081 SDValue HiHi =
9082 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9083 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9084 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9085 if (Subtarget.hasXOP())
9086 return DAG.getBitcast(
9087 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9088 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9089 // Permute Lo and Hi and then select based on index range.
9090 // This works as VPERMILPD only uses index bit[1] to permute elements.
9091 SDValue Res = DAG.getSelectCC(
9092 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9093 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9094 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9095          ISD::CondCode::SETGT);
9096      return DAG.getBitcast(VT, Res);
9097 }
9098 break;
9099 case MVT::v64i8:
9100 if (Subtarget.hasVBMI())
9101 Opcode = X86ISD::VPERMV;
9102 break;
9103 case MVT::v32i16:
9104 if (Subtarget.hasBWI())
9105 Opcode = X86ISD::VPERMV;
9106 break;
9107 case MVT::v16f32:
9108 case MVT::v16i32:
9109 case MVT::v8f64:
9110 case MVT::v8i64:
9111 if (Subtarget.hasAVX512())
9112 Opcode = X86ISD::VPERMV;
9113 break;
9114 }
9115 if (!Opcode)
9116 return SDValue();
9117
9118 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9119 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9120 "Illegal variable permute shuffle type");
9121
9122 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9123 if (Scale > 1)
9124 IndicesVec = ScaleIndices(IndicesVec, Scale);
9125
9126 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9127 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9128
9129 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9130 SDValue Res = Opcode == X86ISD::VPERMV
9131 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9132 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9133 return DAG.getBitcast(VT, Res);
9134}
9135
9136// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9137// reasoned to be a permutation of a vector by indices in a non-constant vector.
9138// (build_vector (extract_elt V, (extract_elt I, 0)),
9139// (extract_elt V, (extract_elt I, 1)),
9140// ...
9141// ->
9142// (vpermv I, V)
9143//
9144// TODO: Handle undefs
9145// TODO: Utilize pshufb and zero mask blending to support more efficient
9146// construction of vectors with constant-0 elements.
9147static SDValue
9148LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
9149                                   SelectionDAG &DAG,
9150 const X86Subtarget &Subtarget) {
9151 SDValue SrcVec, IndicesVec;
9152
9153 auto PeekThroughFreeze = [](SDValue N) {
9154 if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
9155 return N->getOperand(0);
9156 return N;
9157 };
9158 // Check for a match of the permute source vector and permute index elements.
9159 // This is done by checking that the i-th build_vector operand is of the form:
9160 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9161 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9162 SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
9163 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9164 return SDValue();
9165
9166 // If this is the first extract encountered in V, set the source vector,
9167 // otherwise verify the extract is from the previously defined source
9168 // vector.
9169 if (!SrcVec)
9170 SrcVec = Op.getOperand(0);
9171 else if (SrcVec != Op.getOperand(0))
9172 return SDValue();
9173 SDValue ExtractedIndex = Op->getOperand(1);
9174 // Peek through extends.
9175 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9176 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9177 ExtractedIndex = ExtractedIndex.getOperand(0);
9178 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9179 return SDValue();
9180
9181 // If this is the first extract from the index vector candidate, set the
9182 // indices vector, otherwise verify the extract is from the previously
9183 // defined indices vector.
9184 if (!IndicesVec)
9185 IndicesVec = ExtractedIndex.getOperand(0);
9186 else if (IndicesVec != ExtractedIndex.getOperand(0))
9187 return SDValue();
9188
9189 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9190 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9191 return SDValue();
9192 }
9193
9194 MVT VT = V.getSimpleValueType();
9195 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9196}
9197
9198SDValue
9199X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9200 SDLoc dl(Op);
9201
9202 MVT VT = Op.getSimpleValueType();
9203 MVT EltVT = VT.getVectorElementType();
9204 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9205 unsigned NumElems = Op.getNumOperands();
9206
9207 // Generate vectors for predicate vectors.
9208 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9209 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9210
9211 if (VT.getVectorElementType() == MVT::bf16 &&
9212 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9213 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9214
9215 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9216 return VectorCst;
9217
9218 unsigned EVTBits = EltVT.getSizeInBits();
9219 APInt UndefMask = APInt::getZero(NumElems);
9220 APInt FrozenUndefMask = APInt::getZero(NumElems);
9221 APInt ZeroMask = APInt::getZero(NumElems);
9222 APInt NonZeroMask = APInt::getZero(NumElems);
9223 bool IsAllConstants = true;
9224 bool OneUseFrozenUndefs = true;
9225 SmallSet<SDValue, 8> Values;
9226 unsigned NumConstants = NumElems;
9227 for (unsigned i = 0; i < NumElems; ++i) {
9228 SDValue Elt = Op.getOperand(i);
9229 if (Elt.isUndef()) {
9230 UndefMask.setBit(i);
9231 continue;
9232 }
9233 if (ISD::isFreezeUndef(Elt.getNode())) {
9234 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9235 FrozenUndefMask.setBit(i);
9236 continue;
9237 }
9238 Values.insert(Elt);
9239 if (!isIntOrFPConstant(Elt)) {
9240 IsAllConstants = false;
9241 NumConstants--;
9242 }
9243 if (X86::isZeroNode(Elt)) {
9244 ZeroMask.setBit(i);
9245 } else {
9246 NonZeroMask.setBit(i);
9247 }
9248 }
9249
9250 // All undef vector. Return an UNDEF.
9251 if (UndefMask.isAllOnes())
9252 return DAG.getUNDEF(VT);
9253
9254 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9255 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9256 return DAG.getFreeze(DAG.getUNDEF(VT));
9257
9258 // All undef/freeze(undef)/zero vector. Return a zero vector.
9259 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9260 return getZeroVector(VT, Subtarget, DAG, dl);
9261
9262 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9263 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9264 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9265 // and blend the FREEZE-UNDEF operands back in.
9266 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
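  // E.g. (build_vector X, freeze(undef), Y, freeze(undef)) is rebuilt as a
  // shuffle of (build_vector X, undef, Y, undef) with a freeze(undef) splat,
  // using blend mask <0, 5, 2, 7>.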
9267 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9268 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9269 SmallVector<int, 16> BlendMask(NumElems, -1);
9270 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9271 for (unsigned i = 0; i < NumElems; ++i) {
9272 if (UndefMask[i]) {
9273 BlendMask[i] = -1;
9274 continue;
9275 }
9276 BlendMask[i] = i;
9277 if (!FrozenUndefMask[i])
9278 Elts[i] = Op.getOperand(i);
9279 else
9280 BlendMask[i] += NumElems;
9281 }
9282 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9283 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9284 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9285 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9286 }
9287
9288 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9289
9290 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9291 // be better off lowering to a smaller build vector and padding with
9292 // undef/zero.
9293 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9294      !isFoldableUseOfShuffle(BV)) {
9295    unsigned UpperElems = NumElems / 2;
9296 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9297 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9298 if (NumUpperUndefsOrZeros >= UpperElems) {
9299 if (VT.is512BitVector() &&
9300 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9301 UpperElems = NumElems - (NumElems / 4);
9302 // If freeze(undef) is in any upper elements, force to zero.
9303 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9304 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9305 SDValue NewBV =
9306 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9307 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9308 }
9309 }
9310
9311 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9312 return AddSub;
9313 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9314 return HorizontalOp;
9315 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9316 return Broadcast;
9317 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9318 return BitOp;
9319 if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
9320 return Blend;
9321
9322 unsigned NumZero = ZeroMask.popcount();
9323 unsigned NumNonZero = NonZeroMask.popcount();
9324
9325 // If we are inserting one variable into a vector of non-zero constants, try
9326 // to avoid loading each constant element as a scalar. Load the constants as a
9327 // vector and then insert the variable scalar element. If insertion is not
9328 // supported, fall back to a shuffle to get the scalar blended with the
9329 // constants. Insertion into a zero vector is handled as a special-case
9330 // somewhere below here.
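  // E.g. (build_vector 1.0, 2.0, X, 4.0) becomes a constant-pool load of
  // <1.0, 2.0, undef, 4.0> followed by an insertelement of X at index 2
  // (or a shuffle when inserting into the high half of a wide vector).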
9331 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9332 FrozenUndefMask.isZero() &&
9333      (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9334       isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9335    // Create an all-constant vector. The variable element in the old
9336 // build vector is replaced by undef in the constant vector. Save the
9337 // variable scalar element and its index for use in the insertelement.
9338 LLVMContext &Context = *DAG.getContext();
9339 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9340 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9341 SDValue VarElt;
9342 SDValue InsIndex;
9343 for (unsigned i = 0; i != NumElems; ++i) {
9344 SDValue Elt = Op.getOperand(i);
9345 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9346 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9347 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9348 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9349 else if (!Elt.isUndef()) {
9350 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9351 "Expected one variable element in this vector");
9352 VarElt = Elt;
9353 InsIndex = DAG.getVectorIdxConstant(i, dl);
9354 }
9355 }
9356 Constant *CV = ConstantVector::get(ConstVecOps);
9357 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9358
9359 // The constants we just created may not be legal (eg, floating point). We
9360 // must lower the vector right here because we can not guarantee that we'll
9361 // legalize it before loading it. This is also why we could not just create
9362 // a new build vector here. If the build vector contains illegal constants,
9363 // it could get split back up into a series of insert elements.
9364 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9365 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9366 MachineFunction &MF = DAG.getMachineFunction();
9367 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
9368 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9369 unsigned InsertC = InsIndex->getAsZExtVal();
9370 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9371 if (InsertC < NumEltsInLow128Bits)
9372 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9373
9374 // There's no good way to insert into the high elements of a >128-bit
9375 // vector, so use shuffles to avoid an extract/insert sequence.
9376 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9377 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9378 SmallVector<int, 8> ShuffleMask;
9379 unsigned NumElts = VT.getVectorNumElements();
9380 for (unsigned i = 0; i != NumElts; ++i)
9381 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9382 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9383 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9384 }
9385
9386 // Special case for single non-zero, non-undef, element.
9387 if (NumNonZero == 1) {
9388 unsigned Idx = NonZeroMask.countr_zero();
9389 SDValue Item = Op.getOperand(Idx);
9390
9391 // If we have a constant or non-constant insertion into the low element of
9392 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9393 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9394 // depending on what the source datatype is.
9395 if (Idx == 0) {
9396 if (NumZero == 0)
9397 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9398
9399 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9400 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9401 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9402 assert((VT.is128BitVector() || VT.is256BitVector() ||
9403 VT.is512BitVector()) &&
9404 "Expected an SSE value type!");
9405 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9406 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9407 // zero vector.
9408 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9409 }
9410
9411 // We can't directly insert an i8 or i16 into a vector, so zero extend
9412 // it to i32 first.
9413 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9414 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9415 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9416 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9417 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9418 return DAG.getBitcast(VT, Item);
9419 }
9420 }
9421
9422 // Is it a vector logical left shift?
9423 if (NumElems == 2 && Idx == 1 &&
9424 X86::isZeroNode(Op.getOperand(0)) &&
9425 !X86::isZeroNode(Op.getOperand(1))) {
9426 unsigned NumBits = VT.getSizeInBits();
9427 return getVShift(true, VT,
9428                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9429                                   VT, Op.getOperand(1)),
9430 NumBits/2, DAG, *this, dl);
9431 }
9432
9433 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9434 return SDValue();
9435
9436 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9437 // is a non-constant being inserted into an element other than the low one,
9438 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9439 // movd/movss) to move this into the low element, then shuffle it into
9440 // place.
9441 if (EVTBits == 32) {
9442 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9443 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9444 }
9445 }
9446
9447 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9448 if (Values.size() == 1) {
9449 if (EVTBits == 32) {
9450 // Instead of a shuffle like this:
9451 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9452 // Check if it's possible to issue this instead.
9453 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
9454 unsigned Idx = NonZeroMask.countr_zero();
9455 SDValue Item = Op.getOperand(Idx);
9456 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9457 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9458 }
9459 return SDValue();
9460 }
9461
9462 // A vector full of immediates; various special cases are already
9463 // handled, so this is best done with a single constant-pool load.
9464 if (IsAllConstants)
9465 return SDValue();
9466
9467 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9468 return V;
9469
9470 // See if we can use a vector load to get all of the elements.
9471 {
9472 SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
9473 if (SDValue LD =
9474 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9475 return LD;
9476 }
9477
9478 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9479 // build_vector and broadcast it.
9480 // TODO: We could probably generalize this more.
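  // E.g. v8f32 (build_vector A, B, A, B, A, B, A, B): broadcast the first f64
  // element (the <A, B> pair) of (bitcast v2f64 (build_vector A, B, undef,
  // undef)) to all lanes of a v4f64, then bitcast back to v8f32.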
9481 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9482 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9483 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9484 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9485 // Make sure all the even/odd operands match.
9486 for (unsigned i = 2; i != NumElems; ++i)
9487 if (Ops[i % 2] != Op.getOperand(i))
9488 return false;
9489 return true;
9490 };
9491 if (CanSplat(Op, NumElems, Ops)) {
9492 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9493 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9494 // Create a new build vector and cast to v2i64/v2f64.
9495 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9496 DAG.getBuildVector(NarrowVT, dl, Ops));
9497 // Broadcast from v2i64/v2f64 and cast to final VT.
9498 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9499 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9500 NewBV));
9501 }
9502 }
9503
9504 // For AVX-length vectors, build the individual 128-bit pieces and use
9505 // shuffles to put them in place.
9506 if (VT.getSizeInBits() > 128) {
9507 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9508
9509 // Build both the lower and upper subvector.
9510 SDValue Lower =
9511 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9512    SDValue Upper = DAG.getBuildVector(
9513        HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
9514
9515 // Recreate the wider vector with the lower and upper part.
9516 return concatSubVectors(Lower, Upper, DAG, dl);
9517 }
9518
9519 // Let legalizer expand 2-wide build_vectors.
9520 if (EVTBits == 64) {
9521 if (NumNonZero == 1) {
9522 // One half is zero or undef.
9523 unsigned Idx = NonZeroMask.countr_zero();
9524 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9525 Op.getOperand(Idx));
9526 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9527 }
9528 return SDValue();
9529 }
9530
9531 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9532 if (EVTBits == 8 && NumElems == 16)
9533 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9534 NumZero, DAG, Subtarget))
9535 return V;
9536
9537 if (EltVT == MVT::i16 && NumElems == 8)
9538 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9539 NumZero, DAG, Subtarget))
9540 return V;
9541
9542 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9543 if (EVTBits == 32 && NumElems == 4)
9544 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9545 return V;
9546
9547 // If element VT is == 32 bits, turn it into a number of shuffles.
9548 if (NumElems == 4 && NumZero > 0) {
9549 SmallVector<SDValue, 8> Ops(NumElems);
9550 for (unsigned i = 0; i < 4; ++i) {
9551 bool isZero = !NonZeroMask[i];
9552 if (isZero)
9553 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9554 else
9555 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9556 }
9557
9558 for (unsigned i = 0; i < 2; ++i) {
9559 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9560 default: llvm_unreachable("Unexpected NonZero count");
9561 case 0:
9562 Ops[i] = Ops[i*2]; // Must be a zero vector.
9563 break;
9564 case 1:
9565 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9566 break;
9567 case 2:
9568 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9569 break;
9570 case 3:
9571 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9572 break;
9573 }
9574 }
9575
9576 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9577 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9578 int MaskVec[] = {
9579 Reverse1 ? 1 : 0,
9580 Reverse1 ? 0 : 1,
9581 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9582 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9583 };
9584 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9585 }
9586
9587 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9588
9589 // Check for a build vector from mostly shuffle plus few inserting.
9590 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9591 return Sh;
9592
9593 // For SSE 4.1, use insertps to put the high elements into the low element.
9594 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9595    SDValue Result;
9596    if (!Op.getOperand(0).isUndef())
9597 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9598 else
9599 Result = DAG.getUNDEF(VT);
9600
9601 for (unsigned i = 1; i < NumElems; ++i) {
9602 if (Op.getOperand(i).isUndef()) continue;
9603 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9604 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9605 }
9606 return Result;
9607 }
9608
9609 // Otherwise, expand into a number of unpckl*, start by extending each of
9610 // our (non-undef) elements to the full vector width with the element in the
9611 // bottom slot of the vector (which generates no code for SSE).
9612 SmallVector<SDValue, 8> Ops(NumElems);
9613 for (unsigned i = 0; i < NumElems; ++i) {
9614 if (!Op.getOperand(i).isUndef())
9615 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9616 else
9617 Ops[i] = DAG.getUNDEF(VT);
9618 }
9619
9620 // Next, we iteratively mix elements, e.g. for v4f32:
9621 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9622 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9623 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9624 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9625 // Generate scaled UNPCKL shuffle mask.
9626 SmallVector<int, 16> Mask;
9627 for(unsigned i = 0; i != Scale; ++i)
9628 Mask.push_back(i);
9629 for (unsigned i = 0; i != Scale; ++i)
9630 Mask.push_back(NumElems+i);
9631 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9632
9633 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9634 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9635 }
9636 return Ops[0];
9637}
9638
9639// 256-bit AVX can use the vinsertf128 instruction
9640// to create 256-bit vectors from two other 128-bit ones.
9641// TODO: Detect subvector broadcast here instead of DAG combine?
9642static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
9643                                      SelectionDAG &DAG,
9644 const X86Subtarget &Subtarget) {
9645 MVT ResVT = Op.getSimpleValueType();
9646 assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
9647 "Value type must be 256-/512-bit wide");
9648
9649 unsigned NumOperands = Op.getNumOperands();
9650 unsigned NumFreezeUndef = 0;
9651 unsigned NumZero = 0;
9652 unsigned NumNonZero = 0;
9653 unsigned NonZeros = 0;
9654 SmallSet<SDValue, 4> Undefs;
9655 for (unsigned i = 0; i != NumOperands; ++i) {
9656 SDValue SubVec = Op.getOperand(i);
9657 if (SubVec.isUndef())
9658 continue;
9659 if (ISD::isFreezeUndef(SubVec.getNode())) {
9660 // If the freeze(undef) has multiple uses then we must fold to zero.
9661 if (SubVec.hasOneUse()) {
9662 ++NumFreezeUndef;
9663 } else {
9664 ++NumZero;
9665 Undefs.insert(SubVec);
9666 }
9667 }
9668 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9669 ++NumZero;
9670 else {
9671 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9672 NonZeros |= 1 << i;
9673 ++NumNonZero;
9674 }
9675 }
9676
9677 // If we have more than 2 non-zeros, build each half separately.
9678 if (NumNonZero > 2) {
9679 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9680 ArrayRef<SDUse> Ops = Op->ops();
9681 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9682 Ops.slice(0, NumOperands/2));
9683 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9684 Ops.slice(NumOperands/2));
9685 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9686 }
9687
9688 // Otherwise, build it up through insert_subvectors.
9689 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9690 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9691 : DAG.getUNDEF(ResVT));
9692
9693 // Replace Undef operands with ZeroVector.
9694 for (SDValue U : Undefs)
9695    DAG.ReplaceAllUsesOfValueWith(
9696        U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
9697
9698 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9699 unsigned NumSubElems = SubVT.getVectorNumElements();
9700 for (unsigned i = 0; i != NumOperands; ++i) {
9701 if ((NonZeros & (1 << i)) == 0)
9702 continue;
9703
9704 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9705 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9706 }
9707
9708 return Vec;
9709}
9710
9711// Returns true if the given node is a type promotion (by concatenating i1
9712// zeros) of the result of a node that already zeros all upper bits of
9713// a k-register.
9714// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9715static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
9716                                       const X86Subtarget &Subtarget,
9717 SelectionDAG & DAG) {
9718 MVT ResVT = Op.getSimpleValueType();
9719 unsigned NumOperands = Op.getNumOperands();
9720 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9721 "Unexpected number of operands in CONCAT_VECTORS");
9722
9723 uint64_t Zeros = 0;
9724 uint64_t NonZeros = 0;
9725 for (unsigned i = 0; i != NumOperands; ++i) {
9726 SDValue SubVec = Op.getOperand(i);
9727 if (SubVec.isUndef())
9728 continue;
9729 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9730 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9731 Zeros |= (uint64_t)1 << i;
9732 else
9733 NonZeros |= (uint64_t)1 << i;
9734 }
9735
9736 unsigned NumElems = ResVT.getVectorNumElements();
9737
9738 // If we are inserting non-zero vector and there are zeros in LSBs and undef
9739 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
9740 // insert_subvector will give us two kshifts.
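  // E.g. v16i1 (concat_vectors (v4i1 zero), (v4i1 X), undef, undef): widen X
  // to the shift type, emit a single KSHIFTL by 4 (= Idx * SubVecNumElts), and
  // extract the v16i1 result.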
9741 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9742 Log2_64(NonZeros) != NumOperands - 1) {
9743 unsigned Idx = Log2_64(NonZeros);
9744 SDValue SubVec = Op.getOperand(Idx);
9745 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9746 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9747 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9748 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9749 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9750 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9751 DAG.getVectorIdxConstant(0, dl));
9752 }
9753
9754 // If there are zero or one non-zeros we can handle this very simply.
9755 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9756 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9757 if (!NonZeros)
9758 return Vec;
9759 unsigned Idx = Log2_64(NonZeros);
9760 SDValue SubVec = Op.getOperand(Idx);
9761 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9762 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9763 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
9764 }
9765
9766 if (NumOperands > 2) {
9767 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9768 ArrayRef<SDUse> Ops = Op->ops();
9769 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9770 Ops.slice(0, NumOperands / 2));
9771 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9772 Ops.slice(NumOperands / 2));
9773 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9774 }
9775
9776 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9777
9778 if (ResVT.getVectorNumElements() >= 16)
9779 return Op; // The operation is legal with KUNPCK
9780
9781 SDValue Vec =
9782 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
9783 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
9784 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9785 DAG.getVectorIdxConstant(NumElems / 2, dl));
9786}
9787
9788static SDValue LowerCONCAT_VECTORS(SDValue Op,
9789                                   const X86Subtarget &Subtarget,
9790 SelectionDAG &DAG) {
9791 SDLoc DL(Op);
9792 MVT VT = Op.getSimpleValueType();
9793 if (VT.getVectorElementType() == MVT::i1)
9794 return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
9795
9796 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9797 // from two other 128-bit ones.
9798 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9799 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9800 (VT.is512BitVector() &&
9801 (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
9802 return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
9803}
9804
9805//===----------------------------------------------------------------------===//
9806// Vector shuffle lowering
9807//
9808// This is an experimental code path for lowering vector shuffles on x86. It is
9809// designed to handle arbitrary vector shuffles and blends, gracefully
9810// degrading performance as necessary. It works hard to recognize idiomatic
9811// shuffles and lower them to optimal instruction patterns without leaving
9812// a framework that allows reasonably efficient handling of all vector shuffle
9813// patterns.
9814//===----------------------------------------------------------------------===//
9815
9816/// Checks whether the vector elements referenced by two shuffle masks are
9817/// equivalent.
9818static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9819 int Idx, int ExpectedIdx) {
9820 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9821 ExpectedIdx < MaskSize && "Out of range element index");
9822 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9823 return false;
9824
9825 EVT VT = Op.getValueType();
9826 EVT ExpectedVT = ExpectedOp.getValueType();
9827
9828 // Sources must be vectors and match the mask's element count.
9829 if (!VT.isVector() || !ExpectedVT.isVector() ||
9830 (int)VT.getVectorNumElements() != MaskSize ||
9831 (int)ExpectedVT.getVectorNumElements() != MaskSize)
9832 return false;
9833
9834 // Exact match.
9835 if (Idx == ExpectedIdx && Op == ExpectedOp)
9836 return true;
9837
9838 switch (Op.getOpcode()) {
9839 case ISD::BUILD_VECTOR:
9840 // If the values are build vectors, we can look through them to find
9841 // equivalent inputs that make the shuffles equivalent.
9842 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9843 case ISD::BITCAST: {
9844    SDValue Src = Op.getOperand(0);
9845    EVT SrcVT = Src.getValueType();
9846 if (Op == ExpectedOp && SrcVT.isVector()) {
9847 if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
9848 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
9849 return (Idx % Scale) == (ExpectedIdx % Scale) &&
9850 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9851 Idx / Scale, ExpectedIdx / Scale);
9852 }
9853 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
9854 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
9855 for (unsigned I = 0; I != Scale; ++I)
9856 if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9857 (Idx * Scale) + I,
9858 (ExpectedIdx * Scale) + I))
9859 return false;
9860 return true;
9861 }
9862 }
9863 break;
9864 }
9865 case ISD::VECTOR_SHUFFLE: {
9866 auto *SVN = cast<ShuffleVectorSDNode>(Op);
9867 return Op == ExpectedOp &&
9868 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
9869 }
9870 case X86ISD::VBROADCAST:
9871  case X86ISD::VBROADCAST_LOAD:
9872    return Op == ExpectedOp;
9873  case X86ISD::SUBV_BROADCAST_LOAD:
9874    if (Op == ExpectedOp) {
9875 auto *MemOp = cast<MemSDNode>(Op);
9876 unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
9877 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
9878 }
9879 break;
9880 case X86ISD::VPERMI: {
9881 if (Op == ExpectedOp) {
9882      SmallVector<int, 8> Mask;
9883      DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
9884 SDValue Src = Op.getOperand(0);
9885 return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
9886 Mask[ExpectedIdx]);
9887 }
9888 break;
9889 }
9890 case X86ISD::HADD:
9891 case X86ISD::HSUB:
9892 case X86ISD::FHADD:
9893 case X86ISD::FHSUB:
9894 case X86ISD::PACKSS:
9895 case X86ISD::PACKUS:
9896 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9897 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9898 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9899 int NumElts = VT.getVectorNumElements();
9900 int NumLanes = VT.getSizeInBits() / 128;
9901 int NumEltsPerLane = NumElts / NumLanes;
9902 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9903 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9904 bool SameElt =
9905 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9906 return SameLane && SameElt;
9907 }
9908 break;
9909 }
9910
9911 return false;
9912}
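// Illustrative example (editorial addition, not part of the original source):
// for a BUILD_VECTOR(a, b, a, b) queried against itself, IsElementEquivalent
// reports indices 0 and 2 (and 1 and 3) as equivalent because the referenced
// operands are identical, which lets callers treat masks that differ only in
// which of the duplicated elements they pick as the same shuffle.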
9913
9914/// Tiny helper function to identify a no-op mask.
9915///
9916/// This is a somewhat boring predicate function. It checks whether the mask
9917/// array input, which is assumed to be a single-input shuffle mask of the kind
9918/// used by the X86 shuffle instructions (not a fully general
9919/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9920/// in-place shuffle are 'no-op's.
9921 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9922 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9923 assert(Mask[i] >= -1 && "Out of bound mask element!");
9924 if (Mask[i] >= 0 && Mask[i] != i)
9925 return false;
9926 }
9927 return true;
9928}
9929
9930/// Test whether there are elements crossing LaneSizeInBits lanes in this
9931/// shuffle mask.
9932///
9933/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9934/// and we routinely test for these.
9935static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9936 unsigned ScalarSizeInBits,
9937 ArrayRef<int> Mask) {
9938 assert(LaneSizeInBits && ScalarSizeInBits &&
9939 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9940 "Illegal shuffle lane size");
9941 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9942 int Size = Mask.size();
9943 for (int i = 0; i < Size; ++i)
9944 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9945 return true;
9946 return false;
9947}
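// Illustrative example (editorial addition, not part of the original source):
// for a v8i32 shuffle (two 128-bit lanes of four elements) the mask
// <0,1,2,3,4,5,6,7> stays within its lanes, while <4,5,6,7,0,1,2,3> crosses
// lanes because element 4 (lane 1) is placed into lane 0.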
9948
9949/// Test whether there are elements crossing 128-bit lanes in this
9950/// shuffle mask.
9951 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9952 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9953}
9954
9955/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9956 /// from multiple lanes - this is different from isLaneCrossingShuffleMask, to
9957/// better support 'repeated mask + lane permute' style shuffles.
9958static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9959 unsigned ScalarSizeInBits,
9960 ArrayRef<int> Mask) {
9961 assert(LaneSizeInBits && ScalarSizeInBits &&
9962 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9963 "Illegal shuffle lane size");
9964 int NumElts = Mask.size();
9965 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9966 int NumLanes = NumElts / NumEltsPerLane;
9967 if (NumLanes > 1) {
9968 for (int i = 0; i != NumLanes; ++i) {
9969 int SrcLane = -1;
9970 for (int j = 0; j != NumEltsPerLane; ++j) {
9971 int M = Mask[(i * NumEltsPerLane) + j];
9972 if (M < 0)
9973 continue;
9974 int Lane = (M % NumElts) / NumEltsPerLane;
9975 if (SrcLane >= 0 && SrcLane != Lane)
9976 return true;
9977 SrcLane = Lane;
9978 }
9979 }
9980 }
9981 return false;
9982}
9983
9984/// Test whether a shuffle mask is equivalent within each sub-lane.
9985///
9986/// This checks a shuffle mask to see if it is performing the same
9987/// lane-relative shuffle in each sub-lane. This trivially implies
9988/// that it is also not lane-crossing. It may however involve a blend from the
9989/// same lane of a second vector.
9990///
9991/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9992/// non-trivial to compute in the face of undef lanes. The representation is
9993/// suitable for use with existing 128-bit shuffles as entries from the second
9994/// vector have been remapped to [LaneSize, 2*LaneSize).
9995static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9996 ArrayRef<int> Mask,
9997 SmallVectorImpl<int> &RepeatedMask) {
9998 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9999 RepeatedMask.assign(LaneSize, -1);
10000 int Size = Mask.size();
10001 for (int i = 0; i < Size; ++i) {
10002 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10003 if (Mask[i] < 0)
10004 continue;
10005 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10006 // This entry crosses lanes, so there is no way to model this shuffle.
10007 return false;
10008
10009 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10010 // Adjust second vector indices to start at LaneSize instead of Size.
10011 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10012 : Mask[i] % LaneSize + LaneSize;
10013 if (RepeatedMask[i % LaneSize] < 0)
10014 // This is the first non-undef entry in this slot of a 128-bit lane.
10015 RepeatedMask[i % LaneSize] = LocalM;
10016 else if (RepeatedMask[i % LaneSize] != LocalM)
10017 // Found a mismatch with the repeated mask.
10018 return false;
10019 }
10020 return true;
10021}
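// Illustrative example (editorial addition, not part of the original source):
// for v8i32 with 128-bit lanes, the mask <0,9,2,11,4,13,6,15> applies the same
// in-lane blend of V1 and V2 in both lanes, so RepeatedMask becomes <0,5,2,7>
// with the second-vector entries remapped into [4, 8).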
10022
10023/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10024static bool
10025 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10026 SmallVectorImpl<int> &RepeatedMask) {
10027 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10028}
10029
10030static bool
10031 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10032 SmallVector<int, 32> RepeatedMask;
10033 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10034}
10035
10036/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10037static bool
10038 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10039 SmallVectorImpl<int> &RepeatedMask) {
10040 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10041}
10042
10043/// Test whether a target shuffle mask is equivalent within each sub-lane.
10044/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10045static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10046 unsigned EltSizeInBits,
10047 ArrayRef<int> Mask,
10048 SmallVectorImpl<int> &RepeatedMask) {
10049 int LaneSize = LaneSizeInBits / EltSizeInBits;
10050 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10051 int Size = Mask.size();
10052 for (int i = 0; i < Size; ++i) {
10053 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10054 if (Mask[i] == SM_SentinelUndef)
10055 continue;
10056 if (Mask[i] == SM_SentinelZero) {
10057 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10058 return false;
10059 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10060 continue;
10061 }
10062 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10063 // This entry crosses lanes, so there is no way to model this shuffle.
10064 return false;
10065
10066 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10067 // later vector indices to start at multiples of LaneSize instead of Size.
10068 int LaneM = Mask[i] / Size;
10069 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
10070 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10071 // This is the first non-undef entry in this slot of a 128-bit lane.
10072 RepeatedMask[i % LaneSize] = LocalM;
10073 else if (RepeatedMask[i % LaneSize] != LocalM)
10074 // Found a mismatch with the repeated mask.
10075 return false;
10076 }
10077 return true;
10078}
10079
10080/// Test whether a target shuffle mask is equivalent within each sub-lane.
10081/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10082static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10083 ArrayRef<int> Mask,
10084 SmallVectorImpl<int> &RepeatedMask) {
10085 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10086 Mask, RepeatedMask);
10087}
10088
10089/// Checks whether a shuffle mask is equivalent to an explicit list of
10090/// arguments.
10091///
10092/// This is a fast way to test a shuffle mask against a fixed pattern:
10093///
10094 /// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
10095///
10096/// It returns true if the mask is exactly as wide as the argument list, and
10097/// each element of the mask is either -1 (signifying undef) or the value given
10098/// in the argument.
10099static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10100 SDValue V1 = SDValue(),
10101 SDValue V2 = SDValue()) {
10102 int Size = Mask.size();
10103 if (Size != (int)ExpectedMask.size())
10104 return false;
10105
10106 for (int i = 0; i < Size; ++i) {
10107 assert(Mask[i] >= -1 && "Out of bound mask element!");
10108 int MaskIdx = Mask[i];
10109 int ExpectedIdx = ExpectedMask[i];
10110 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10111 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10112 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10113 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10114 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10115 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10116 return false;
10117 }
10118 }
10119 return true;
10120}
10121
10122/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10123///
10124/// The masks must be exactly the same width.
10125///
10126/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10127/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10128///
10129/// SM_SentinelZero is accepted as a valid negative index but must match in
10130/// both, or via a known bits test.
10131 static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
10132 ArrayRef<int> ExpectedMask,
10133 const SelectionDAG &DAG,
10134 SDValue V1 = SDValue(),
10135 SDValue V2 = SDValue()) {
10136 int Size = Mask.size();
10137 if (Size != (int)ExpectedMask.size())
10138 return false;
10139 assert(llvm::all_of(ExpectedMask,
10140 [Size](int M) {
10141 return M == SM_SentinelZero ||
10142 isInRange(M, 0, 2 * Size);
10143 }) &&
10144 "Illegal target shuffle mask");
10145
10146 // Check for out-of-range target shuffle mask indices.
10147 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10148 return false;
10149
10150 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10151 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
10152 !V1.getValueType().isVector()))
10153 V1 = SDValue();
10154 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
10155 !V2.getValueType().isVector()))
10156 V2 = SDValue();
10157
10158 APInt ZeroV1 = APInt::getZero(Size);
10159 APInt ZeroV2 = APInt::getZero(Size);
10160
10161 for (int i = 0; i < Size; ++i) {
10162 int MaskIdx = Mask[i];
10163 int ExpectedIdx = ExpectedMask[i];
10164 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10165 continue;
10166 // If we failed to match an expected SM_SentinelZero then early out.
10167 if (ExpectedIdx < 0)
10168 return false;
10169 if (MaskIdx == SM_SentinelZero) {
10170 // If we need this expected index to be a zero element, then update the
10171 // relevant zero mask and perform the known bits at the end to minimize
10172 // repeated computes.
10173 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10174 if (ExpectedV &&
10175 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
10176 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10177 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
10178 ZeroMask.setBit(BitIdx);
10179 continue;
10180 }
10181 }
10182 if (MaskIdx >= 0) {
10183 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10184 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10185 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10186 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10187 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10188 continue;
10189 }
10190 return false;
10191 }
10192 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
10193 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
10194}
10195
10196// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10197// instructions.
10198 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
10199 const SelectionDAG &DAG) {
10200 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10201 return false;
10202
10203 SmallVector<int, 8> Unpcklwd;
10204 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10205 /* Unary = */ false);
10206 SmallVector<int, 8> Unpckhwd;
10207 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10208 /* Unary = */ false);
10209 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
10210 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
10211 return IsUnpackwdMask;
10212}
10213
10214 static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
10215 const SelectionDAG &DAG) {
10216 // Create 128-bit vector type based on mask size.
10217 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10218 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10219
10220 // We can't assume a canonical shuffle mask, so try the commuted version too.
10221 SmallVector<int, 4> CommutedMask(Mask);
10222 ShuffleVectorSDNode::commuteMask(CommutedMask);
10223 
10224 // Match any of unary/binary or low/high.
10225 for (unsigned i = 0; i != 4; ++i) {
10226 SmallVector<int, 16> UnpackMask;
10227 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10228 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
10229 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
10230 return true;
10231 }
10232 return false;
10233}
10234
10235/// Return true if a shuffle mask chooses elements identically in its top and
10236/// bottom halves. For example, any splat mask has the same top and bottom
10237/// halves. If an element is undefined in only one half of the mask, the halves
10238/// are not considered identical.
10239 static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
10240 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10241 unsigned HalfSize = Mask.size() / 2;
10242 for (unsigned i = 0; i != HalfSize; ++i) {
10243 if (Mask[i] != Mask[i + HalfSize])
10244 return false;
10245 }
10246 return true;
10247}
10248
10249/// Get a 4-lane 8-bit shuffle immediate for a mask.
10250///
10251/// This helper function produces an 8-bit shuffle immediate corresponding to
10252/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10253/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10254/// example.
10255///
10256/// NB: We rely heavily on "undef" masks preserving the input lane.
10257static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10258 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10259 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10260 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10261 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10262 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10263
10264 // If the mask only uses one non-undef element, then fully 'splat' it to
10265 // improve later broadcast matching.
10266 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10267 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10268
10269 int FirstElt = Mask[FirstIndex];
10270 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10271 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10272
10273 unsigned Imm = 0;
10274 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10275 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10276 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10277 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10278 return Imm;
10279}
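// Illustrative example (editorial addition, not part of the original source):
// the mask <3,1,2,0> encodes as 0b00'10'01'11 = 0x27 (result lane 0 reads
// element 3, lane 1 reads element 1, and so on), <-1,1,-1,3> keeps the
// identity index for its undef lanes and encodes as 0b11'10'01'00 = 0xE4,
// and <2,-1,-1,-1> takes the splat path and yields 0b10'10'10'10 = 0xAA.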
10280
10281 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10282 SelectionDAG &DAG) {
10283 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10284}
10285
10286// Canonicalize SHUFPD mask to improve chances of further folding.
10287// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10288static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10289 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10290 "Unexpected SHUFPD mask size");
10291 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10292 "Unexpected SHUFPD mask elements");
10293
10294 // If the mask only uses one non-undef element, then fully 'splat' it to
10295 // improve later broadcast matching.
10296 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10297 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10298 "All undef shuffle mask");
10299
10300 int FirstElt = Mask[FirstIndex];
10301 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10302 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10303 unsigned Imm = 0;
10304 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10305 Imm |= FirstElt << I;
10306 return Imm;
10307 }
10308
10309 // Attempt to keep any undef elements in place to improve chances of the
10310 // shuffle becoming a (commutative) blend.
10311 unsigned Imm = 0;
10312 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10313 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10314
10315 return Imm;
10316}
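// Illustrative example (editorial addition, not part of the original source):
// for a 4-element SHUFPD mask, <1,0,1,0> encodes as 0b0101 = 0x5, the
// single-element mask <1,-1,1,-1> is splatted to 0b1111 = 0xF, and
// <-1,1,-1,0> keeps each undef slot at its identity bit (I & 1) and encodes
// as 0b0010 = 0x2.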
10317
10318 static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
10319 SelectionDAG &DAG) {
10320 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10321}
10322
10323 // The shuffle result has the form:
10324 // 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where 0* is a possibly empty run of zeros
10325 // and the a[] elements appear in ascending order.
10326 // Each Zeroable element corresponds to a particular Mask element, as described
10327 // in the computeZeroableShuffleElements function.
10328 //
10329 // The function returns true if it finds a sub-mask whose nonzero elements are in increasing order.
10330static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10331 ArrayRef<int> Mask, const EVT &VectorType,
10332 bool &IsZeroSideLeft) {
10333 int NextElement = -1;
10334 // Check if the Mask's nonzero elements are in increasing order.
10335 for (int i = 0, e = Mask.size(); i < e; i++) {
10336 // Check that the mask's zero elements are built from only zeros.
10337 assert(Mask[i] >= -1 && "Out of bound mask element!");
10338 if (Mask[i] < 0)
10339 return false;
10340 if (Zeroable[i])
10341 continue;
10342 // Find the lowest nonzero element.
10343 if (NextElement < 0) {
10344 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10345 IsZeroSideLeft = NextElement != 0;
10346 }
10347 // Exit if the mask's nonzero elements are not in increasing order.
10348 if (NextElement != Mask[i])
10349 return false;
10350 NextElement++;
10351 }
10352 return true;
10353}
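// Illustrative example (editorial addition, not part of the original source):
// for a v8i32 shuffle with Zeroable marking elements 0-3 and
// Mask = <0,1,2,3,8,9,10,11>, the nonzero elements (8..11) are consecutive and
// start at the first element of the second vector, so the function succeeds
// with IsZeroSideLeft == true; this is the shape the VEXPAND lowering below
// turns into a masked expand from a zero vector.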
10354
10355static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
10356 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
10357 const X86Subtarget &Subtarget,
10358 unsigned Depth = 0);
10359
10360/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10361 static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10362 ArrayRef<int> Mask, SDValue V1,
10363 SDValue V2, const APInt &Zeroable,
10364 const X86Subtarget &Subtarget,
10365 SelectionDAG &DAG) {
10366 int Size = Mask.size();
10367 int LaneSize = 128 / VT.getScalarSizeInBits();
10368 const int NumBytes = VT.getSizeInBits() / 8;
10369 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10370
10371 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10372 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10373 (Subtarget.hasBWI() && VT.is512BitVector()));
10374
10375 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10376 // Sign bit set in i8 mask means zero element.
10377 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10378
10379 SDValue V;
10380 for (int i = 0; i < NumBytes; ++i) {
10381 int M = Mask[i / NumEltBytes];
10382 if (M < 0) {
10383 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10384 continue;
10385 }
10386 if (Zeroable[i / NumEltBytes]) {
10387 PSHUFBMask[i] = ZeroMask;
10388 continue;
10389 }
10390
10391 // We can only use a single input of V1 or V2.
10392 SDValue SrcV = (M >= Size ? V2 : V1);
10393 if (V && V != SrcV)
10394 return SDValue();
10395 V = SrcV;
10396 M %= Size;
10397
10398 // PSHUFB can't cross lanes, ensure this doesn't happen.
10399 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10400 return SDValue();
10401
10402 M = M % LaneSize;
10403 M = M * NumEltBytes + (i % NumEltBytes);
10404 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10405 }
10406 assert(V && "Failed to find a source input");
10407
10408 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10409 return DAG.getBitcast(
10410 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10411 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10412}
10413
10414static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10415 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10416 const SDLoc &dl);
10417
10418// X86 has dedicated shuffle that can be lowered to VEXPAND
10419 static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
10420 SDValue V2, ArrayRef<int> Mask,
10421 const APInt &Zeroable,
10422 const X86Subtarget &Subtarget,
10423 SelectionDAG &DAG) {
10424 bool IsLeftZeroSide = true;
10425 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10426 IsLeftZeroSide))
10427 return SDValue();
10428 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10429 MVT IntegerType =
10430 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10431 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10432 unsigned NumElts = VT.getVectorNumElements();
10433 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10434 "Unexpected number of vector elements");
10435 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10436 Subtarget, DAG, DL);
10437 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10438 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10439 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10440}
10441
10442static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10443 unsigned &UnpackOpcode, bool IsUnary,
10444 ArrayRef<int> TargetMask, const SDLoc &DL,
10445 SelectionDAG &DAG,
10446 const X86Subtarget &Subtarget) {
10447 int NumElts = VT.getVectorNumElements();
10448
10449 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10450 for (int i = 0; i != NumElts; i += 2) {
10451 int M1 = TargetMask[i + 0];
10452 int M2 = TargetMask[i + 1];
10453 Undef1 &= (SM_SentinelUndef == M1);
10454 Undef2 &= (SM_SentinelUndef == M2);
10455 Zero1 &= isUndefOrZero(M1);
10456 Zero2 &= isUndefOrZero(M2);
10457 }
10458 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10459 "Zeroable shuffle detected");
10460
10461 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10462 SmallVector<int, 64> Unpckl, Unpckh;
10463 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10464 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10465 (IsUnary ? V1 : V2))) {
10466 UnpackOpcode = X86ISD::UNPCKL;
10467 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10468 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10469 return true;
10470 }
10471
10472 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10473 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10474 (IsUnary ? V1 : V2))) {
10475 UnpackOpcode = X86ISD::UNPCKH;
10476 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10477 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10478 return true;
10479 }
10480
10481 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
10482 if (IsUnary && (Zero1 || Zero2)) {
10483 // Don't bother if we can blend instead.
10484 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10485 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10486 return false;
10487
10488 bool MatchLo = true, MatchHi = true;
10489 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10490 int M = TargetMask[i];
10491
10492 // Ignore if the input is known to be zero or the index is undef.
10493 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10494 (M == SM_SentinelUndef))
10495 continue;
10496
10497 MatchLo &= (M == Unpckl[i]);
10498 MatchHi &= (M == Unpckh[i]);
10499 }
10500
10501 if (MatchLo || MatchHi) {
10502 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10503 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10504 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10505 return true;
10506 }
10507 }
10508
10509 // If a binary shuffle, commute and try again.
10510 if (!IsUnary) {
10511 ShuffleVectorSDNode::commuteMask(Unpckl);
10512 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10513 UnpackOpcode = X86ISD::UNPCKL;
10514 std::swap(V1, V2);
10515 return true;
10516 }
10517
10518 ShuffleVectorSDNode::commuteMask(Unpckh);
10519 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10520 UnpackOpcode = X86ISD::UNPCKH;
10521 std::swap(V1, V2);
10522 return true;
10523 }
10524 }
10525
10526 return false;
10527}
10528
10529// X86 has dedicated unpack instructions that can handle specific blend
10530// operations: UNPCKH and UNPCKL.
10531 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
10532 SDValue V2, ArrayRef<int> Mask,
10533 SelectionDAG &DAG) {
10534 SmallVector<int, 8> Unpckl;
10535 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10536 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10537 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10538
10539 SmallVector<int, 8> Unpckh;
10540 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10541 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10542 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10543
10544 // Commute and try again.
10545 ShuffleVectorSDNode::commuteMask(Unpckl);
10546 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10547 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10548
10549 ShuffleVectorSDNode::commuteMask(Unpckh);
10550 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10551 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10552
10553 return SDValue();
10554}
10555
10556/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10557/// followed by unpack 256-bit.
10558 static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1,
10559 SDValue V2, ArrayRef<int> Mask,
10560 SelectionDAG &DAG) {
10561 SmallVector<int, 32> Unpckl, Unpckh;
10562 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10563 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10564
10565 unsigned UnpackOpcode;
10566 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10567 UnpackOpcode = X86ISD::UNPCKL;
10568 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10569 UnpackOpcode = X86ISD::UNPCKH;
10570 else
10571 return SDValue();
10572
10573 // This is a "natural" unpack operation (rather than the 128-bit sectored
10574 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10575 // input in order to use the x86 instruction.
10576 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10577 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10578 V1 = DAG.getBitcast(VT, V1);
10579 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10580}
10581
10582// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10583// source into the lower elements and zeroing the upper elements.
10584static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10585 ArrayRef<int> Mask, const APInt &Zeroable,
10586 const X86Subtarget &Subtarget) {
10587 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10588 return false;
10589
10590 unsigned NumElts = Mask.size();
10591 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10592 unsigned MaxScale = 64 / EltSizeInBits;
10593
10594 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10595 unsigned SrcEltBits = EltSizeInBits * Scale;
10596 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10597 continue;
10598 unsigned NumSrcElts = NumElts / Scale;
10599 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10600 continue;
10601 unsigned UpperElts = NumElts - NumSrcElts;
10602 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10603 continue;
10604 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10605 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10606 DstVT = MVT::getIntegerVT(EltSizeInBits);
10607 if ((NumSrcElts * EltSizeInBits) >= 128) {
10608 // ISD::TRUNCATE
10609 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10610 } else {
10611 // X86ISD::VTRUNC
10612 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10613 }
10614 return true;
10615 }
10616
10617 return false;
10618}
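// Illustrative example (editorial addition, not part of the original source):
// on an AVX512VL+BWI target, the v16i8 mask <0,2,4,6,8,10,12,14,z,...,z>
// (z = known zero) matches with Scale = 2: the shuffle is modelled as
// truncating a v8i16 source (SrcVT = v8i16) into a v16i8 X86ISD::VTRUNC
// result whose upper eight bytes are zero.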
10619
10620// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10621// element padding to the final DstVT.
10622static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10623 const X86Subtarget &Subtarget,
10624 SelectionDAG &DAG, bool ZeroUppers) {
10625 MVT SrcVT = Src.getSimpleValueType();
10626 MVT DstSVT = DstVT.getScalarType();
10627 unsigned NumDstElts = DstVT.getVectorNumElements();
10628 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10629 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10630
10631 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10632 return SDValue();
10633
10634 // Perform a direct ISD::TRUNCATE if possible.
10635 if (NumSrcElts == NumDstElts)
10636 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10637
10638 if (NumSrcElts > NumDstElts) {
10639 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10640 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10641 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10642 }
10643
10644 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10645 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10646 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10647 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10648 DstVT.getSizeInBits());
10649 }
10650
10651 // Non-VLX targets must truncate from a 512-bit type, so we need to
10652 // widen, truncate and then possibly extract the original subvector.
10653 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10654 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10655 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10656 }
10657
10658 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10659 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10660 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10661 if (DstVT != TruncVT)
10662 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10663 DstVT.getSizeInBits());
10664 return Trunc;
10665}
10666
10667// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10668//
10669// An example is the following:
10670//
10671// t0: ch = EntryToken
10672// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10673// t25: v4i32 = truncate t2
10674// t41: v8i16 = bitcast t25
10675// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10676// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10677// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10678// t18: v2i64 = bitcast t51
10679//
10680 // One can just use a single vpmovdw instruction; without avx512vl we need to
10681 // use the zmm variant and extract the lower subvector, padding with zeroes.
10682// TODO: Merge with lowerShuffleAsVTRUNC.
10683 static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10684 SDValue V2, ArrayRef<int> Mask,
10685 const APInt &Zeroable,
10686 const X86Subtarget &Subtarget,
10687 SelectionDAG &DAG) {
10688 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10689 if (!Subtarget.hasAVX512())
10690 return SDValue();
10691
10692 unsigned NumElts = VT.getVectorNumElements();
10693 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10694 unsigned MaxScale = 64 / EltSizeInBits;
10695 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10696 unsigned SrcEltBits = EltSizeInBits * Scale;
10697 unsigned NumSrcElts = NumElts / Scale;
10698 unsigned UpperElts = NumElts - NumSrcElts;
10699 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10700 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10701 continue;
10702
10703 // Attempt to find a matching source truncation, but as a fall back VLX
10704 // cases can use the VPMOV directly.
10705 SDValue Src = peekThroughBitcasts(V1);
10706 if (Src.getOpcode() == ISD::TRUNCATE &&
10707 Src.getScalarValueSizeInBits() == SrcEltBits) {
10708 Src = Src.getOperand(0);
10709 } else if (Subtarget.hasVLX()) {
10710 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10711 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10712 Src = DAG.getBitcast(SrcVT, Src);
10713 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10714 if (Scale == 2 &&
10715 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10716 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10717 return SDValue();
10718 } else
10719 return SDValue();
10720
10721 // VPMOVWB is only available with avx512bw.
10722 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10723 return SDValue();
10724
10725 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10726 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10727 }
10728
10729 return SDValue();
10730}
10731
10732// Attempt to match binary shuffle patterns as a truncate.
10733 static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10734 SDValue V2, ArrayRef<int> Mask,
10735 const APInt &Zeroable,
10736 const X86Subtarget &Subtarget,
10737 SelectionDAG &DAG) {
10738 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10739 "Unexpected VTRUNC type");
10740 if (!Subtarget.hasAVX512() ||
10741 (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
10742 return SDValue();
10743
10744 unsigned NumElts = VT.getVectorNumElements();
10745 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10746 unsigned MaxScale = 64 / EltSizeInBits;
10747 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10748 // TODO: Support non-BWI VPMOVWB truncations?
10749 unsigned SrcEltBits = EltSizeInBits * Scale;
10750 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10751 continue;
10752
10753 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10754 // Bail if the V2 elements are undef.
10755 unsigned NumHalfSrcElts = NumElts / Scale;
10756 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10757 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10758 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10759 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10760 continue;
10761
10762 // The elements beyond the truncation must be undef/zero.
10763 unsigned UpperElts = NumElts - NumSrcElts;
10764 if (UpperElts > 0 &&
10765 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10766 continue;
10767 bool UndefUppers =
10768 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10769
10770 // As we're using both sources, we need to concatenate them together
10771 // and truncate from the double-sized source.
10772 MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
10773
10774 // For offset truncations, ensure that the concat is cheap.
10775 SDValue Src =
10776 combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
10777 if (!Src) {
10778 if (Offset)
10779 continue;
10780 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10781 }
10782
10783 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10784 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10785 Src = DAG.getBitcast(SrcVT, Src);
10786
10787 // Shift the offset'd elements into place for the truncation.
10788 // TODO: Use getTargetVShiftByConstNode.
10789 if (Offset)
10790 Src = DAG.getNode(
10791 X86ISD::VSRLI, DL, SrcVT, Src,
10792 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10793
10794 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10795 }
10796 }
10797
10798 return SDValue();
10799}
10800
10801/// Check whether a compaction lowering can be done by dropping even/odd
10802/// elements and compute how many times even/odd elements must be dropped.
10803///
10804/// This handles shuffles which take every Nth element where N is a power of
10805/// two. Example shuffle masks:
10806///
10807/// (even)
10808/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10809/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10810/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10811/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10812/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10813/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10814///
10815/// (odd)
10816/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10817/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10818///
10819/// Any of these lanes can of course be undef.
10820///
10821/// This routine only supports N <= 3.
10822/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10823/// for larger N.
10824///
10825/// \returns N above, or the number of times even/odd elements must be dropped
10826/// if there is such a number. Otherwise returns zero.
10827static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10828 bool IsSingleInput) {
10829 // The modulus for the shuffle vector entries is based on whether this is
10830 // a single input or not.
10831 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10832 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10833 "We should only be called with masks with a power-of-2 size!");
10834
10835 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10836 int Offset = MatchEven ? 0 : 1;
10837
10838 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10839 // and 2^3 simultaneously. This is because we may have ambiguity with
10840 // partially undef inputs.
10841 bool ViableForN[3] = {true, true, true};
10842
10843 for (int i = 0, e = Mask.size(); i < e; ++i) {
10844 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10845 // want.
10846 if (Mask[i] < 0)
10847 continue;
10848
10849 bool IsAnyViable = false;
10850 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10851 if (ViableForN[j]) {
10852 uint64_t N = j + 1;
10853
10854 // The shuffle mask must be equal to (i * 2^N) % M.
10855 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10856 IsAnyViable = true;
10857 else
10858 ViableForN[j] = false;
10859 }
10860 // Early exit if we exhaust the possible powers of two.
10861 if (!IsAnyViable)
10862 break;
10863 }
10864
10865 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10866 if (ViableForN[j])
10867 return j + 1;
10868
10869 // Return 0 as there is no viable power of two.
10870 return 0;
10871}
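// Illustrative example (editorial addition, not part of the original source):
// the single-input v16i8 mask <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> (u = undef)
// matches the even pattern with N = 1, so the shuffle can be implemented by
// dropping the odd elements once (e.g. a single PACK stage); the odd variant
// <1,3,5,7,...> matches when MatchEven is false (Offset = 1) instead.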
10872
10873// X86 has dedicated pack instructions that can handle specific truncation
10874// operations: PACKSS and PACKUS.
10875// Checks for compaction shuffle masks if MaxStages > 1.
10876// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10877static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10878 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10879 const SelectionDAG &DAG,
10880 const X86Subtarget &Subtarget,
10881 unsigned MaxStages = 1) {
10882 unsigned NumElts = VT.getVectorNumElements();
10883 unsigned BitSize = VT.getScalarSizeInBits();
10884 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10885 "Illegal maximum compaction");
10886
10887 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10888 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10889 unsigned NumPackedBits = NumSrcBits - BitSize;
10890 N1 = peekThroughBitcasts(N1);
10891 N2 = peekThroughBitcasts(N2);
10892 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10893 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10894 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10895 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10896 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10897 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10898 return false;
10899 if (Subtarget.hasSSE41() || BitSize == 8) {
10900 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10901 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10902 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10903 V1 = N1;
10904 V2 = N2;
10905 SrcVT = PackVT;
10906 PackOpcode = X86ISD::PACKUS;
10907 return true;
10908 }
10909 }
10910 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10911 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10912 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10913 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10914 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10915 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10916 V1 = N1;
10917 V2 = N2;
10918 SrcVT = PackVT;
10919 PackOpcode = X86ISD::PACKSS;
10920 return true;
10921 }
10922 return false;
10923 };
10924
10925 // Attempt to match against wider and wider compaction patterns.
10926 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10927 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10928 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10929
10930 // Try binary shuffle.
10931 SmallVector<int, 32> BinaryMask;
10932 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10933 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10934 if (MatchPACK(V1, V2, PackVT))
10935 return true;
10936
10937 // Try unary shuffle.
10938 SmallVector<int, 32> UnaryMask;
10939 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10940 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10941 if (MatchPACK(V1, V1, PackVT))
10942 return true;
10943 }
10944
10945 return false;
10946}
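// Illustrative example (editorial addition, not part of the original source):
// the v8i16 mask <0,2,4,6,8,10,12,14> is the single-stage binary compaction
// pattern, i.e. the low halves of two v4i32 inputs. If both inputs have their
// upper 16 bits known zero this is matched as PACKUS (PACKUSDW requires
// SSE4.1), and if both inputs have more than 16 sign bits it is matched as
// PACKSS instead.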
10947
10948 static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1,
10949 SDValue V2, ArrayRef<int> Mask,
10950 const X86Subtarget &Subtarget,
10951 SelectionDAG &DAG) {
10952 MVT PackVT;
10953 unsigned PackOpcode;
10954 unsigned SizeBits = VT.getSizeInBits();
10955 unsigned EltBits = VT.getScalarSizeInBits();
10956 unsigned MaxStages = Log2_32(64 / EltBits);
10957 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10958 Subtarget, MaxStages))
10959 return SDValue();
10960
10961 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10962 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10963
10964 // Don't lower multi-stage packs on AVX512, truncation is better.
10965 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10966 return SDValue();
10967
10968 // Pack to the largest type possible:
10969 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10970 unsigned MaxPackBits = 16;
10971 if (CurrentEltBits > 16 &&
10972 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10973 MaxPackBits = 32;
10974
10975 // Repeatedly pack down to the target size.
10976 SDValue Res;
10977 for (unsigned i = 0; i != NumStages; ++i) {
10978 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10979 unsigned NumSrcElts = SizeBits / SrcEltBits;
10980 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10981 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10982 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10983 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10984 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10985 DAG.getBitcast(SrcVT, V2));
10986 V1 = V2 = Res;
10987 CurrentEltBits /= 2;
10988 }
10989 assert(Res && Res.getValueType() == VT &&
10990 "Failed to lower compaction shuffle");
10991 return Res;
10992}
10993
10994/// Try to emit a bitmask instruction for a shuffle.
10995///
10996/// This handles cases where we can model a blend exactly as a bitmask due to
10997/// one of the inputs being zeroable.
10998 static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10999 SDValue V2, ArrayRef<int> Mask,
11000 const APInt &Zeroable,
11001 const X86Subtarget &Subtarget,
11002 SelectionDAG &DAG) {
11003 MVT MaskVT = VT;
11004 MVT EltVT = VT.getVectorElementType();
11005 SDValue Zero, AllOnes;
11006 // Use f64 if i64 isn't legal.
11007 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11008 EltVT = MVT::f64;
11009 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11010 }
11011
11012 MVT LogicVT = VT;
11013 if (EltVT.isFloatingPoint()) {
11014 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11015 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
11016 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11017 LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size());
11018 } else {
11019 Zero = DAG.getConstant(0, DL, EltVT);
11020 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11021 }
11022
11023 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11024 SDValue V;
11025 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11026 if (Zeroable[i])
11027 continue;
11028 if (Mask[i] % Size != i)
11029 return SDValue(); // Not a blend.
11030 if (!V)
11031 V = Mask[i] < Size ? V1 : V2;
11032 else if (V != (Mask[i] < Size ? V1 : V2))
11033 return SDValue(); // Can only let one input through the mask.
11034
11035 VMaskOps[i] = AllOnes;
11036 }
11037 if (!V)
11038 return SDValue(); // No non-zeroable elements!
11039
11040 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11041 VMask = DAG.getBitcast(LogicVT, VMask);
11042 V = DAG.getBitcast(LogicVT, V);
11043 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11044 return DAG.getBitcast(VT, And);
11045}
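// Illustrative example (editorial addition, not part of the original source):
// for a v4i32 shuffle <0,1,u,u> where Zeroable marks elements 2 and 3, every
// non-zeroable lane selects in-place from V1, so the whole shuffle becomes a
// single AND of V1 with the constant mask <-1,-1,0,0>, avoiding a blend.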
11046
11047/// Try to emit a blend instruction for a shuffle using bit math.
11048///
11049/// This is used as a fallback approach when first class blend instructions are
11050/// unavailable. Currently it is only suitable for integer vectors, but could
11051/// be generalized for floating point vectors if desirable.
11052 static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
11053 SDValue V2, ArrayRef<int> Mask,
11054 SelectionDAG &DAG) {
11055 assert(VT.isInteger() && "Only supports integer vector types!");
11056 MVT EltVT = VT.getVectorElementType();
11057 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11058 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11059 SmallVector<SDValue, 16> MaskOps;
11060 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11061 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11062 return SDValue(); // Shuffled input!
11063 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11064 }
11065
11066 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11067 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
11068}
11069
11070 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
11071 SDValue PreservedSrc,
11072 const X86Subtarget &Subtarget,
11073 SelectionDAG &DAG);
11074 
11075 static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
11076 MutableArrayRef<int> Mask,
11077 const APInt &Zeroable, bool &ForceV1Zero,
11078 bool &ForceV2Zero, uint64_t &BlendMask) {
11079 bool V1IsZeroOrUndef =
11080 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
11081 bool V2IsZeroOrUndef =
11082 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
11083
11084 BlendMask = 0;
11085 ForceV1Zero = false, ForceV2Zero = false;
11086 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11087
11088 int NumElts = Mask.size();
11089 int NumLanes = VT.getSizeInBits() / 128;
11090 int NumEltsPerLane = NumElts / NumLanes;
11091 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
11092
11093 // For 32/64-bit elements, if we only reference one input (plus any undefs),
11094 // then ensure the blend mask part for that lane just references that input.
11095 bool ForceWholeLaneMasks =
11096 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
11097
11098 // Attempt to generate the binary blend mask. If an input is zero then
11099 // we can use any lane.
11100 for (int Lane = 0; Lane != NumLanes; ++Lane) {
11101 // Keep track of the inputs used per lane.
11102 bool LaneV1InUse = false;
11103 bool LaneV2InUse = false;
11104 uint64_t LaneBlendMask = 0;
11105 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
11106 int Elt = (Lane * NumEltsPerLane) + LaneElt;
11107 int M = Mask[Elt];
11108 if (M == SM_SentinelUndef)
11109 continue;
11110 if (M == Elt || (0 <= M && M < NumElts &&
11111 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
11112 Mask[Elt] = Elt;
11113 LaneV1InUse = true;
11114 continue;
11115 }
11116 if (M == (Elt + NumElts) ||
11117 (NumElts <= M &&
11118 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
11119 LaneBlendMask |= 1ull << LaneElt;
11120 Mask[Elt] = Elt + NumElts;
11121 LaneV2InUse = true;
11122 continue;
11123 }
11124 if (Zeroable[Elt]) {
11125 if (V1IsZeroOrUndef) {
11126 ForceV1Zero = true;
11127 Mask[Elt] = Elt;
11128 LaneV1InUse = true;
11129 continue;
11130 }
11131 if (V2IsZeroOrUndef) {
11132 ForceV2Zero = true;
11133 LaneBlendMask |= 1ull << LaneElt;
11134 Mask[Elt] = Elt + NumElts;
11135 LaneV2InUse = true;
11136 continue;
11137 }
11138 }
11139 return false;
11140 }
11141
11142 // If we only used V2 then splat the lane blend mask to avoid any demanded
11143 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
11144 // blend mask bit).
11145 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
11146 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
11147
11148 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
11149 }
11150 return true;
11151}
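// Illustrative example (editorial addition, not part of the original source):
// for a v4i32 shuffle with Mask <0,5,2,7>, elements 1 and 3 come from V2, so
// matchShuffleAsBlend produces BlendMask = 0b1010, which lowerShuffleAsBlend
// below emits as the X86ISD::BLENDI immediate.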
11152
11153/// Try to emit a blend instruction for a shuffle.
11154///
11155/// This doesn't do any checks for the availability of instructions for blending
11156/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11157/// be matched in the backend with the type given. What it does check for is
11158/// that the shuffle mask is a blend, or convertible into a blend with zero.
11159 static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
11160 SDValue V2, ArrayRef<int> Original,
11161 const APInt &Zeroable,
11162 const X86Subtarget &Subtarget,
11163 SelectionDAG &DAG) {
11164 uint64_t BlendMask = 0;
11165 bool ForceV1Zero = false, ForceV2Zero = false;
11166 SmallVector<int, 64> Mask(Original);
11167 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11168 BlendMask))
11169 return SDValue();
11170
11171 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11172 if (ForceV1Zero)
11173 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11174 if (ForceV2Zero)
11175 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11176
11177 unsigned NumElts = VT.getVectorNumElements();
11178
11179 switch (VT.SimpleTy) {
11180 case MVT::v4i64:
11181 case MVT::v8i32:
11182 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11183 [[fallthrough]];
11184 case MVT::v4f64:
11185 case MVT::v8f32:
11186 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11187 [[fallthrough]];
11188 case MVT::v2f64:
11189 case MVT::v2i64:
11190 case MVT::v4f32:
11191 case MVT::v4i32:
11192 case MVT::v8i16:
11193 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11194 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11195 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11196 case MVT::v16i16: {
11197 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11198 SmallVector<int, 8> RepeatedMask;
11199 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11200 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11201 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11202 BlendMask = 0;
11203 for (int i = 0; i < 8; ++i)
11204 if (RepeatedMask[i] >= 8)
11205 BlendMask |= 1ull << i;
11206 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11207 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11208 }
11209 // Use PBLENDW for lower/upper lanes and then blend lanes.
11210 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11211 // merge to VSELECT where useful.
11212 uint64_t LoMask = BlendMask & 0xFF;
11213 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11214 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11215 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11216 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11217 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11218 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11219 return DAG.getVectorShuffle(
11220 MVT::v16i16, DL, Lo, Hi,
11221 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11222 }
11223 [[fallthrough]];
11224 }
11225 case MVT::v32i8:
11226 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11227 [[fallthrough]];
11228 case MVT::v16i8: {
11229 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11230
11231 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11232 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11233 Subtarget, DAG))
11234 return Masked;
11235
11236 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11237 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11238 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11239 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11240 }
11241
11242 // If we have VPTERNLOG, we can use that as a bit blend.
11243 if (Subtarget.hasVLX())
11244 if (SDValue BitBlend =
11245 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11246 return BitBlend;
11247
11248 // Scale the blend by the number of bytes per element.
11249 int Scale = VT.getScalarSizeInBits() / 8;
11250
11251 // This form of blend is always done on bytes. Compute the byte vector
11252 // type.
11253 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11254
11255 // x86 allows load folding with blendvb from the 2nd source operand. But
11256 // we are still using LLVM select here (see comment below), so that's V1.
11257 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11258 // allow that load-folding possibility.
11259 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11260 ShuffleVectorSDNode::commuteMask(Mask);
11261 std::swap(V1, V2);
11262 }
11263
11264 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11265 // mix of LLVM's code generator and the x86 backend. We tell the code
11266 // generator that boolean values in the elements of an x86 vector register
11267 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11268 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11269 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11270 // of the element (the remaining are ignored) and 0 in that high bit would
11271 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11272 // the LLVM model for boolean values in vector elements gets the relevant
11273 // bit set, it is set backwards and over-constrained relative to x86's
11274 // actual model.
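// Illustrative example (editorial addition, not part of the original source):
// for a v16i16 blend that falls through to this byte-blend path, Scale is 2,
// so each 16-bit element selection is repeated for both of its bytes: lanes
// taken from V1 contribute -1,-1 byte pairs and lanes taken from V2
// contribute 0,0 pairs, following the select semantics described above.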
11275 SmallVector<SDValue, 32> VSELECTMask;
11276 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11277 for (int j = 0; j < Scale; ++j)
11278 VSELECTMask.push_back(
11279 Mask[i] < 0
11280 ? DAG.getUNDEF(MVT::i8)
11281 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11282
11283 V1 = DAG.getBitcast(BlendVT, V1);
11284 V2 = DAG.getBitcast(BlendVT, V2);
11285 return DAG.getBitcast(
11286 VT,
11287 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11288 V1, V2));
11289 }
11290 case MVT::v16f32:
11291 case MVT::v8f64:
11292 case MVT::v8i64:
11293 case MVT::v16i32:
11294 case MVT::v32i16:
11295 case MVT::v64i8: {
11296 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11297 bool OptForSize = DAG.shouldOptForSize();
11298 if (!OptForSize) {
11299 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11300 Subtarget, DAG))
11301 return Masked;
11302 }
11303
11304 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11305 // masked move.
11306 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11307 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11308 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11309 }
11310 default:
11311 llvm_unreachable("Not a supported integer vector type!");
11312 }
11313}
11314
11315/// Try to lower as a blend of elements from two inputs followed by
11316/// a single-input permutation.
11317///
11318/// This matches the pattern where we can blend elements from two inputs and
11319/// then reduce the shuffle to a single-input permutation.
11320 static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11321 SDValue V1, SDValue V2,
11322 ArrayRef<int> Mask,
11323 SelectionDAG &DAG,
11324 bool ImmBlends = false) {
11325 // We build up the blend mask while checking whether a blend is a viable way
11326 // to reduce the shuffle.
11327 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11328 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11329
11330 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11331 if (Mask[i] < 0)
11332 continue;
11333
11334 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11335
11336 if (BlendMask[Mask[i] % Size] < 0)
11337 BlendMask[Mask[i] % Size] = Mask[i];
11338 else if (BlendMask[Mask[i] % Size] != Mask[i])
11339 return SDValue(); // Can't blend in the needed input!
11340
11341 PermuteMask[i] = Mask[i] % Size;
11342 }
11343
11344 // If only immediate blends, then bail if the blend mask can't be widened to
11345 // i16.
11346 unsigned EltSize = VT.getScalarSizeInBits();
11347 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11348 return SDValue();
11349
11350 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11351 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11352}
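// Illustrative example (editorial addition, not part of the original source):
// the v4i32 mask <1,4,3,6> is not itself a blend, but it can be lowered as
// the blend <4,1,6,3> (taking lanes 0 and 2 from V2) followed by the
// single-input permute <1,0,3,2> of the blended result.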
11353
11354/// Try to lower as an unpack of elements from two inputs followed by
11355/// a single-input permutation.
11356///
11357/// This matches the pattern where we can unpack elements from two inputs and
11358/// then reduce the shuffle to a single-input (wider) permutation.
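///
/// For example, a v4i32 shuffle mask <1,5,0,4> can be handled by unpacking
/// V1 and V2 (producing <0,4,1,5>) and then applying the single-input
/// permute <2,3,0,1> to the result.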
11359 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11360 SDValue V1, SDValue V2,
11361 ArrayRef<int> Mask,
11362 SelectionDAG &DAG) {
11363 int NumElts = Mask.size();
11364 int NumLanes = VT.getSizeInBits() / 128;
11365 int NumLaneElts = NumElts / NumLanes;
11366 int NumHalfLaneElts = NumLaneElts / 2;
11367
11368 bool MatchLo = true, MatchHi = true;
11369 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11370
11371 // Determine UNPCKL/UNPCKH type and operand order.
11372 for (int Elt = 0; Elt != NumElts; ++Elt) {
11373 int M = Mask[Elt];
11374 if (M < 0)
11375 continue;
11376
11377 // Normalize the mask value depending on whether it's V1 or V2.
11378 int NormM = M;
11379 SDValue &Op = Ops[Elt & 1];
11380 if (M < NumElts && (Op.isUndef() || Op == V1))
11381 Op = V1;
11382 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11383 Op = V2;
11384 NormM -= NumElts;
11385 } else
11386 return SDValue();
11387
11388 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11389 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11390 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11391 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11392 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11393 if (MatchLoAnyLane || MatchHiAnyLane) {
11394 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11395 "Failed to match UNPCKLO/UNPCKHI");
11396 break;
11397 }
11398 }
11399 MatchLo &= MatchLoAnyLane;
11400 MatchHi &= MatchHiAnyLane;
11401 if (!MatchLo && !MatchHi)
11402 return SDValue();
11403 }
11404 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11405
11406 // Element indices have changed after unpacking. Calculate permute mask
11407 // so that they will be put back to the position as dictated by the
11408 // original shuffle mask indices.
11409 SmallVector<int, 32> PermuteMask(NumElts, -1);
11410 for (int Elt = 0; Elt != NumElts; ++Elt) {
11411 int M = Mask[Elt];
11412 if (M < 0)
11413 continue;
11414 int NormM = M;
11415 if (NumElts <= M)
11416 NormM -= NumElts;
11417 bool IsFirstOp = M < NumElts;
11418 int BaseMaskElt =
11419 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11420 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11421 PermuteMask[Elt] = BaseMaskElt;
11422 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11423 PermuteMask[Elt] = BaseMaskElt + 1;
11424 assert(PermuteMask[Elt] != -1 &&
11425 "Input mask element is defined but failed to assign permute mask");
11426 }
11427
11428 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11429 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11430 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11431}
11432
11433/// Try to lower a shuffle as a permute of the inputs followed by an
11434/// UNPCK instruction.
11435///
11436/// This specifically targets cases where we end up with alternating between
11437/// the two inputs, and so can permute them into something that feeds a single
11438/// UNPCK instruction. Note that this routine only targets integer vectors
11439/// because for floating point vectors we have a generalized SHUFPS lowering
11440/// strategy that handles everything that doesn't *exactly* match an unpack,
11441/// making this clever lowering unnecessary.
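///
/// For example, a v8i16 shuffle mask <0,8,2,10,4,12,6,14> can be handled by
/// shuffling the even elements of each input into its low half and then
/// unpacking the two pre-shuffled inputs with a single PUNPCKLWD.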
11442 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11443 SDValue V1, SDValue V2,
11444 ArrayRef<int> Mask,
11445 const X86Subtarget &Subtarget,
11446 SelectionDAG &DAG) {
11447 int Size = Mask.size();
11448 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11449
11450 // This routine only supports 128-bit integer dual input vectors.
11451 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11452 return SDValue();
11453
11454 int NumLoInputs =
11455 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11456 int NumHiInputs =
11457 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11458
11459 bool UnpackLo = NumLoInputs >= NumHiInputs;
11460
11461 auto TryUnpack = [&](int ScalarSize, int Scale) {
11462 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11463 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11464
11465 for (int i = 0; i < Size; ++i) {
11466 if (Mask[i] < 0)
11467 continue;
11468
11469 // Each element of the unpack contains Scale elements from this mask.
11470 int UnpackIdx = i / Scale;
11471
11472 // We only handle the case where V1 feeds the first slots of the unpack.
11473 // We rely on canonicalization to ensure this is the case.
11474 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11475 return SDValue();
11476
11477 // Setup the mask for this input. The indexing is tricky as we have to
11478 // handle the unpack stride.
11479 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11480 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11481 Mask[i] % Size;
11482 }
11483
11484 // If we will have to shuffle both inputs to use the unpack, check whether
11485 // we can just unpack first and shuffle the result. If so, skip this unpack.
11486 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11487 !isNoopShuffleMask(V2Mask))
11488 return SDValue();
11489
11490 // Shuffle the inputs into place.
11491 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11492 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11493
11494 // Cast the inputs to the type we will use to unpack them.
11495 MVT UnpackVT =
11496 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11497 V1 = DAG.getBitcast(UnpackVT, V1);
11498 V2 = DAG.getBitcast(UnpackVT, V2);
11499
11500 // Unpack the inputs and cast the result back to the desired type.
11501 return DAG.getBitcast(
11502 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11503 UnpackVT, V1, V2));
11504 };
11505
11506 // We try each unpack from the largest to the smallest to try and find one
11507 // that fits this mask.
11508 int OrigScalarSize = VT.getScalarSizeInBits();
11509 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11510 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11511 return Unpack;
11512
11513 // If we're shuffling with a zero vector then we're better off not doing
11514 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
11517 return SDValue();
11518
11519 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11520 // initial unpack.
11521 if (NumLoInputs == 0 || NumHiInputs == 0) {
11522 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11523 "We have to have *some* inputs!");
11524 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11525
11526 // FIXME: We could consider the total complexity of the permute of each
11527 // possible unpacking. Or at the least we should consider how many
11528 // half-crossings are created.
11529 // FIXME: We could consider commuting the unpacks.
11530
11531 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11532 for (int i = 0; i < Size; ++i) {
11533 if (Mask[i] < 0)
11534 continue;
11535
11536 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11537
11538 PermMask[i] =
11539 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11540 }
11541 return DAG.getVectorShuffle(
11542 VT, DL,
11543 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11544 V1, V2),
11545 DAG.getUNDEF(VT), PermMask);
11546 }
11547
11548 return SDValue();
11549}
11550
11551/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11552/// permuting the elements of the result in place.
11553 static SDValue lowerShuffleAsByteRotateAndPermute(
11554 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11555 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11556 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11557 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11558 (VT.is512BitVector() && !Subtarget.hasBWI()))
11559 return SDValue();
11560
11561 // We don't currently support lane crossing permutes.
11562 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11563 return SDValue();
11564
11565 int Scale = VT.getScalarSizeInBits() / 8;
11566 int NumLanes = VT.getSizeInBits() / 128;
11567 int NumElts = VT.getVectorNumElements();
11568 int NumEltsPerLane = NumElts / NumLanes;
11569
11570 // Determine range of mask elts.
11571 bool Blend1 = true;
11572 bool Blend2 = true;
11573 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11574 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11575 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11576 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11577 int M = Mask[Lane + Elt];
11578 if (M < 0)
11579 continue;
11580 if (M < NumElts) {
11581 Blend1 &= (M == (Lane + Elt));
11582 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11583 M = M % NumEltsPerLane;
11584 Range1.first = std::min(Range1.first, M);
11585 Range1.second = std::max(Range1.second, M);
11586 } else {
11587 M -= NumElts;
11588 Blend2 &= (M == (Lane + Elt));
11589 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11590 M = M % NumEltsPerLane;
11591 Range2.first = std::min(Range2.first, M);
11592 Range2.second = std::max(Range2.second, M);
11593 }
11594 }
11595 }
11596
11597 // Bail if we don't need both elements.
11598 // TODO - it might be worth doing this for unary shuffles if the permute
11599 // can be widened.
11600 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11601 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11602 return SDValue();
11603
11604 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11605 return SDValue();
11606
11607 // Rotate the 2 ops so we can access both ranges, then permute the result.
11608 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11609 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11610 SDValue Rotate = DAG.getBitcast(
11611 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11612 DAG.getBitcast(ByteVT, Lo),
11613 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11614 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11615 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11616 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11617 int M = Mask[Lane + Elt];
11618 if (M < 0)
11619 continue;
11620 if (M < NumElts)
11621 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11622 else
11623 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11624 }
11625 }
11626 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11627 };
11628
11629 // Check if the ranges are small enough to rotate from either direction.
11630 if (Range2.second < Range1.first)
11631 return RotateAndPermute(V1, V2, Range1.first, 0);
11632 if (Range1.second < Range2.first)
11633 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11634 return SDValue();
11635}
11636
11637 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11638 return isUndefOrEqual(Mask, 0);
11639}
11640
11642 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11643}
11644
11645/// Check if the Mask consists of the same element repeated multiple times.
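/// For example, <5,5,u,5,5,u,5,5> qualifies, while <5,u,u,u,u,u,u,5> does not
/// because too many of its elements are undef.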
11646 static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11647 size_t NumUndefs = 0;
11648 std::optional<int> UniqueElt;
11649 for (int Elt : Mask) {
11650 if (Elt == SM_SentinelUndef) {
11651 NumUndefs++;
11652 continue;
11653 }
11654 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11655 return false;
11656 UniqueElt = Elt;
11657 }
11658 // Make sure the element is repeated enough times by checking that the number
11659 // of undefs is small.
11660 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11661}
11662
11663/// Generic routine to decompose a shuffle and blend into independent
11664/// blends and permutes.
11665///
11666/// This matches the extremely common pattern for handling combined
11667/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11668/// operations. It will try to pick the best arrangement of shuffles and
11669/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
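///
/// For example, a v4i32 shuffle mask <2,5,0,6> decomposes into the per-input
/// permutes V1:<2,u,0,u> and V2:<u,1,u,2> followed by the blend <0,5,2,7>.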
11670 static SDValue lowerShuffleAsDecomposedShuffleMerge(
11671 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11672 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11673 int NumElts = Mask.size();
11674 int NumLanes = VT.getSizeInBits() / 128;
11675 int NumEltsPerLane = NumElts / NumLanes;
11676
11677 // Shuffle the input elements into the desired positions in V1 and V2 and
11678 // unpack/blend them together.
11679 bool IsAlternating = true;
11680 bool V1Zero = true, V2Zero = true;
11681 SmallVector<int, 32> V1Mask(NumElts, -1);
11682 SmallVector<int, 32> V2Mask(NumElts, -1);
11683 SmallVector<int, 32> FinalMask(NumElts, -1);
11684 for (int i = 0; i < NumElts; ++i) {
11685 int M = Mask[i];
11686 if (M >= 0 && M < NumElts) {
11687 V1Mask[i] = M;
11688 FinalMask[i] = i;
11689 V1Zero &= Zeroable[i];
11690 IsAlternating &= (i & 1) == 0;
11691 } else if (M >= NumElts) {
11692 V2Mask[i] = M - NumElts;
11693 FinalMask[i] = i + NumElts;
11694 V2Zero &= Zeroable[i];
11695 IsAlternating &= (i & 1) == 1;
11696 }
11697 }
11698
11699 // If we effectively only demand the 0'th element of \p Input (though not
11700 // necessarily only in the 0'th position), then broadcast said input
11701 // and change \p InputMask to be a no-op (identity) mask.
11702 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11703 &DAG](SDValue &Input,
11704 MutableArrayRef<int> InputMask) {
11705 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11706 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11707 !X86::mayFoldLoad(Input, Subtarget)))
11708 return;
11709 if (isNoopShuffleMask(InputMask))
11710 return;
11711 assert(isBroadcastShuffleMask(InputMask) &&
11712 "Expected to demand only the 0'th element.");
11713 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11714 for (auto I : enumerate(InputMask)) {
11715 int &InputMaskElt = I.value();
11716 if (InputMaskElt >= 0)
11717 InputMaskElt = I.index();
11718 }
11719 };
11720
11721 // Currently, we may need to produce one shuffle per input, and blend results.
11722 // It is possible that the shuffle for one of the inputs is already a no-op.
11723 // See if we can simplify non-no-op shuffles into broadcasts,
11724 // which we consider to be strictly better than an arbitrary shuffle.
11725 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11726 isNoopOrBroadcastShuffleMask(V2Mask)) {
11727 canonicalizeBroadcastableInput(V1, V1Mask);
11728 canonicalizeBroadcastableInput(V2, V2Mask);
11729 }
11730
11731 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11732 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11733 // the shuffle may be able to fold with a load or other benefit. However, when
11734 // doing so would require 2x as many shuffles, a 2-input
11735 // pre-shuffle first is a better strategy.
11736 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11737 // If we don't have blends, see if we can create a cheap unpack.
11738 if (!Subtarget.hasSSE41() && VT.is128BitVector() &&
11739 (is128BitUnpackShuffleMask(V1Mask, DAG) ||
11740 is128BitUnpackShuffleMask(V2Mask, DAG)))
11741 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11742 DL, VT, V1, V2, Mask, Subtarget, DAG))
11743 return PermUnpack;
11744
11745 // Only prefer immediate blends to unpack/rotate.
11746 if (SDValue BlendPerm =
11747 lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
11748 return BlendPerm;
11749
11750 // If either input vector provides only a single element which is repeated
11751 // multiple times, unpacking from both input vectors would generate worse
11752 // code. e.g. for
11753 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11754 // it is better to process t4 first to create a vector of t4[0], then unpack
11755 // that vector with t2.
11756 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
11757 !isSingleElementRepeatedMask(V2Mask))
11758 if (SDValue UnpackPerm =
11759 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11760 return UnpackPerm;
11761
11762 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11763 DL, VT, V1, V2, Mask, Subtarget, DAG))
11764 return RotatePerm;
11765
11766 // Unpack/rotate failed - try again with variable blends.
11767 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11768 DAG))
11769 return BlendPerm;
11770
11771 if (VT.getScalarSizeInBits() >= 32)
11772 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11773 DL, VT, V1, V2, Mask, Subtarget, DAG))
11774 return PermUnpack;
11775 }
11776
11777 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11778 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11779 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11780 // than half the elements coming from each source.
11781 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11782 V1Mask.assign(NumElts, -1);
11783 V2Mask.assign(NumElts, -1);
11784 FinalMask.assign(NumElts, -1);
11785 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11786 for (int j = 0; j != NumEltsPerLane; ++j) {
11787 int M = Mask[i + j];
11788 if (M >= 0 && M < NumElts) {
11789 V1Mask[i + (j / 2)] = M;
11790 FinalMask[i + j] = i + (j / 2);
11791 } else if (M >= NumElts) {
11792 V2Mask[i + (j / 2)] = M - NumElts;
11793 FinalMask[i + j] = i + (j / 2) + NumElts;
11794 }
11795 }
11796 }
11797
11798 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11799 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11800 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11801}
11802
11803static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11804 const X86Subtarget &Subtarget,
11805 ArrayRef<int> Mask) {
11806 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11807 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11808
11809 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11810 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11811 int MaxSubElts = 64 / EltSizeInBits;
11812 unsigned RotateAmt, NumSubElts;
11813 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11814 MaxSubElts, NumSubElts, RotateAmt))
11815 return -1;
11816 unsigned NumElts = Mask.size();
11817 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11818 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11819 return RotateAmt;
11820}
11821
11822/// Lower shuffle using X86ISD::VROTLI rotations.
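/// For example, the v16i8 mask <1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14> swaps
/// adjacent bytes, i.e. it rotates each 16-bit group by 8 bits.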
11823 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11824 ArrayRef<int> Mask,
11825 const X86Subtarget &Subtarget,
11826 SelectionDAG &DAG) {
11827 // Only XOP + AVX512 targets have bit rotation instructions.
11828 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11829 bool IsLegal =
11830 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11831 if (!IsLegal && Subtarget.hasSSE3())
11832 return SDValue();
11833
11834 MVT RotateVT;
11835 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11836 Subtarget, Mask);
11837 if (RotateAmt < 0)
11838 return SDValue();
11839
11840 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11841 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11842 // widen to vXi16 or more then the existing lowering will be better.
11843 if (!IsLegal) {
11844 if ((RotateAmt % 16) == 0)
11845 return SDValue();
11846 // TODO: Use getTargetVShiftByConstNode.
11847 unsigned ShlAmt = RotateAmt;
11848 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11849 V1 = DAG.getBitcast(RotateVT, V1);
11850 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11851 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11852 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11853 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11854 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11855 return DAG.getBitcast(VT, Rot);
11856 }
11857
11858 SDValue Rot =
11859 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11860 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11861 return DAG.getBitcast(VT, Rot);
11862}
11863
11864/// Try to match a vector shuffle as an element rotation.
11865///
11866 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11867 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11868 ArrayRef<int> Mask) {
11869 int NumElts = Mask.size();
11870
11871 // We need to detect various ways of spelling a rotation:
11872 // [11, 12, 13, 14, 15, 0, 1, 2]
11873 // [-1, 12, 13, 14, -1, -1, 1, -1]
11874 // [-1, -1, -1, -1, -1, -1, 1, 2]
11875 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11876 // [-1, 4, 5, 6, -1, -1, 9, -1]
11877 // [-1, 4, 5, 6, -1, -1, -1, -1]
11878 int Rotation = 0;
11879 SDValue Lo, Hi;
11880 for (int i = 0; i < NumElts; ++i) {
11881 int M = Mask[i];
11882 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11883 "Unexpected mask index.");
11884 if (M < 0)
11885 continue;
11886
11887 // Determine where a rotated vector would have started.
11888 int StartIdx = i - (M % NumElts);
11889 if (StartIdx == 0)
11890 // The identity rotation isn't interesting, stop.
11891 return -1;
11892
11893 // If we found the tail of a vector the rotation must be the missing
11894 // front. If we found the head of a vector, it must be how much of the
11895 // head.
11896 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11897
11898 if (Rotation == 0)
11899 Rotation = CandidateRotation;
11900 else if (Rotation != CandidateRotation)
11901 // The rotations don't match, so we can't match this mask.
11902 return -1;
11903
11904 // Compute which value this mask is pointing at.
11905 SDValue MaskV = M < NumElts ? V1 : V2;
11906
11907 // Compute which of the two target values this index should be assigned
11908 // to. This reflects whether the high elements are remaining or the low
11909 // elements are remaining.
11910 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11911
11912 // Either set up this value if we've not encountered it before, or check
11913 // that it remains consistent.
11914 if (!TargetV)
11915 TargetV = MaskV;
11916 else if (TargetV != MaskV)
11917 // This may be a rotation, but it pulls from the inputs in some
11918 // unsupported interleaving.
11919 return -1;
11920 }
11921
11922 // Check that we successfully analyzed the mask, and normalize the results.
11923 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11924 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11925 if (!Lo)
11926 Lo = Hi;
11927 else if (!Hi)
11928 Hi = Lo;
11929
11930 V1 = Lo;
11931 V2 = Hi;
11932
11933 return Rotation;
11934}
11935
11936/// Try to lower a vector shuffle as a byte rotation.
11937///
11938/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11939/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11940/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11941 /// try to generically lower a vector shuffle through such a pattern. It
11942/// does not check for the profitability of lowering either as PALIGNR or
11943/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11944/// This matches shuffle vectors that look like:
11945///
11946/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11947///
11948/// Essentially it concatenates V1 and V2, shifts right by some number of
11949/// elements, and takes the low elements as the result. Note that while this is
11950/// specified as a *right shift* because x86 is little-endian, it is a *left
11951/// rotate* of the vector lanes.
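///
/// For example, the v8i16 mask <11,12,13,14,15,0,1,2> above is an element
/// rotation of 3, which is scaled by the 2-byte element size to a PALIGNR
/// byte immediate of 6.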
11952 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11953 ArrayRef<int> Mask) {
11954 // Don't accept any shuffles with zero elements.
11955 if (isAnyZero(Mask))
11956 return -1;
11957
11958 // PALIGNR works on 128-bit lanes.
11959 SmallVector<int, 16> RepeatedMask;
11960 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11961 return -1;
11962
11963 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11964 if (Rotation <= 0)
11965 return -1;
11966
11967 // PALIGNR rotates bytes, so we need to scale the
11968 // rotation based on how many bytes are in the vector lane.
11969 int NumElts = RepeatedMask.size();
11970 int Scale = 16 / NumElts;
11971 return Rotation * Scale;
11972}
11973
11974 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11975 SDValue V2, ArrayRef<int> Mask,
11976 const X86Subtarget &Subtarget,
11977 SelectionDAG &DAG) {
11978 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11979
11980 SDValue Lo = V1, Hi = V2;
11981 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11982 if (ByteRotation <= 0)
11983 return SDValue();
11984
11985 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11986 // PSLLDQ/PSRLDQ.
11987 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11988 Lo = DAG.getBitcast(ByteVT, Lo);
11989 Hi = DAG.getBitcast(ByteVT, Hi);
11990
11991 // SSSE3 targets can use the palignr instruction.
11992 if (Subtarget.hasSSSE3()) {
11993 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11994 "512-bit PALIGNR requires BWI instructions");
11995 return DAG.getBitcast(
11996 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11997 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11998 }
11999
12000 assert(VT.is128BitVector() &&
12001 "Rotate-based lowering only supports 128-bit lowering!");
12002 assert(Mask.size() <= 16 &&
12003 "Can shuffle at most 16 bytes in a 128-bit vector!");
12004 assert(ByteVT == MVT::v16i8 &&
12005 "SSE2 rotate lowering only needed for v16i8!");
12006
12007 // Default SSE2 implementation
12008 int LoByteShift = 16 - ByteRotation;
12009 int HiByteShift = ByteRotation;
12010
12011 SDValue LoShift =
12012 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12013 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12014 SDValue HiShift =
12015 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12016 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12017 return DAG.getBitcast(VT,
12018 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12019}
12020
12021/// Try to lower a vector shuffle as a dword/qword rotation.
12022///
12023 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
12024 /// rotation of the concatenation of two vectors; this routine will
12025 /// try to generically lower a vector shuffle through such a pattern.
12026///
12027/// Essentially it concatenates V1 and V2, shifts right by some number of
12028/// elements, and takes the low elements as the result. Note that while this is
12029/// specified as a *right shift* because x86 is little-endian, it is a *left
12030/// rotate* of the vector lanes.
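///
/// For example, the v8i32 mask <3,4,5,6,7,8,9,10> matches an element rotation
/// of 3 and can be lowered to a single VALIGND with an immediate of 3.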
12031 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12032 SDValue V2, ArrayRef<int> Mask,
12033 const APInt &Zeroable,
12034 const X86Subtarget &Subtarget,
12035 SelectionDAG &DAG) {
12036 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12037 "Only 32-bit and 64-bit elements are supported!");
12038
12039 // 128/256-bit vectors are only supported with VLX.
12040 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12041 && "VLX required for 128/256-bit vectors");
12042
12043 SDValue Lo = V1, Hi = V2;
12044 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12045 if (0 < Rotation)
12046 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12047 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12048
12049 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
12050 // TODO: Pull this out as a matchShuffleAsElementShift helper?
12051 // TODO: We can probably make this more aggressive and use shift-pairs like
12052 // lowerShuffleAsByteShiftMask.
12053 unsigned NumElts = Mask.size();
12054 unsigned ZeroLo = Zeroable.countr_one();
12055 unsigned ZeroHi = Zeroable.countl_one();
12056 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
12057 if (!ZeroLo && !ZeroHi)
12058 return SDValue();
12059
12060 if (ZeroLo) {
12061 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12062 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
12063 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
12064 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
12065 getZeroVector(VT, Subtarget, DAG, DL),
12066 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
12067 }
12068
12069 if (ZeroHi) {
12070 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
12071 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
12072 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
12073 return DAG.getNode(X86ISD::VALIGN, DL, VT,
12074 getZeroVector(VT, Subtarget, DAG, DL), Src,
12075 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
12076 }
12077
12078 return SDValue();
12079}
12080
12081/// Try to lower a vector shuffle as a byte shift sequence.
12082 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12083 SDValue V2, ArrayRef<int> Mask,
12084 const APInt &Zeroable,
12085 const X86Subtarget &Subtarget,
12086 SelectionDAG &DAG) {
12087 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12088 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12089
12090 // We need a shuffle that has zeros at one/both ends and a sequential
12091 // shuffle from one source within.
12092 unsigned ZeroLo = Zeroable.countr_one();
12093 unsigned ZeroHi = Zeroable.countl_one();
12094 if (!ZeroLo && !ZeroHi)
12095 return SDValue();
12096
12097 unsigned NumElts = Mask.size();
12098 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12099 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12100 return SDValue();
12101
12102 unsigned Scale = VT.getScalarSizeInBits() / 8;
12103 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12104 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12105 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12106 return SDValue();
12107
12108 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12109 Res = DAG.getBitcast(MVT::v16i8, Res);
12110
12111 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12112 // inner sequential set of elements, possibly offset:
12113 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12114 // 01234567 --> 4567zzzz --> zzzzz456
12115 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12116 if (ZeroLo == 0) {
12117 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12118 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12119 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12120 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12121 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12122 } else if (ZeroHi == 0) {
12123 unsigned Shift = Mask[ZeroLo] % NumElts;
12124 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12125 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12126 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12127 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12128 } else if (!Subtarget.hasSSSE3()) {
12129 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12130 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12131 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12132 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12133 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12134 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12135 Shift += Mask[ZeroLo] % NumElts;
12136 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12137 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12138 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12139 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12140 } else
12141 return SDValue();
12142
12143 return DAG.getBitcast(VT, Res);
12144}
12145
12146/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12147///
12148/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12149/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12150/// matches elements from one of the input vectors shuffled to the left or
12151/// right with zeroable elements 'shifted in'. It handles both the strictly
12152/// bit-wise element shifts and the byte shift across an entire 128-bit double
12153/// quad word lane.
12154///
12155/// PSHL : (little-endian) left bit shift.
12156/// [ zz, 0, zz, 2 ]
12157/// [ -1, 4, zz, -1 ]
12158/// PSRL : (little-endian) right bit shift.
12159/// [ 1, zz, 3, zz]
12160/// [ -1, -1, 7, zz]
12161/// PSLLDQ : (little-endian) left byte shift
12162/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12163/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12164/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12165/// PSRLDQ : (little-endian) right byte shift
12166/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12167/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12168/// [ 1, 2, -1, -1, -1, -1, zz, zz]
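///
/// For example, the v4i32 PSHL mask [ zz, 0, zz, 2 ] above widens to v2i64
/// and matches a VSHLI by 32 bits, since every shifted-in element is zeroable.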
12169static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12170 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12171 int MaskOffset, const APInt &Zeroable,
12172 const X86Subtarget &Subtarget) {
12173 int Size = Mask.size();
12174 unsigned SizeInBits = Size * ScalarSizeInBits;
12175
12176 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12177 for (int i = 0; i < Size; i += Scale)
12178 for (int j = 0; j < Shift; ++j)
12179 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12180 return false;
12181
12182 return true;
12183 };
12184
12185 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12186 for (int i = 0; i != Size; i += Scale) {
12187 unsigned Pos = Left ? i + Shift : i;
12188 unsigned Low = Left ? i : i + Shift;
12189 unsigned Len = Scale - Shift;
12190 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12191 return -1;
12192 }
12193
12194 int ShiftEltBits = ScalarSizeInBits * Scale;
12195 bool ByteShift = ShiftEltBits > 64;
12196 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12197 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12198 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12199
12200 // Normalize the scale for byte shifts to still produce an i64 element
12201 // type.
12202 Scale = ByteShift ? Scale / 2 : Scale;
12203
12204 // We need to round trip through the appropriate type for the shift.
12205 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12206 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12207 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12208 return (int)ShiftAmt;
12209 };
12210
12211 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12212 // keep doubling the size of the integer elements up to that. We can
12213 // then shift the elements of the integer vector by whole multiples of
12214 // their width within the elements of the larger integer vector. Test each
12215 // multiple to see if we can find a match with the moved element indices
12216 // and that the shifted in elements are all zeroable.
12217 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12218 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12219 for (int Shift = 1; Shift != Scale; ++Shift)
12220 for (bool Left : {true, false})
12221 if (CheckZeros(Shift, Scale, Left)) {
12222 int ShiftAmt = MatchShift(Shift, Scale, Left);
12223 if (0 < ShiftAmt)
12224 return ShiftAmt;
12225 }
12226
12227 // no match
12228 return -1;
12229}
12230
12231 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12232 SDValue V2, ArrayRef<int> Mask,
12233 const APInt &Zeroable,
12234 const X86Subtarget &Subtarget,
12235 SelectionDAG &DAG, bool BitwiseOnly) {
12236 int Size = Mask.size();
12237 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12238
12239 MVT ShiftVT;
12240 SDValue V = V1;
12241 unsigned Opcode;
12242
12243 // Try to match shuffle against V1 shift.
12244 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12245 Mask, 0, Zeroable, Subtarget);
12246
12247 // If V1 failed, try to match shuffle against V2 shift.
12248 if (ShiftAmt < 0) {
12249 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12250 Mask, Size, Zeroable, Subtarget);
12251 V = V2;
12252 }
12253
12254 if (ShiftAmt < 0)
12255 return SDValue();
12256
12257 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
12258 return SDValue();
12259
12260 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12261 "Illegal integer vector type");
12262 V = DAG.getBitcast(ShiftVT, V);
12263 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12264 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12265 return DAG.getBitcast(VT, V);
12266}
12267
12268// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12269// Remainder of lower half result is zero and upper half is all undef.
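// For example, a v8i16 shuffle with lower-half mask <2,3,zz,zz> and an undef
// upper half matches Len=2 elements starting at Idx=2, i.e. BitLen=32 and
// BitIdx=32.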
12270static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12271 ArrayRef<int> Mask, uint64_t &BitLen,
12272 uint64_t &BitIdx, const APInt &Zeroable) {
12273 int Size = Mask.size();
12274 int HalfSize = Size / 2;
12275 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12276 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12277
12278 // Upper half must be undefined.
12279 if (!isUndefUpperHalf(Mask))
12280 return false;
12281
12282 // Determine the extraction length from the part of the
12283 // lower half that isn't zeroable.
12284 int Len = HalfSize;
12285 for (; Len > 0; --Len)
12286 if (!Zeroable[Len - 1])
12287 break;
12288 assert(Len > 0 && "Zeroable shuffle mask");
12289
12290 // Attempt to match first Len sequential elements from the lower half.
12291 SDValue Src;
12292 int Idx = -1;
12293 for (int i = 0; i != Len; ++i) {
12294 int M = Mask[i];
12295 if (M == SM_SentinelUndef)
12296 continue;
12297 SDValue &V = (M < Size ? V1 : V2);
12298 M = M % Size;
12299
12300 // The extracted elements must start at a valid index and all mask
12301 // elements must be in the lower half.
12302 if (i > M || M >= HalfSize)
12303 return false;
12304
12305 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12306 Src = V;
12307 Idx = M - i;
12308 continue;
12309 }
12310 return false;
12311 }
12312
12313 if (!Src || Idx < 0)
12314 return false;
12315
12316 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12317 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12318 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12319 V1 = Src;
12320 return true;
12321}
12322
12323// INSERTQ: Extract lowest Len elements from lower half of second source and
12324// insert over first source, starting at Idx.
12325// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
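// For example, the v8i16 mask <0,8,9,3,u,u,u,u> matches with Idx=1 and Len=2,
// i.e. BitIdx=16 and BitLen=32.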
12326static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12327 ArrayRef<int> Mask, uint64_t &BitLen,
12328 uint64_t &BitIdx) {
12329 int Size = Mask.size();
12330 int HalfSize = Size / 2;
12331 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12332
12333 // Upper half must be undefined.
12334 if (!isUndefUpperHalf(Mask))
12335 return false;
12336
12337 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12338 SDValue Base;
12339
12340 // Attempt to match first source from mask before insertion point.
12341 if (isUndefInRange(Mask, 0, Idx)) {
12342 /* EMPTY */
12343 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12344 Base = V1;
12345 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12346 Base = V2;
12347 } else {
12348 continue;
12349 }
12350
12351 // Extend the extraction length looking to match both the insertion of
12352 // the second source and the remaining elements of the first.
12353 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12354 SDValue Insert;
12355 int Len = Hi - Idx;
12356
12357 // Match insertion.
12358 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12359 Insert = V1;
12360 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12361 Insert = V2;
12362 } else {
12363 continue;
12364 }
12365
12366 // Match the remaining elements of the lower half.
12367 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12368 /* EMPTY */
12369 } else if ((!Base || (Base == V1)) &&
12370 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12371 Base = V1;
12372 } else if ((!Base || (Base == V2)) &&
12373 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12374 Size + Hi)) {
12375 Base = V2;
12376 } else {
12377 continue;
12378 }
12379
12380 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12381 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12382 V1 = Base;
12383 V2 = Insert;
12384 return true;
12385 }
12386 }
12387
12388 return false;
12389}
12390
12391/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12392 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12393 SDValue V2, ArrayRef<int> Mask,
12394 const APInt &Zeroable, SelectionDAG &DAG) {
12395 uint64_t BitLen, BitIdx;
12396 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12397 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12398 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12399 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12400
12401 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12402 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12403 V2 ? V2 : DAG.getUNDEF(VT),
12404 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12405 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12406
12407 return SDValue();
12408}
12409
12410/// Lower a vector shuffle as an any/signed/zero extension.
12411///
12412/// Given a specific number of elements, element bit width, and extension
12413 /// stride, produce an extension using the available features of the
12414 /// subtarget. The extended elements are consecutive and can start from an
12415 /// offsetted element index in the input; to avoid excess shuffling, the
12416 /// offset must either be in the bottom lane or at the start of a higher
12417 /// lane. All extended elements must come from
12418/// the same lane.
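///
/// For example, with Scale == 2 on a v16i8 input, the low 8 bytes are
/// zero-extended to 16-bit elements with PMOVZXBW when SSE4.1 is available;
/// without SSE4.1 the same extension is formed by unpacking the input with a
/// zero vector.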
12419 static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT,
12420 int Scale, int Offset,
12421 unsigned ExtOpc, SDValue InputV,
12422 ArrayRef<int> Mask,
12423 const X86Subtarget &Subtarget,
12424 SelectionDAG &DAG) {
12425 assert(Scale > 1 && "Need a scale to extend.");
12426 assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
12427 int EltBits = VT.getScalarSizeInBits();
12428 int NumElements = VT.getVectorNumElements();
12429 int NumEltsPerLane = 128 / EltBits;
12430 int OffsetLane = Offset / NumEltsPerLane;
12431 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12432 "Only 8, 16, and 32 bit elements can be extended.");
12433 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12434 assert(0 <= Offset && "Extension offset must be positive.");
12435 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12436 "Extension offset must be in the first lane or start an upper lane.");
12437
12438 // Check that an index is in same lane as the base offset.
12439 auto SafeOffset = [&](int Idx) {
12440 return OffsetLane == (Idx / NumEltsPerLane);
12441 };
12442
12443 // Shift along an input so that the offset base moves to the first element.
12444 auto ShuffleOffset = [&](SDValue V) {
12445 if (!Offset)
12446 return V;
12447
12448 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12449 for (int i = 0; i * Scale < NumElements; ++i) {
12450 int SrcIdx = i + Offset;
12451 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12452 }
12453 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12454 };
12455
12456 // Found a valid a/zext mask! Try various lowering strategies based on the
12457 // input type and available ISA extensions.
12458 if (Subtarget.hasSSE41()) {
12459 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12460 // PUNPCK will catch this in a later shuffle match.
12461 if (Offset && Scale == 2 && VT.is128BitVector())
12462 return SDValue();
12463 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12464 NumElements / Scale);
12465 InputV = DAG.getBitcast(VT, InputV);
12466 InputV = ShuffleOffset(InputV);
12467 InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
12468 return DAG.getBitcast(VT, InputV);
12469 }
12470
12471 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12472 InputV = DAG.getBitcast(VT, InputV);
12473 bool AnyExt = ExtOpc == ISD::ANY_EXTEND;
12474
12475 // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
12476 if (ExtOpc == ISD::SIGN_EXTEND)
12477 return SDValue();
12478
12479 // For any extends we can cheat for larger element sizes and use shuffle
12480 // instructions that can fold with a load and/or copy.
12481 if (AnyExt && EltBits == 32) {
12482 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12483 -1};
12484 return DAG.getBitcast(
12485 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12486 DAG.getBitcast(MVT::v4i32, InputV),
12487 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12488 }
12489 if (AnyExt && EltBits == 16 && Scale > 2) {
12490 int PSHUFDMask[4] = {Offset / 2, -1,
12491 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12492 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12493 DAG.getBitcast(MVT::v4i32, InputV),
12494 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12495 int PSHUFWMask[4] = {1, -1, -1, -1};
12496 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12497 return DAG.getBitcast(
12498 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12499 DAG.getBitcast(MVT::v8i16, InputV),
12500 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12501 }
12502
12503 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12504 // to 64-bits.
12505 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12506 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12507 assert(VT.is128BitVector() && "Unexpected vector width!");
12508
12509 int LoIdx = Offset * EltBits;
12510 SDValue Lo = DAG.getBitcast(
12511 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12512 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12513 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12514
12515 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12516 return DAG.getBitcast(VT, Lo);
12517
12518 int HiIdx = (Offset + 1) * EltBits;
12519 SDValue Hi = DAG.getBitcast(
12520 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12521 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12522 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12523 return DAG.getBitcast(VT,
12524 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12525 }
12526
12527 // If this would require more than 2 unpack instructions to expand, use
12528 // pshufb when available. We can only use more than 2 unpack instructions
12529 // when zero extending i8 elements which also makes it easier to use pshufb.
12530 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12531 assert(NumElements == 16 && "Unexpected byte vector width!");
12532 SDValue PSHUFBMask[16];
12533 for (int i = 0; i < 16; ++i) {
12534 int Idx = Offset + (i / Scale);
12535 if ((i % Scale == 0 && SafeOffset(Idx))) {
12536 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12537 continue;
12538 }
12539 PSHUFBMask[i] =
12540 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12541 }
12542 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12543 return DAG.getBitcast(
12544 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12545 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12546 }
12547
12548 // If we are extending from an offset, ensure we start on a boundary that
12549 // we can unpack from.
12550 int AlignToUnpack = Offset % (NumElements / Scale);
12551 if (AlignToUnpack) {
12552 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12553 for (int i = AlignToUnpack; i < NumElements; ++i)
12554 ShMask[i - AlignToUnpack] = i;
12555 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12556 Offset -= AlignToUnpack;
12557 }
12558
12559 // Otherwise emit a sequence of unpacks.
12560 do {
12561 unsigned UnpackLoHi = X86ISD::UNPCKL;
12562 if (Offset >= (NumElements / 2)) {
12563 UnpackLoHi = X86ISD::UNPCKH;
12564 Offset -= (NumElements / 2);
12565 }
12566
12567 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12568 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12569 : getZeroVector(InputVT, Subtarget, DAG, DL);
12570 InputV = DAG.getBitcast(InputVT, InputV);
12571 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12572 Scale /= 2;
12573 EltBits *= 2;
12574 NumElements /= 2;
12575 } while (Scale > 1);
12576 return DAG.getBitcast(VT, InputV);
12577}
12578
12579/// Try to lower a vector shuffle as a zero extension on any microarch.
12580///
12581/// This routine will try to do everything in its power to cleverly lower
12582/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12583/// check for the profitability of this lowering, it tries to aggressively
12584/// match this pattern. It will use all of the micro-architectural details it
12585/// can to emit an efficient lowering. It handles both blends with all-zero
12586/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12587/// masking out later).
12588///
12589/// The reason we have dedicated lowering for zext-style shuffles is that they
12590/// are both incredibly common and often quite performance sensitive.
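///
/// For example, the v16i8 mask <0,zz,zz,zz,1,zz,zz,zz,2,zz,zz,zz,3,zz,zz,zz>
/// is a zero extension of the low four bytes of V1 to v4i32.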
12591 static SDValue lowerShuffleAsZeroOrAnyExtend(
12592 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12593 const APInt &Zeroable, const X86Subtarget &Subtarget,
12594 SelectionDAG &DAG) {
12595 int Bits = VT.getSizeInBits();
12596 int NumLanes = Bits / 128;
12597 int NumElements = VT.getVectorNumElements();
12598 int NumEltsPerLane = NumElements / NumLanes;
12599 assert(VT.getScalarSizeInBits() <= 32 &&
12600 "Exceeds 32-bit integer zero extension limit");
12601 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12602
12603 // Define a helper function to check a particular ext-scale and lower to it if
12604 // valid.
12605 auto Lower = [&](int Scale) -> SDValue {
12606 SDValue InputV;
12607 bool AnyExt = true;
12608 int Offset = 0;
12609 int Matches = 0;
12610 for (int i = 0; i < NumElements; ++i) {
12611 int M = Mask[i];
12612 if (M < 0)
12613 continue; // Valid anywhere but doesn't tell us anything.
12614 if (i % Scale != 0) {
12615 // Each of the extended elements needs to be zeroable.
12616 if (!Zeroable[i])
12617 return SDValue();
12618
12619 // We no longer are in the anyext case.
12620 AnyExt = false;
12621 continue;
12622 }
12623
12624 // The base elements need to be consecutive indices into the
12625 // same input vector.
12626 SDValue V = M < NumElements ? V1 : V2;
12627 M = M % NumElements;
12628 if (!InputV) {
12629 InputV = V;
12630 Offset = M - (i / Scale);
12631 } else if (InputV != V)
12632 return SDValue(); // Flip-flopping inputs.
12633
12634 // Offset must start in the lowest 128-bit lane or at the start of an
12635 // upper lane.
12636 // FIXME: Is it ever worth allowing a negative base offset?
12637 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12638 (Offset % NumEltsPerLane) == 0))
12639 return SDValue();
12640
12641 // If we are offsetting, all referenced entries must come from the same
12642 // lane.
12643 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12644 return SDValue();
12645
12646 if ((M % NumElements) != (Offset + (i / Scale)))
12647 return SDValue(); // Non-consecutive strided elements.
12648 Matches++;
12649 }
12650
12651 // If we fail to find an input, we have a zero-shuffle which should always
12652 // have already been handled.
12653 // FIXME: Maybe handle this here in case during blending we end up with one?
12654 if (!InputV)
12655 return SDValue();
12656
12657 // If we are offsetting, don't extend if we only match a single input, we
12658 // can always do better by using a basic PSHUF or PUNPCK.
12659 if (Offset != 0 && Matches < 2)
12660 return SDValue();
12661
12662 unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
12663 return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
12664 InputV, Mask, Subtarget, DAG);
12665 };
12666
12667 // The widest scale possible for extending is to a 64-bit integer.
12668 assert(Bits % 64 == 0 &&
12669 "The number of bits in a vector must be divisible by 64 on x86!");
12670 int NumExtElements = Bits / 64;
12671
12672 // Each iteration, try extending the elements half as much, but into twice as
12673 // many elements.
12674 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12675 assert(NumElements % NumExtElements == 0 &&
12676 "The input vector size must be divisible by the extended size.");
12677 if (SDValue V = Lower(NumElements / NumExtElements))
12678 return V;
12679 }
12680
12681 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12682 if (Bits != 128)
12683 return SDValue();
12684
12685 // Returns one of the source operands if the shuffle can be reduced to a
12686 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12687 auto CanZExtLowHalf = [&]() {
12688 for (int i = NumElements / 2; i != NumElements; ++i)
12689 if (!Zeroable[i])
12690 return SDValue();
12691 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12692 return V1;
12693 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12694 return V2;
12695 return SDValue();
12696 };
12697
12698 if (SDValue V = CanZExtLowHalf()) {
12699 V = DAG.getBitcast(MVT::v2i64, V);
12700 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12701 return DAG.getBitcast(VT, V);
12702 }
12703
12704 // No viable ext lowering found.
12705 return SDValue();
12706}
12707
12708/// Try to get a scalar value for a specific element of a vector.
12709///
12710/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12711 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12712 SelectionDAG &DAG) {
12713 MVT VT = V.getSimpleValueType();
12714 MVT EltVT = VT.getVectorElementType();
12715 V = peekThroughBitcasts(V);
12716
12717 // If the bitcasts shift the element size, we can't extract an equivalent
12718 // element from it.
12719 MVT NewVT = V.getSimpleValueType();
12720 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12721 return SDValue();
12722
12723 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12724 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12725 // Ensure the scalar operand is the same size as the destination.
12726 // FIXME: Add support for scalar truncation where possible.
12727 SDValue S = V.getOperand(Idx);
12728 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12729 return DAG.getBitcast(EltVT, S);
12730 }
12731
12732 return SDValue();
12733}
12734
12735/// Helper to test for a load that can be folded with x86 shuffles.
12736///
12737/// This is particularly important because the set of instructions varies
12738/// significantly based on whether the operand is a load or not.
12739 static bool isShuffleFoldableLoad(SDValue V) {
12740 return V.hasOneUse() &&
12741 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
12742 }
12743
12744template<typename T>
12745static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12746 T EltVT = VT.getScalarType();
12747 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12748 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12749}
12750
12751/// Try to lower insertion of a single element into a zero vector.
12752///
12753 /// This is a common pattern for which we have especially efficient lowerings
12754 /// across all subtarget feature sets.
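///
/// For example, a v4f32 shuffle mask <4,1,2,3> inserts the low element of V2
/// into V1 and can be lowered to a single MOVSS; if V1 is zeroable the
/// insertion can instead use a zero-extending scalar move (VZEXT_MOVL).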
12755 static SDValue lowerShuffleAsElementInsertion(
12756 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12757 const APInt &Zeroable, const X86Subtarget &Subtarget,
12758 SelectionDAG &DAG) {
12759 MVT ExtVT = VT;
12760 MVT EltVT = VT.getVectorElementType();
12761 unsigned NumElts = VT.getVectorNumElements();
12762 unsigned EltBits = VT.getScalarSizeInBits();
12763
12764 if (isSoftF16(EltVT, Subtarget))
12765 return SDValue();
12766
12767 int V2Index =
12768 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12769 Mask.begin();
12770 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12771 bool IsV1Zeroable = true;
12772 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12773 if (i != V2Index && !Zeroable[i]) {
12774 IsV1Zeroable = false;
12775 break;
12776 }
12777
12778 // Bail if a non-zero V1 isn't used in place.
12779 if (!IsV1Zeroable) {
12780 SmallVector<int, 8> V1Mask(Mask);
12781 V1Mask[V2Index] = -1;
12782 if (!isNoopShuffleMask(V1Mask))
12783 return SDValue();
12784 }
12785
12786 // Check for a single input from a SCALAR_TO_VECTOR node.
12787 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12788 // all the smarts here sunk into that routine. However, the current
12789 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12790 // vector shuffle lowering is dead.
12791 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12792 DAG);
12793 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12794 // We need to zext the scalar if it is smaller than an i32.
12795 V2S = DAG.getBitcast(EltVT, V2S);
12796 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12797 // Using zext to expand a narrow element won't work for non-zero
12798 // insertions. But we can use a masked constant vector if we're
12799 // inserting V2 into the bottom of V1.
12800 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12801 return SDValue();
12802
12803 // Zero-extend directly to i32.
12804 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12805 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12806
12807 // If we're inserting into a constant, mask off the inserted index
12808 // and OR with the zero-extended scalar.
12809 if (!IsV1Zeroable) {
12810 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12811 Bits[V2Index] = APInt::getZero(EltBits);
12812 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12813 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12814 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12815 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12816 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12817 }
12818 }
12819 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12820 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12821 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12822 // Either not inserting from the low element of the input or the input
12823 // element size is too small to use VZEXT_MOVL to clear the high bits.
12824 return SDValue();
12825 }
12826
12827 if (!IsV1Zeroable) {
12828 // If V1 can't be treated as a zero vector we have fewer options to lower
12829 // this. We can't support integer vectors or non-zero targets cheaply.
12830 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12831 if (!VT.isFloatingPoint() || V2Index != 0)
12832 return SDValue();
12833 if (!VT.is128BitVector())
12834 return SDValue();
12835
12836 // Otherwise, use MOVSD, MOVSS or MOVSH.
12837 unsigned MovOpc = 0;
12838 if (EltVT == MVT::f16)
12839 MovOpc = X86ISD::MOVSH;
12840 else if (EltVT == MVT::f32)
12841 MovOpc = X86ISD::MOVSS;
12842 else if (EltVT == MVT::f64)
12843 MovOpc = X86ISD::MOVSD;
12844 else
12845 llvm_unreachable("Unsupported floating point element type to handle!");
12846 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12847 }
12848
12849 // This lowering only works for the low element with floating point vectors.
12850 if (VT.isFloatingPoint() && V2Index != 0)
12851 return SDValue();
12852
12853 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12854 if (ExtVT != VT)
12855 V2 = DAG.getBitcast(VT, V2);
12856
12857 if (V2Index != 0) {
12858 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12859 // the desired position. Otherwise it is more efficient to do a vector
12860 // shift left. We know that we can do a vector shift left because all
12861 // the inputs are zero.
12862 if (VT.isFloatingPoint() || NumElts <= 4) {
12863 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12864 V2Shuffle[V2Index] = 0;
12865 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12866 } else {
12867 V2 = DAG.getBitcast(MVT::v16i8, V2);
12868 V2 = DAG.getNode(
12869 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12870 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12871 V2 = DAG.getBitcast(VT, V2);
12872 }
12873 }
12874 return V2;
12875}
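// Illustrative sketch (hypothetical helper, not referenced elsewhere in this
// file): the byte-shift amount used for the X86ISD::VSHLDQ fallback above is
// just the destination lane index scaled to bytes. For example, inserting a
// 32-bit element into lane 3 of a 128-bit vector shifts left by
// 3 * 32 / 8 == 12 bytes.
static unsigned byteShiftForInsertSketch(unsigned V2Index, unsigned EltBits) {
  return V2Index * EltBits / 8;
}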
12876
12877/// Try to lower broadcast of a single - truncated - integer element,
12878/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12879///
12880/// This assumes we have AVX2.
12881 static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12882 int BroadcastIdx,
12883 const X86Subtarget &Subtarget,
12884 SelectionDAG &DAG) {
12885 assert(Subtarget.hasAVX2() &&
12886 "We can only lower integer broadcasts with AVX2!");
12887
12888 MVT EltVT = VT.getVectorElementType();
12889 MVT V0VT = V0.getSimpleValueType();
12890
12891 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12892 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12893
12894 MVT V0EltVT = V0VT.getVectorElementType();
12895 if (!V0EltVT.isInteger())
12896 return SDValue();
12897
12898 const unsigned EltSize = EltVT.getSizeInBits();
12899 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12900
12901 // This is only a truncation if the original element type is larger.
12902 if (V0EltSize <= EltSize)
12903 return SDValue();
12904
12905 assert(((V0EltSize % EltSize) == 0) &&
12906 "Scalar type sizes must all be powers of 2 on x86!");
12907
12908 const unsigned V0Opc = V0.getOpcode();
12909 const unsigned Scale = V0EltSize / EltSize;
12910 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12911
12912 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12913 V0Opc != ISD::BUILD_VECTOR)
12914 return SDValue();
12915
12916 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12917
12918 // If we're extracting non-least-significant bits, shift so we can truncate.
12919 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12920 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12921 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12922 if (const int OffsetIdx = BroadcastIdx % Scale)
12923 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12924 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12925
12926 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12927 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12928}
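// Illustrative sketch of the index arithmetic above (hypothetical helper,
// assuming V0EltSize is a multiple of EltSize): broadcasting narrow element
// BroadcastIdx reads wide element BroadcastIdx / Scale, shifted right by
// (BroadcastIdx % Scale) * EltSize bits before the TRUNCATE. E.g. i8 element 5
// of a v4i32 source maps to 32-bit element 1 with an 8-bit shift.
struct TruncBroadcastSrcSketch {
  unsigned WideEltIdx; // wide element that supplies the bits
  unsigned ShiftBits;  // SRL amount applied before truncation
};
static TruncBroadcastSrcSketch
truncBroadcastSrcSketch(unsigned BroadcastIdx, unsigned EltSize,
                        unsigned V0EltSize) {
  unsigned Scale = V0EltSize / EltSize;
  return {BroadcastIdx / Scale, (BroadcastIdx % Scale) * EltSize};
}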
12929
12930/// Test whether this can be lowered with a single SHUFPS instruction.
12931///
12932/// This is used to disable more specialized lowerings when the shufps lowering
12933/// will happen to be efficient.
12934 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12935 // This routine only handles 128-bit shufps.
12936 assert(Mask.size() == 4 && "Unsupported mask size!");
12937 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12938 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12939 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12940 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12941
12942 // To lower with a single SHUFPS we need to have the low half and high half
12943 // each requiring a single input.
12944 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12945 return false;
12946 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12947 return false;
12948
12949 return true;
12950}
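// Illustrative sketch of the same predicate (hypothetical helper): each half
// of the 4-element mask may reference only one of the two inputs (0..3 = V1,
// 4..7 = V2), because SHUFPS reads its low result half from one operand and
// its high result half from the other. E.g. {0, 2, 4, 6} qualifies, while
// {0, 4, 1, 5} does not.
static bool isSingleShufpsMaskSketch(const int (&Mask)[4]) {
  auto SameInput = [](int A, int B) {
    return A < 0 || B < 0 || (A < 4) == (B < 4);
  };
  return SameInput(Mask[0], Mask[1]) && SameInput(Mask[2], Mask[3]);
}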
12951
12952/// Test whether the specified input (0 or 1) is in-place blended by the
12953/// given mask.
12954///
12955/// This returns true if the elements from a particular input are already in the
12956 /// slots required by the given mask and require no permutation.
12957 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12958 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12959 int Size = Mask.size();
12960 for (int i = 0; i < Size; ++i)
12961 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12962 return false;
12963
12964 return true;
12965}
12966
12967/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
12968/// the given mask.
12969///
12970 static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef<int> Mask,
12971 int BroadcastableElement = 0) {
12972 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12973 int Size = Mask.size();
12974 for (int i = 0; i < Size; ++i)
12975 if (Mask[i] >= 0 && Mask[i] / Size == Input &&
12976 Mask[i] % Size != BroadcastableElement)
12977 return false;
12978 return true;
12979}
12980
12981/// If we are extracting two 128-bit halves of a vector and shuffling the
12982/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12983/// multi-shuffle lowering.
12984 static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12985 SDValue N1, ArrayRef<int> Mask,
12986 SelectionDAG &DAG) {
12987 MVT VT = N0.getSimpleValueType();
12988 assert((VT.is128BitVector() &&
12989 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12990 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12991
12992 // Check that both sources are extracts of the same source vector.
12993 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12994 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12995 N0.getOperand(0) != N1.getOperand(0) ||
12996 !N0.hasOneUse() || !N1.hasOneUse())
12997 return SDValue();
12998
12999 SDValue WideVec = N0.getOperand(0);
13000 MVT WideVT = WideVec.getSimpleValueType();
13001 if (!WideVT.is256BitVector())
13002 return SDValue();
13003
13004 // Match extracts of each half of the wide source vector. Commute the shuffle
13005 // if the extract of the low half is N1.
13006 unsigned NumElts = VT.getVectorNumElements();
13007 SmallVector<int, 4> NewMask(Mask);
13008 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13009 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13010 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13011 ShuffleVectorSDNode::commuteMask(NewMask);
13012 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13013 return SDValue();
13014
13015 // Final bailout: if the mask is simple, we are better off using an extract
13016 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13017 // because that avoids a constant load from memory.
13018 if (NumElts == 4 &&
13019 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
13020 return SDValue();
13021
13022 // Extend the shuffle mask with undef elements.
13023 NewMask.append(NumElts, -1);
13024
13025 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13026 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13027 NewMask);
13028 // This is free: ymm -> xmm.
13029 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13030 DAG.getVectorIdxConstant(0, DL));
13031}
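// Illustrative sketch of the mask rewrite above (hypothetical helper): a
// shuffle of the two extracted halves with mask M becomes a shuffle of the
// whole wide vector with M padded by undef (-1) lanes, followed by taking the
// low half. E.g. for the v4f32 halves of a v8f32, {1, 5, 2, 6} becomes
// {1, 5, 2, 6, -1, -1, -1, -1}.
static SmallVector<int, 8> widenHalfShuffleMaskSketch(ArrayRef<int> M) {
  SmallVector<int, 8> Wide(M.begin(), M.end());
  Wide.append(M.size(), -1); // pad with undef lanes for the upper half
  return Wide;
}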
13032
13033/// Try to lower broadcast of a single element.
13034///
13035/// For convenience, this code also bundles all of the subtarget feature set
13036/// filtering. While a little annoying to re-dispatch on type here, there isn't
13037/// a convenient way to factor it out.
13038 static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13039 SDValue V2, ArrayRef<int> Mask,
13040 const X86Subtarget &Subtarget,
13041 SelectionDAG &DAG) {
13042 MVT EltVT = VT.getVectorElementType();
13043 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13044 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13045 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
13046 return SDValue();
13047
13048 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13049 // we can only broadcast from a register with AVX2.
13050 unsigned NumEltBits = VT.getScalarSizeInBits();
13051 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13052 ? X86ISD::MOVDDUP
13053 : X86ISD::VBROADCAST;
13054 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13055
13056 // Check that the mask is a broadcast.
13057 int BroadcastIdx = getSplatIndex(Mask);
13058 if (BroadcastIdx < 0) {
13059 // Check for hidden broadcast.
13060 SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
13061 if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
13062 return SDValue();
13063 BroadcastIdx = 0;
13064 }
13065 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13066 "a sorted mask where the broadcast "
13067 "comes from V1.");
13068 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
13069
13070 // Go up the chain of (vector) values to find a scalar load that we can
13071 // combine with the broadcast.
13072 // TODO: Combine this logic with findEltLoadSrc() used by
13073 // EltsFromConsecutiveLoads().
13074 int BitOffset = BroadcastIdx * NumEltBits;
13075 SDValue V = V1;
13076 for (;;) {
13077 switch (V.getOpcode()) {
13078 case ISD::BITCAST: {
13079 V = V.getOperand(0);
13080 continue;
13081 }
13082 case ISD::CONCAT_VECTORS: {
13083 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13084 int OpIdx = BitOffset / OpBitWidth;
13085 V = V.getOperand(OpIdx);
13086 BitOffset %= OpBitWidth;
13087 continue;
13088 }
13089 case ISD::EXTRACT_SUBVECTOR: {
13090 // The extraction index adds to the existing offset.
13091 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13092 unsigned Idx = V.getConstantOperandVal(1);
13093 unsigned BeginOffset = Idx * EltBitWidth;
13094 BitOffset += BeginOffset;
13095 V = V.getOperand(0);
13096 continue;
13097 }
13098 case ISD::INSERT_SUBVECTOR: {
13099 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13100 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13101 int Idx = (int)V.getConstantOperandVal(2);
13102 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13103 int BeginOffset = Idx * EltBitWidth;
13104 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13105 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13106 BitOffset -= BeginOffset;
13107 V = VInner;
13108 } else {
13109 V = VOuter;
13110 }
13111 continue;
13112 }
13113 }
13114 break;
13115 }
13116 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13117 BroadcastIdx = BitOffset / NumEltBits;
13118
13119 // Do we need to bitcast the source to retrieve the original broadcast index?
13120 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13121
13122 // Check if this is a broadcast of a scalar. We special case lowering
13123 // for scalars so that we can more effectively fold with loads.
13124 // If the original value has a larger element type than the shuffle, the
13125 // broadcast element is in essence truncated. Make that explicit to ease
13126 // folding.
13127 if (BitCastSrc && VT.isInteger())
13128 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13129 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13130 return TruncBroadcast;
13131
13132 // Also check the simpler case, where we can directly reuse the scalar.
13133 if (!BitCastSrc &&
13134 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13135 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13136 V = V.getOperand(BroadcastIdx);
13137
13138 // If we can't broadcast from a register, check that the input is a load.
13139 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13140 return SDValue();
13141 } else if (ISD::isNormalLoad(V.getNode()) &&
13142 cast<LoadSDNode>(V)->isSimple()) {
13143 // We do not check for one-use of the vector load because a broadcast load
13144 // is expected to be a win for code size, register pressure, and possibly
13145 // uops even if the original vector load is not eliminated.
13146
13147 // Reduce the vector load and shuffle to a broadcasted scalar load.
13148 auto *Ld = cast<LoadSDNode>(V);
13149 SDValue BaseAddr = Ld->getBasePtr();
13150 MVT SVT = VT.getScalarType();
13151 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13152 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13153 SDValue NewAddr =
13154 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
13155
13156 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13157 // than MOVDDUP.
13158 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13159 if (Opcode == X86ISD::VBROADCAST) {
13160 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13161 SDValue Ops[] = {Ld->getChain(), NewAddr};
13162 V = DAG.getMemIntrinsicNode(
13163 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13164 DAG.getMachineFunction().getMachineMemOperand(
13165 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13166 DAG.makeEquivalentMemoryOrdering(Ld, V);
13167 return DAG.getBitcast(VT, V);
13168 }
13168 }
13169 assert(SVT == MVT::f64 && "Unexpected VT!");
13170 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13171 DAG.getMachineFunction().getMachineMemOperand(
13172 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13173 DAG.makeEquivalentMemoryOrdering(Ld, V);
13174 } else if (!BroadcastFromReg) {
13175 // We can't broadcast from a vector register.
13176 return SDValue();
13177 } else if (BitOffset != 0) {
13178 // We can only broadcast from the zero-element of a vector register,
13179 // but it can be advantageous to broadcast from the zero-element of a
13180 // subvector.
13181 if (!VT.is256BitVector() && !VT.is512BitVector())
13182 return SDValue();
13183
13184 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13185 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13186 return SDValue();
13187
13188 // If we are broadcasting an element from the lowest 128-bit subvector, try
13189 // to move the element in position.
13190 if (BitOffset < 128 && NumActiveElts > 1 &&
13191 V.getScalarValueSizeInBits() == NumEltBits) {
13192 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13193 "Unexpected bit-offset");
13194 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
13195 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13196 V = extractSubVector(V, 0, DAG, DL, 128);
13197 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
13198 } else {
13199 // Only broadcast the zero-element of a 128-bit subvector.
13200 if ((BitOffset % 128) != 0)
13201 return SDValue();
13202
13203 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13204 "Unexpected bit-offset");
13205 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13206 "Unexpected vector size");
13207 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13208 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13209 }
13210 }
13211
13212 // On AVX we can use VBROADCAST directly for scalar sources.
13213 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13214 V = DAG.getBitcast(MVT::f64, V);
13215 if (Subtarget.hasAVX()) {
13216 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13217 return DAG.getBitcast(VT, V);
13218 }
13219 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13220 }
13221
13222 // If this is a scalar, do the broadcast on this type and bitcast.
13223 if (!V.getValueType().isVector()) {
13224 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13225 "Unexpected scalar size");
13226 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13227 VT.getSizeInBits() / NumEltBits);
13228 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13229 }
13230
13231 // We only support broadcasting from 128-bit vectors to minimize the
13232 // number of patterns we need to deal with in isel. So extract down to
13233 // 128-bits, removing as many bitcasts as possible.
13234 if (V.getValueSizeInBits() > 128)
13235 V = extractSubVector(V, 0, DAG, DL, 128);
13236
13237 // Otherwise cast V to a vector with the same element type as VT, but
13238 // possibly narrower than VT. Then perform the broadcast.
13239 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13240 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13241 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13242}
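// Illustrative sketch of one CONCAT_VECTORS step of the bit-offset walk above
// (hypothetical helper): with operands that are each OpBits wide, a source at
// bit offset BitOffset lives in operand BitOffset / OpBits at offset
// BitOffset % OpBits. E.g. element 5 of a v8i32 built from two v4i32s is bit
// offset 160, i.e. operand 1 at bit offset 32.
static unsigned descendConcatSketch(unsigned &BitOffset, unsigned OpBits) {
  unsigned OpIdx = BitOffset / OpBits;
  BitOffset %= OpBits;
  return OpIdx;
}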
13243
13244// Check for whether we can use INSERTPS to perform the shuffle. We only use
13245// INSERTPS when the V1 elements are already in the correct locations
13246// because otherwise we can just always use two SHUFPS instructions which
13247// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13248// perform INSERTPS if a single V1 element is out of place and all V2
13249// elements are zeroable.
13250 static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13251 unsigned &InsertPSMask,
13252 const APInt &Zeroable,
13253 ArrayRef<int> Mask, SelectionDAG &DAG) {
13254 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13255 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13256 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13257
13258 // Attempt to match INSERTPS with one element from VA or VB being
13259 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13260 // are updated.
13261 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13262 ArrayRef<int> CandidateMask) {
13263 unsigned ZMask = 0;
13264 int VADstIndex = -1;
13265 int VBDstIndex = -1;
13266 bool VAUsedInPlace = false;
13267
13268 for (int i = 0; i < 4; ++i) {
13269 // Synthesize a zero mask from the zeroable elements (includes undefs).
13270 if (Zeroable[i]) {
13271 ZMask |= 1 << i;
13272 continue;
13273 }
13274
13275 // Flag if we use any VA inputs in place.
13276 if (i == CandidateMask[i]) {
13277 VAUsedInPlace = true;
13278 continue;
13279 }
13280
13281 // We can only insert a single non-zeroable element.
13282 if (VADstIndex >= 0 || VBDstIndex >= 0)
13283 return false;
13284
13285 if (CandidateMask[i] < 4) {
13286 // VA input out of place for insertion.
13287 VADstIndex = i;
13288 } else {
13289 // VB input for insertion.
13290 VBDstIndex = i;
13291 }
13292 }
13293
13294 // Don't bother if we have no (non-zeroable) element for insertion.
13295 if (VADstIndex < 0 && VBDstIndex < 0)
13296 return false;
13297
13298 // Determine element insertion src/dst indices. The src index is from the
13299 // start of the inserted vector, not the start of the concatenated vector.
13300 unsigned VBSrcIndex = 0;
13301 if (VADstIndex >= 0) {
13302 // If we have a VA input out of place, we use VA as the V2 element
13303 // insertion and don't use the original V2 at all.
13304 VBSrcIndex = CandidateMask[VADstIndex];
13305 VBDstIndex = VADstIndex;
13306 VB = VA;
13307 } else {
13308 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13309 }
13310
13311 // If no V1 inputs are used in place, then the result is created only from
13312 // the zero mask and the V2 insertion - so remove V1 dependency.
13313 if (!VAUsedInPlace)
13314 VA = DAG.getUNDEF(MVT::v4f32);
13315
13316 // Update V1, V2 and InsertPSMask accordingly.
13317 V1 = VA;
13318 V2 = VB;
13319
13320 // Insert the V2 element into the desired position.
13321 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13322 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13323 return true;
13324 };
13325
13326 if (matchAsInsertPS(V1, V2, Mask))
13327 return true;
13328
13329 // Commute and try again.
13330 SmallVector<int, 4> CommutedMask(Mask);
13331 ShuffleVectorSDNode::commuteMask(CommutedMask);
13332 if (matchAsInsertPS(V2, V1, CommutedMask))
13333 return true;
13334
13335 return false;
13336}
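// Illustrative sketch of the INSERTPS control byte assembled above
// (hypothetical helper): bits [7:6] select the source element of the inserted
// operand, bits [5:4] the destination lane, and bits [3:0] zero individual
// result lanes. E.g. inserting source element 2 into lane 1 while zeroing
// lane 3 encodes as (2 << 6) | (1 << 4) | 0b1000 == 0x98.
static unsigned char encodeInsertPSImmSketch(unsigned SrcIdx, unsigned DstIdx,
                                             unsigned ZeroMask) {
  return (unsigned char)((SrcIdx << 6) | (DstIdx << 4) | (ZeroMask & 0xF));
}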
13337
13338 static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13339 ArrayRef<int> Mask, const APInt &Zeroable,
13340 SelectionDAG &DAG) {
13341 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13342 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13343
13344 // Attempt to match the insertps pattern.
13345 unsigned InsertPSMask = 0;
13346 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13347 return SDValue();
13348
13349 // Insert the V2 element into the desired position.
13350 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13351 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13352}
13353
13354/// Handle lowering of 2-lane 64-bit floating point shuffles.
13355///
13356/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13357/// support for floating point shuffles but not integer shuffles. These
13358/// instructions will incur a domain crossing penalty on some chips though so
13359/// it is better to avoid lowering through this for integer vectors where
13360/// possible.
13361 static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13362 const APInt &Zeroable, SDValue V1, SDValue V2,
13363 const X86Subtarget &Subtarget,
13364 SelectionDAG &DAG) {
13365 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13366 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13367 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13368
13369 if (V2.isUndef()) {
13370 // Check for being able to broadcast a single element.
13371 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13372 Mask, Subtarget, DAG))
13373 return Broadcast;
13374
13375 // Straight shuffle of a single input vector. Simulate this by using the
13376 // single input as both of the "inputs" to this instruction.
13377 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13378
13379 if (Subtarget.hasAVX()) {
13380 // If we have AVX, we can use VPERMILPS which will allow folding a load
13381 // into the shuffle.
13382 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13383 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13384 }
13385
13386 return DAG.getNode(
13387 X86ISD::SHUFP, DL, MVT::v2f64,
13388 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13389 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13390 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13391 }
13392 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13393 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13394 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13395 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13396
13397 if (Subtarget.hasAVX2())
13398 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13399 return Extract;
13400
13401 // When loading a scalar and then shuffling it into a vector we can often do
13402 // the insertion cheaply.
13403 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13404 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13405 return Insertion;
13406 // Try inverting the insertion since for v2 masks it is easy to do and we
13407 // can't reliably sort the mask one way or the other.
13408 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13409 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13410 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13411 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13412 return Insertion;
13413
13414 // Try to use one of the special instruction patterns to handle two common
13415 // blend patterns if a zero-blend above didn't work.
13416 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13417 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13418 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13419 // We can either use a special instruction to load over the low double or
13420 // to move just the low double.
13421 return DAG.getNode(
13422 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13423 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13424
13425 if (Subtarget.hasSSE41())
13426 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13427 Zeroable, Subtarget, DAG))
13428 return Blend;
13429
13430 // Use dedicated unpack instructions for masks that match their pattern.
13431 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13432 return V;
13433
13434 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13435 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13436 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13437}
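// Illustrative sketch of the two-bit SHUFPD immediate built above
// (hypothetical helper): bit 0 picks the element taken from the first source
// (0 = low, 1 = high) and bit 1 picks the element taken from the second
// source. With the canonical mask form used here (Mask[0] in 0..1 from V1,
// Mask[1] in 2..3 from V2), {1, 2} encodes as 0b01.
static unsigned shufpdImmSketch(int Mask0, int Mask1) {
  return (unsigned)(Mask0 == 1) | ((unsigned)(Mask1 == 3) << 1);
}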
13438
13439/// Handle lowering of 2-lane 64-bit integer shuffles.
13440///
13441/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13442/// the integer unit to minimize domain crossing penalties. However, for blends
13443/// it falls back to the floating point shuffle operation with appropriate bit
13444/// casting.
13445 static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13446 const APInt &Zeroable, SDValue V1, SDValue V2,
13447 const X86Subtarget &Subtarget,
13448 SelectionDAG &DAG) {
13449 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13450 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13451 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13452
13453 if (V2.isUndef()) {
13454 // Check for being able to broadcast a single element.
13455 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13456 Mask, Subtarget, DAG))
13457 return Broadcast;
13458
13459 // Straight shuffle of a single input vector. For everything from SSE2
13460 // onward this has a single fast instruction with no scary immediates.
13461 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13462 V1 = DAG.getBitcast(MVT::v4i32, V1);
13463 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13464 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13465 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13466 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13467 return DAG.getBitcast(
13468 MVT::v2i64,
13469 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13470 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13471 }
13472 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13473 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13474 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13475 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13476
13477 if (Subtarget.hasAVX2())
13478 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13479 return Extract;
13480
13481 // Try to use shift instructions.
13482 if (SDValue Shift =
13483 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13484 DAG, /*BitwiseOnly*/ false))
13485 return Shift;
13486
13487 // When loading a scalar and then shuffling it into a vector we can often do
13488 // the insertion cheaply.
13489 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13490 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13491 return Insertion;
13492 // Try inverting the insertion since for v2 masks it is easy to do and we
13493 // can't reliably sort the mask one way or the other.
13494 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13495 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13496 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13497 return Insertion;
13498
13499 // We have different paths for blend lowering, but they all must use the
13500 // *exact* same predicate.
13501 bool IsBlendSupported = Subtarget.hasSSE41();
13502 if (IsBlendSupported)
13503 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13504 Zeroable, Subtarget, DAG))
13505 return Blend;
13506
13507 // Use dedicated unpack instructions for masks that match their pattern.
13508 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13509 return V;
13510
13511 // Try to use byte rotation instructions.
13512 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13513 if (Subtarget.hasSSSE3()) {
13514 if (Subtarget.hasVLX())
13515 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13516 Zeroable, Subtarget, DAG))
13517 return Rotate;
13518
13519 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13520 Subtarget, DAG))
13521 return Rotate;
13522 }
13523
13524 // If we have direct support for blends, we should lower by decomposing into
13525 // a permute. That will be faster than the domain cross.
13526 if (IsBlendSupported)
13527 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13528 Zeroable, Subtarget, DAG);
13529
13530 // We implement this with SHUFPD which is pretty lame because it will likely
13531 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13532 // However, all the alternatives are still more cycles and newer chips don't
13533 // have this problem. It would be really nice if x86 had better shuffles here.
13534 V1 = DAG.getBitcast(MVT::v2f64, V1);
13535 V2 = DAG.getBitcast(MVT::v2f64, V2);
13536 return DAG.getBitcast(MVT::v2i64,
13537 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13538}
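// Illustrative sketch of the v2i64 -> v4i32 mask widening used for the
// single-input PSHUFD path above (hypothetical helper): each 64-bit lane
// index m expands to the 32-bit lane pair {2*m, 2*m + 1}, and undef (-1) is
// preserved. E.g. {1, 0} widens to {2, 3, 0, 1}.
static void widenV2MaskToV4Sketch(const int (&M2)[2], int (&M4)[4]) {
  for (int i = 0; i != 2; ++i) {
    M4[2 * i + 0] = M2[i] < 0 ? -1 : M2[i] * 2;
    M4[2 * i + 1] = M2[i] < 0 ? -1 : M2[i] * 2 + 1;
  }
}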
13539
13540/// Lower a vector shuffle using the SHUFPS instruction.
13541///
13542/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13543 /// It makes no assumptions about whether this is the *best* lowering; it
13544 /// simply uses it.
13545 static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13546 ArrayRef<int> Mask, SDValue V1,
13547 SDValue V2, SelectionDAG &DAG) {
13548 SDValue LowV = V1, HighV = V2;
13549 SmallVector<int, 4> NewMask(Mask);
13550 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13551
13552 if (NumV2Elements == 1) {
13553 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13554
13555 // Compute the index adjacent to V2Index and in the same half by toggling
13556 // the low bit.
13557 int V2AdjIndex = V2Index ^ 1;
13558
13559 if (Mask[V2AdjIndex] < 0) {
13560 // Handles all the cases where we have a single V2 element and an undef.
13561 // This will only ever happen in the high lanes because we commute the
13562 // vector otherwise.
13563 if (V2Index < 2)
13564 std::swap(LowV, HighV);
13565 NewMask[V2Index] -= 4;
13566 } else {
13567 // Handle the case where the V2 element ends up adjacent to a V1 element.
13568 // To make this work, blend them together as the first step.
13569 int V1Index = V2AdjIndex;
13570 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13571 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13572 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13573
13574 // Now proceed to reconstruct the final blend as we have the necessary
13575 // high or low half formed.
13576 if (V2Index < 2) {
13577 LowV = V2;
13578 HighV = V1;
13579 } else {
13580 HighV = V2;
13581 }
13582 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13583 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13584 }
13585 } else if (NumV2Elements == 2) {
13586 if (Mask[0] < 4 && Mask[1] < 4) {
13587 // Handle the easy case where we have V1 in the low lanes and V2 in the
13588 // high lanes.
13589 NewMask[2] -= 4;
13590 NewMask[3] -= 4;
13591 } else if (Mask[2] < 4 && Mask[3] < 4) {
13592 // We also handle the reversed case because this utility may get called
13593 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13594 // arrange things in the right direction.
13595 NewMask[0] -= 4;
13596 NewMask[1] -= 4;
13597 HighV = V1;
13598 LowV = V2;
13599 } else {
13600 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13601 // trying to place elements directly, just blend them and set up the final
13602 // shuffle to place them.
13603
13604 // The first two blend mask elements are for V1, the second two are for
13605 // V2.
13606 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13607 Mask[2] < 4 ? Mask[2] : Mask[3],
13608 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13609 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13610 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13611 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13612
13613 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13614 // a blend.
13615 LowV = HighV = V1;
13616 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13617 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13618 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13619 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13620 }
13621 } else if (NumV2Elements == 3) {
13622 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13623 // we can get here via other paths (e.g. repeated mask matching) where we
13624 // don't want to do another round of lowerVECTOR_SHUFFLE.
13625 ShuffleVectorSDNode::commuteMask(NewMask);
13626 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13627 }
13628 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13629 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13630}
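// Illustrative sketch of the 2-bits-per-lane immediate that
// getV4X86ShuffleImm8ForMask produces from a 4-element mask (hypothetical
// helper; undef lanes default to 0 here, whereas the real helper applies its
// own canonicalization). E.g. the identity mask {0, 1, 2, 3} encodes as 0xE4.
static unsigned v4ShuffleImm8Sketch(const int (&Mask)[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= (Mask[i] < 0 ? 0u : (unsigned)(Mask[i] & 0x3)) << (2 * i);
  return Imm;
}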
13631
13632/// Lower 4-lane 32-bit floating point shuffles.
13633///
13634/// Uses instructions exclusively from the floating point unit to minimize
13635/// domain crossing penalties, as these are sufficient to implement all v4f32
13636/// shuffles.
13637 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13638 const APInt &Zeroable, SDValue V1, SDValue V2,
13639 const X86Subtarget &Subtarget,
13640 SelectionDAG &DAG) {
13641 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13642 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13643 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13644
13645 if (Subtarget.hasSSE41())
13646 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13647 Zeroable, Subtarget, DAG))
13648 return Blend;
13649
13650 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13651
13652 if (NumV2Elements == 0) {
13653 // Check for being able to broadcast a single element.
13654 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13655 Mask, Subtarget, DAG))
13656 return Broadcast;
13657
13658 // Use even/odd duplicate instructions for masks that match their pattern.
13659 if (Subtarget.hasSSE3()) {
13660 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13661 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13662 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13663 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13664 }
13665
13666 if (Subtarget.hasAVX()) {
13667 // If we have AVX, we can use VPERMILPS which will allow folding a load
13668 // into the shuffle.
13669 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13670 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13671 }
13672
13673 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13674 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13675 if (!Subtarget.hasSSE2()) {
13676 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13677 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13678 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13679 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13680 }
13681
13682 // Otherwise, use a straight shuffle of a single input vector. We pass the
13683 // input vector to both operands to simulate this with a SHUFPS.
13684 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13685 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13686 }
13687
13688 if (Subtarget.hasSSE2())
13689 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13690 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13691 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13692 return ZExt;
13693 }
13694
13695 if (Subtarget.hasAVX2())
13696 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13697 return Extract;
13698
13699 // There are special ways we can lower some single-element blends. However, we
13700 // have custom ways we can lower more complex single-element blends below that
13701 // we defer to if both this and BLENDPS fail to match, so restrict this to
13702 // when the V2 input is targeting element 0 of the mask -- that is the fast
13703 // case here.
13704 if (NumV2Elements == 1 && Mask[0] >= 4)
13705 if (SDValue V = lowerShuffleAsElementInsertion(
13706 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13707 return V;
13708
13709 if (Subtarget.hasSSE41()) {
13710 // Use INSERTPS if we can complete the shuffle efficiently.
13711 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13712 return V;
13713
13714 if (!isSingleSHUFPSMask(Mask))
13715 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13716 V2, Mask, DAG))
13717 return BlendPerm;
13718 }
13719
13720 // Use low/high mov instructions. These are only valid in SSE1 because
13721 // otherwise they are widened to v2f64 and never get here.
13722 if (!Subtarget.hasSSE2()) {
13723 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13724 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13725 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13726 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13727 }
13728
13729 // Use dedicated unpack instructions for masks that match their pattern.
13730 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13731 return V;
13732
13733 // Otherwise fall back to a SHUFPS lowering strategy.
13734 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13735}
13736
13737/// Lower 4-lane i32 vector shuffles.
13738///
13739/// We try to handle these with integer-domain shuffles where we can, but for
13740/// blends we use the floating point domain blend instructions.
13741 static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13742 const APInt &Zeroable, SDValue V1, SDValue V2,
13743 const X86Subtarget &Subtarget,
13744 SelectionDAG &DAG) {
13745 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13746 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13747 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13748
13749 // Whenever we can lower this as a zext, that instruction is strictly faster
13750 // than any alternative. It also allows us to fold memory operands into the
13751 // shuffle in many cases.
13752 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13753 Zeroable, Subtarget, DAG))
13754 return ZExt;
13755
13756 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13757
13758 // Try to use shift instructions if fast.
13759 if (Subtarget.preferLowerShuffleAsShift()) {
13760 if (SDValue Shift =
13761 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13762 Subtarget, DAG, /*BitwiseOnly*/ true))
13763 return Shift;
13764 if (NumV2Elements == 0)
13765 if (SDValue Rotate =
13766 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13767 return Rotate;
13768 }
13769
13770 if (NumV2Elements == 0) {
13771 // Try to use broadcast unless the mask only has one non-undef element.
13772 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13773 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13774 Mask, Subtarget, DAG))
13775 return Broadcast;
13776 }
13777
13778 // Straight shuffle of a single input vector. For everything from SSE2
13779 // onward this has a single fast instruction with no scary immediates.
13780 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13781 // but we aren't actually going to use the UNPCK instruction because doing
13782 // so prevents folding a load into this instruction or making a copy.
13783 const int UnpackLoMask[] = {0, 0, 1, 1};
13784 const int UnpackHiMask[] = {2, 2, 3, 3};
13785 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13786 Mask = UnpackLoMask;
13787 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13788 Mask = UnpackHiMask;
13789
13790 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13791 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13792 }
13793
13794 if (Subtarget.hasAVX2())
13795 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13796 return Extract;
13797
13798 // Try to use shift instructions.
13799 if (SDValue Shift =
13800 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13801 DAG, /*BitwiseOnly*/ false))
13802 return Shift;
13803
13804 // There are special ways we can lower some single-element blends.
13805 if (NumV2Elements == 1)
13806 if (SDValue V = lowerShuffleAsElementInsertion(
13807 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13808 return V;
13809
13810 // We have different paths for blend lowering, but they all must use the
13811 // *exact* same predicate.
13812 bool IsBlendSupported = Subtarget.hasSSE41();
13813 if (IsBlendSupported)
13814 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13815 Zeroable, Subtarget, DAG))
13816 return Blend;
13817
13818 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13819 Zeroable, Subtarget, DAG))
13820 return Masked;
13821
13822 // Use dedicated unpack instructions for masks that match their pattern.
13823 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
13824 return V;
13825
13826 // Try to use byte rotation instructions.
13827 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13828 if (Subtarget.hasSSSE3()) {
13829 if (Subtarget.hasVLX())
13830 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13831 Zeroable, Subtarget, DAG))
13832 return Rotate;
13833
13834 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13835 Subtarget, DAG))
13836 return Rotate;
13837 }
13838
13839 // Assume that a single SHUFPS is faster than an alternative sequence of
13840 // multiple instructions (even if the CPU has a domain penalty).
13841 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13842 if (!isSingleSHUFPSMask(Mask)) {
13843 // If we have direct support for blends, we should lower by decomposing into
13844 // a permute. That will be faster than the domain cross.
13845 if (IsBlendSupported)
13846 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13847 Zeroable, Subtarget, DAG);
13848
13849 // Try to lower by permuting the inputs into an unpack instruction.
13850 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13851 Mask, Subtarget, DAG))
13852 return Unpack;
13853 }
13854
13855 // We implement this with SHUFPS because it can blend from two vectors.
13856 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13857 // up the inputs, bypassing domain shift penalties that we would incur if we
13858 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13859 // relevant.
13860 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13861 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13862 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13863 return DAG.getBitcast(MVT::v4i32, ShufPS);
13864}
13865
13866/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13867/// shuffle lowering, and the most complex part.
13868///
13869/// The lowering strategy is to try to form pairs of input lanes which are
13870/// targeted at the same half of the final vector, and then use a dword shuffle
13871/// to place them onto the right half, and finally unpack the paired lanes into
13872/// their final position.
13873///
13874/// The exact breakdown of how to form these dword pairs and align them on the
13875/// correct sides is really tricky. See the comments within the function for
13876/// more of the details.
13877///
13878/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13879/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13880/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13881/// vector, form the analogous 128-bit 8-element Mask.
13882 static SDValue lowerV8I16GeneralSingleInputShuffle(
13883 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13884 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13885 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13886 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13887
13888 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13889 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13890 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13891
13892 // Attempt to directly match PSHUFLW or PSHUFHW.
13893 if (isUndefOrInRange(LoMask, 0, 4) &&
13894 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13895 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13896 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13897 }
13898 if (isUndefOrInRange(HiMask, 4, 8) &&
13899 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13900 for (int i = 0; i != 4; ++i)
13901 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13902 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13903 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13904 }
13905
13906 SmallVector<int, 4> LoInputs;
13907 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13908 array_pod_sort(LoInputs.begin(), LoInputs.end());
13909 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13910 SmallVector<int, 4> HiInputs;
13911 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13912 array_pod_sort(HiInputs.begin(), HiInputs.end());
13913 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13914 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13915 int NumHToL = LoInputs.size() - NumLToL;
13916 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13917 int NumHToH = HiInputs.size() - NumLToH;
13918 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13919 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13920 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13921 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13922
13923 // If we are shuffling values from one half - check how many different DWORD
13924 // pairs we need to create. If only 1 or 2 then we can perform this as a
13925 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13926 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13927 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13928 V = DAG.getNode(ShufWOp, DL, VT, V,
13929 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13930 V = DAG.getBitcast(PSHUFDVT, V);
13931 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13932 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13933 return DAG.getBitcast(VT, V);
13934 };
13935
13936 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13937 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13938 SmallVector<std::pair<int, int>, 4> DWordPairs;
13939 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13940
13941 // Collect the different DWORD pairs.
13942 for (int DWord = 0; DWord != 4; ++DWord) {
13943 int M0 = Mask[2 * DWord + 0];
13944 int M1 = Mask[2 * DWord + 1];
13945 M0 = (M0 >= 0 ? M0 % 4 : M0);
13946 M1 = (M1 >= 0 ? M1 % 4 : M1);
13947 if (M0 < 0 && M1 < 0)
13948 continue;
13949
13950 bool Match = false;
13951 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13952 auto &DWordPair = DWordPairs[j];
13953 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13954 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13955 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13956 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13957 PSHUFDMask[DWord] = DOffset + j;
13958 Match = true;
13959 break;
13960 }
13961 }
13962 if (!Match) {
13963 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13964 DWordPairs.push_back(std::make_pair(M0, M1));
13965 }
13966 }
13967
13968 if (DWordPairs.size() <= 2) {
13969 DWordPairs.resize(2, std::make_pair(-1, -1));
13970 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13971 DWordPairs[1].first, DWordPairs[1].second};
13972 // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
13973 if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
13974 ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
13975 int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
13976 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
13977 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
13978 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
13979 }
13980 if ((NumHToL + NumHToH) == 0)
13981 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13982 if ((NumLToL + NumLToH) == 0)
13983 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13984 }
13985 }
13986
13987 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13988 // such inputs we can swap two of the dwords across the half mark and end up
13989 // with <=2 inputs to each half in each half. Once there, we can fall through
13990 // to the generic code below. For example:
13991 //
13992 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13993 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13994 //
13995 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13996 // and an existing 2-into-2 on the other half. In this case we may have to
13997 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13998 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13999 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14000 // because any other situation (including a 3-into-1 or 1-into-3 in the other
14001 // half than the one we target for fixing) will be fixed when we re-enter this
14002 // path. We will also combine away any sequence of PSHUFD instructions that
14003 // result into a single instruction. Here is an example of the tricky case:
14004 //
14005 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14006 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14007 //
14008 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14009 //
14010 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14011 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14012 //
14013 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14014 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14015 //
14016 // The result is fine to be handled by the generic logic.
14017 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14018 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14019 int AOffset, int BOffset) {
14020 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14021 "Must call this with A having 3 or 1 inputs from the A half.");
14022 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14023 "Must call this with B having 1 or 3 inputs from the B half.");
14024 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14025 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14026
14027 bool ThreeAInputs = AToAInputs.size() == 3;
14028
14029 // Compute the index of dword with only one word among the three inputs in
14030 // a half by taking the sum of the half with three inputs and subtracting
14031 // the sum of the actual three inputs. The difference is the remaining
14032 // slot.
14033 int ADWord = 0, BDWord = 0;
14034 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14035 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14036 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14037 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14038 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14039 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14040 int TripleNonInputIdx =
14041 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14042 TripleDWord = TripleNonInputIdx / 2;
14043
14044 // We use xor with one to compute the adjacent DWord to whichever one the
14045 // OneInput is in.
14046 OneInputDWord = (OneInput / 2) ^ 1;
14047
14048 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14049 // and BToA inputs. If there is also such a problem with the BToB and AToB
14050 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14051 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14052 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14053 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14054 // Compute how many inputs will be flipped by swapping these DWords. We need
14055 // to balance this to ensure we don't form a 3-1 shuffle in the other half.
14058 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14059 llvm::count(AToBInputs, 2 * ADWord + 1);
14060 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14061 llvm::count(BToBInputs, 2 * BDWord + 1);
14062 if ((NumFlippedAToBInputs == 1 &&
14063 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14064 (NumFlippedBToBInputs == 1 &&
14065 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14066 // We choose whether to fix the A half or B half based on whether that
14067 // half has zero flipped inputs. At zero, we may not be able to fix it
14068 // with that half. We also bias towards fixing the B half because that
14069 // will more commonly be the high half, and we have to bias one way.
14070 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14071 ArrayRef<int> Inputs) {
14072 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14073 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14074 // Determine whether the free index is in the flipped dword or the
14075 // unflipped dword based on where the pinned index is. We use this bit
14076 // in an xor to conditionally select the adjacent dword.
14077 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14078 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14079 if (IsFixIdxInput == IsFixFreeIdxInput)
14080 FixFreeIdx += 1;
14081 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14082 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14083 "We need to be changing the number of flipped inputs!");
14084 int PSHUFHalfMask[] = {0, 1, 2, 3};
14085 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14086 V = DAG.getNode(
14087 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14088 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14089 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14090
14091 for (int &M : Mask)
14092 if (M >= 0 && M == FixIdx)
14093 M = FixFreeIdx;
14094 else if (M >= 0 && M == FixFreeIdx)
14095 M = FixIdx;
14096 };
14097 if (NumFlippedBToBInputs != 0) {
14098 int BPinnedIdx =
14099 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14100 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14101 } else {
14102 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14103 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14104 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14105 }
14106 }
14107 }
14108
14109 int PSHUFDMask[] = {0, 1, 2, 3};
14110 PSHUFDMask[ADWord] = BDWord;
14111 PSHUFDMask[BDWord] = ADWord;
14112 V = DAG.getBitcast(
14113 VT,
14114 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14115 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14116
14117 // Adjust the mask to match the new locations of A and B.
14118 for (int &M : Mask)
14119 if (M >= 0 && M/2 == ADWord)
14120 M = 2 * BDWord + M % 2;
14121 else if (M >= 0 && M/2 == BDWord)
14122 M = 2 * ADWord + M % 2;
14123
14124 // Recurse back into this routine to re-compute state now that this isn't
14125 // a 3 and 1 problem.
14126 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14127 };
14128 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14129 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14130 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14131 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14132
14133 // At this point there are at most two inputs to the low and high halves from
14134 // each half. That means the inputs can always be grouped into dwords and
14135 // those dwords can then be moved to the correct half with a dword shuffle.
14136 // We use at most one low and one high word shuffle to collect these paired
14137 // inputs into dwords, and finally a dword shuffle to place them.
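// For example, with the single-input mask <0,1,4,5,4,5,6,7> the cross-half
// pair {4,5} already forms a dword, so no word shuffles are needed at all and
// a single PSHUFD <0,2,2,3> moves that dword into the low half.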
14138 int PSHUFLMask[4] = {-1, -1, -1, -1};
14139 int PSHUFHMask[4] = {-1, -1, -1, -1};
14140 int PSHUFDMask[4] = {-1, -1, -1, -1};
14141
14142 // First fix the masks for all the inputs that are staying in their
14143 // original halves. This will then dictate the targets of the cross-half
14144 // shuffles.
14145 auto fixInPlaceInputs =
14146 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14147 MutableArrayRef<int> SourceHalfMask,
14148 MutableArrayRef<int> HalfMask, int HalfOffset) {
14149 if (InPlaceInputs.empty())
14150 return;
14151 if (InPlaceInputs.size() == 1) {
14152 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14153 InPlaceInputs[0] - HalfOffset;
14154 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14155 return;
14156 }
14157 if (IncomingInputs.empty()) {
14158 // Just fix all of the in place inputs.
14159 for (int Input : InPlaceInputs) {
14160 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14161 PSHUFDMask[Input / 2] = Input / 2;
14162 }
14163 return;
14164 }
14165
14166 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14167 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14168 InPlaceInputs[0] - HalfOffset;
14169 // Put the second input next to the first so that they are packed into
14170 // a dword. We find the adjacent index by toggling the low bit.
14171 int AdjIndex = InPlaceInputs[0] ^ 1;
14172 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14173 llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
14174 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14175 };
14176 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14177 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14178
14179 // Now gather the cross-half inputs and place them into a free dword of
14180 // their target half.
14181 // FIXME: This operation could almost certainly be simplified dramatically to
14182 // look more like the 3-1 fixing operation.
14183 auto moveInputsToRightHalf = [&PSHUFDMask](
14184 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14185 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14186 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14187 int DestOffset) {
14188 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14189 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14190 };
14191 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14192 int Word) {
14193 int LowWord = Word & ~1;
14194 int HighWord = Word | 1;
14195 return isWordClobbered(SourceHalfMask, LowWord) ||
14196 isWordClobbered(SourceHalfMask, HighWord);
14197 };
14198
14199 if (IncomingInputs.empty())
14200 return;
14201
14202 if (ExistingInputs.empty()) {
14203 // Map any dwords with inputs from them into the right half.
14204 for (int Input : IncomingInputs) {
14205 // If the source half mask maps over the inputs, turn those into
14206 // swaps and use the swapped lane.
14207 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14208 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14209 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14210 Input - SourceOffset;
14211 // We have to swap the uses in our half mask in one sweep.
14212 for (int &M : HalfMask)
14213 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14214 M = Input;
14215 else if (M == Input)
14216 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14217 } else {
14218 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14219 Input - SourceOffset &&
14220 "Previous placement doesn't match!");
14221 }
14222 // Note that this correctly re-maps both when we do a swap and when
14223 // we observe the other side of the swap above. We rely on that to
14224 // avoid swapping the members of the input list directly.
14225 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14226 }
14227
14228 // Map the input's dword into the correct half.
14229 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14230 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14231 else
14232 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14233 Input / 2 &&
14234 "Previous placement doesn't match!");
14235 }
14236
14237 // And just directly shift any other-half mask elements to be same-half
14238 // as we will have mirrored the dword containing the element into the
14239 // same position within that half.
14240 for (int &M : HalfMask)
14241 if (M >= SourceOffset && M < SourceOffset + 4) {
14242 M = M - SourceOffset + DestOffset;
14243 assert(M >= 0 && "This should never wrap below zero!");
14244 }
14245 return;
14246 }
14247
14248 // Ensure we have the input in a viable dword of its current half. This
14249 // is particularly tricky because the original position may be clobbered
14250 // by inputs being moved and *staying* in that half.
14251 if (IncomingInputs.size() == 1) {
14252 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14253 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14254 SourceOffset;
14255 SourceHalfMask[InputFixed - SourceOffset] =
14256 IncomingInputs[0] - SourceOffset;
14257 llvm::replace(HalfMask, IncomingInputs[0], InputFixed);
14258 IncomingInputs[0] = InputFixed;
14259 }
14260 } else if (IncomingInputs.size() == 2) {
14261 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14262 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14263 // We have two non-adjacent or clobbered inputs we need to extract from
14264 // the source half. To do this, we need to map them into some adjacent
14265 // dword slot in the source mask.
14266 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14267 IncomingInputs[1] - SourceOffset};
14268
14269 // If there is a free slot in the source half mask adjacent to one of
14270 // the inputs, place the other input in it. We use (Index XOR 1) to
14271 // compute an adjacent index.
14272 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14273 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14274 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14275 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14276 InputsFixed[1] = InputsFixed[0] ^ 1;
14277 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14278 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14279 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14280 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14281 InputsFixed[0] = InputsFixed[1] ^ 1;
14282 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14283 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14284 // The two inputs are in the same DWord but it is clobbered and the
14285 // adjacent DWord isn't used at all. Move both inputs to the free
14286 // slot.
14287 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14288 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14289 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14290 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14291 } else {
14292 // The only way we hit this point is if there is no clobbering
14293 // (because there are no off-half inputs to this half) and there is no
14294 // free slot adjacent to one of the inputs. In this case, we have to
14295 // swap an input with a non-input.
14296 for (int i = 0; i < 4; ++i)
14297 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14298 "We can't handle any clobbers here!");
14299 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14300 "Cannot have adjacent inputs here!");
14301
14302 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14303 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14304
14305 // We also have to update the final source mask in this case because
14306 // it may need to undo the above swap.
14307 for (int &M : FinalSourceHalfMask)
14308 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14309 M = InputsFixed[1] + SourceOffset;
14310 else if (M == InputsFixed[1] + SourceOffset)
14311 M = (InputsFixed[0] ^ 1) + SourceOffset;
14312
14313 InputsFixed[1] = InputsFixed[0] ^ 1;
14314 }
14315
14316 // Point everything at the fixed inputs.
14317 for (int &M : HalfMask)
14318 if (M == IncomingInputs[0])
14319 M = InputsFixed[0] + SourceOffset;
14320 else if (M == IncomingInputs[1])
14321 M = InputsFixed[1] + SourceOffset;
14322
14323 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14324 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14325 }
14326 } else {
14327 llvm_unreachable("Unhandled input size!");
14328 }
14329
14330 // Now hoist the DWord down to the right half.
14331 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14332 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14333 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14334 for (int &M : HalfMask)
14335 for (int Input : IncomingInputs)
14336 if (M == Input)
14337 M = FreeDWord * 2 + Input % 2;
14338 };
14339 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14340 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14341 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14342 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14343
14344 // Now enact all the shuffles we've computed to move the inputs into their
14345 // target half.
14346 if (!isNoopShuffleMask(PSHUFLMask))
14347 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14348 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14349 if (!isNoopShuffleMask(PSHUFHMask))
14350 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14351 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14352 if (!isNoopShuffleMask(PSHUFDMask))
14353 V = DAG.getBitcast(
14354 VT,
14355 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14356 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14357
14358 // At this point, each half should contain all its inputs, and we can then
14359 // just shuffle them into their final position.
14360 assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
14361 "Failed to lift all the high half inputs to the low mask!");
14362 assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14363 "Failed to lift all the low half inputs to the high mask!");
14364
14365 // Do a half shuffle for the low mask.
14366 if (!isNoopShuffleMask(LoMask))
14367 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14368 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14369
14370 // Do a half shuffle with the high mask after shifting its values down.
14371 for (int &M : HiMask)
14372 if (M >= 0)
14373 M -= 4;
14374 if (!isNoopShuffleMask(HiMask))
14375 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14376 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14377
14378 return V;
14379}
14380
14381/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14382/// blend if only one input is used.
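/// For example, a v8i16 mask <0,8,1,9,2,10,3,11> is scaled by 2 to byte
/// indices, giving the PSHUFB masks
///   V1: {0,1,Z,Z,2,3,Z,Z,4,5,Z,Z,6,7,Z,Z}
///   V2: {Z,Z,0,1,Z,Z,2,3,Z,Z,4,5,Z,Z,6,7}    (Z == 0x80, zero the byte)
/// and the two shuffled vectors are then OR'd together.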
14383static SDValue lowerShuffleAsBlendOfPSHUFBs(
14384 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14385 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14387 "Lane crossing shuffle masks not supported");
14388
14389 int NumBytes = VT.getSizeInBits() / 8;
14390 int Size = Mask.size();
14391 int Scale = NumBytes / Size;
14392
14393 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14394 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14395 V1InUse = false;
14396 V2InUse = false;
14397
14398 for (int i = 0; i < NumBytes; ++i) {
14399 int M = Mask[i / Scale];
14400 if (M < 0)
14401 continue;
14402
14403 const int ZeroMask = 0x80;
14404 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14405 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14406 if (Zeroable[i / Scale])
14407 V1Idx = V2Idx = ZeroMask;
14408
14409 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14410 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14411 V1InUse |= (ZeroMask != V1Idx);
14412 V2InUse |= (ZeroMask != V2Idx);
14413 }
14414
14415 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14416 if (V1InUse)
14417 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14418 DAG.getBuildVector(ShufVT, DL, V1Mask));
14419 if (V2InUse)
14420 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14421 DAG.getBuildVector(ShufVT, DL, V2Mask));
14422
14423 // If we need shuffled inputs from both, blend the two.
14424 SDValue V;
14425 if (V1InUse && V2InUse)
14426 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14427 else
14428 V = V1InUse ? V1 : V2;
14429
14430 // Cast the result back to the correct type.
14431 return DAG.getBitcast(VT, V);
14432}
14433
14434/// Generic lowering of 8-lane i16 shuffles.
14435///
14436/// This handles both single-input shuffles and combined shuffle/blends with
14437/// two inputs. The single input shuffles are immediately delegated to
14438/// a dedicated lowering routine.
14439///
14440/// The blends are lowered in one of three fundamental ways. If there are few
14441/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14442/// of the input is significantly cheaper when lowered as an interleaving of
14443/// the two inputs, try to interleave them. Otherwise, blend the low and high
14444/// halves of the inputs separately (making them have relatively few inputs)
14445/// and then concatenate them.
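/// For example, the two-input mask <0,8,1,9,2,10,3,11> is exactly the
/// PUNPCKLWD interleaving pattern and is handled by the dedicated unpack
/// lowering below.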
14446static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14447 const APInt &Zeroable, SDValue V1, SDValue V2,
14448 const X86Subtarget &Subtarget,
14449 SelectionDAG &DAG) {
14450 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14451 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14452 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14453
14454 // Whenever we can lower this as a zext, that instruction is strictly faster
14455 // than any alternative.
14456 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14457 Zeroable, Subtarget, DAG))
14458 return ZExt;
14459
14460 // Try to lower using a truncation.
14461 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14462 Subtarget, DAG))
14463 return V;
14464
14465 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14466
14467 if (NumV2Inputs == 0) {
14468 // Try to use shift instructions.
14469 if (SDValue Shift =
14470 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14471 Subtarget, DAG, /*BitwiseOnly*/ false))
14472 return Shift;
14473
14474 // Check for being able to broadcast a single element.
14475 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14476 Mask, Subtarget, DAG))
14477 return Broadcast;
14478
14479 // Try to use bit rotation instructions.
14480 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14481 Subtarget, DAG))
14482 return Rotate;
14483
14484 // Use dedicated unpack instructions for masks that match their pattern.
14485 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14486 return V;
14487
14488 // Use dedicated pack instructions for masks that match their pattern.
14489 if (SDValue V =
14490 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14491 return V;
14492
14493 // Try to use byte rotation instructions.
14494 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14495 Subtarget, DAG))
14496 return Rotate;
14497
14498 // Make a copy of the mask so it can be modified.
14499 SmallVector<int, 8> MutableMask(Mask);
14500 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14501 Subtarget, DAG);
14502 }
14503
14504 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14505 "All single-input shuffles should be canonicalized to be V1-input "
14506 "shuffles.");
14507
14508 // Try to use shift instructions.
14509 if (SDValue Shift =
14510 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14511 DAG, /*BitwiseOnly*/ false))
14512 return Shift;
14513
14514 // See if we can use SSE4A Extraction / Insertion.
14515 if (Subtarget.hasSSE4A())
14516 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14517 Zeroable, DAG))
14518 return V;
14519
14520 // There are special ways we can lower some single-element blends.
14521 if (NumV2Inputs == 1)
14522 if (SDValue V = lowerShuffleAsElementInsertion(
14523 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14524 return V;
14525
14526 // We have different paths for blend lowering, but they all must use the
14527 // *exact* same predicate.
14528 bool IsBlendSupported = Subtarget.hasSSE41();
14529 if (IsBlendSupported)
14530 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14531 Zeroable, Subtarget, DAG))
14532 return Blend;
14533
14534 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14535 Zeroable, Subtarget, DAG))
14536 return Masked;
14537
14538 // Use dedicated unpack instructions for masks that match their pattern.
14539 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14540 return V;
14541
14542 // Use dedicated pack instructions for masks that match their pattern.
14543 if (SDValue V =
14544 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14545 return V;
14546
14547 // Try to lower using a truncation.
14548 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14549 Subtarget, DAG))
14550 return V;
14551
14552 // Try to use byte rotation instructions.
14553 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14554 Subtarget, DAG))
14555 return Rotate;
14556
14557 if (SDValue BitBlend =
14558 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14559 return BitBlend;
14560
14561 // Try to use byte shift instructions to mask.
14562 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14563 Zeroable, Subtarget, DAG))
14564 return V;
14565
14566 // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
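// E.g. for the even compaction <0,2,4,6,8,10,12,14> (NumEvenDrops == 1) on
// SSE4.1, both inputs are ANDed with a {0xFFFF,0xFFFF,0xFFFF,0xFFFF} v4i32
// mask to clear the odd words and a single PACKUSDW produces the result.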
14567 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
14568 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14569 !Subtarget.hasVLX()) {
14570 // Check if this is part of a 256-bit vector truncation.
14571 unsigned PackOpc = 0;
14572 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14575 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14576 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14577 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14578 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14579 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14580 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14581 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14582 PackOpc = X86ISD::PACKUS;
14583 } else if (Subtarget.hasSSE41()) {
14584 SmallVector<SDValue, 4> DWordClearOps(4,
14585 DAG.getConstant(0, DL, MVT::i32));
14586 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14587 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14588 SDValue DWordClearMask =
14589 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14590 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14591 DWordClearMask);
14592 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14593 DWordClearMask);
14594 PackOpc = X86ISD::PACKUS;
14595 } else if (!Subtarget.hasSSSE3()) {
14596 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14597 V1 = DAG.getBitcast(MVT::v4i32, V1);
14598 V2 = DAG.getBitcast(MVT::v4i32, V2);
14599 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14600 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14601 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14602 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14603 PackOpc = X86ISD::PACKSS;
14604 }
14605 if (PackOpc) {
14606 // Now pack things back together.
14607 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14608 if (NumEvenDrops == 2) {
14609 Result = DAG.getBitcast(MVT::v4i32, Result);
14610 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14611 }
14612 return Result;
14613 }
14614 }
14615
14616 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
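// E.g. <1,3,5,7,9,11,13,15> shifts each dword of both inputs right by 16 and
// re-packs them, so the odd words of V1 and V2 form the result.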
14617 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14618 if (NumOddDrops == 1) {
14619 bool HasSSE41 = Subtarget.hasSSE41();
14620 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14621 DAG.getBitcast(MVT::v4i32, V1),
14622 DAG.getTargetConstant(16, DL, MVT::i8));
14623 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14624 DAG.getBitcast(MVT::v4i32, V2),
14625 DAG.getTargetConstant(16, DL, MVT::i8));
14626 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14627 MVT::v8i16, V1, V2);
14628 }
14629
14630 // Try to lower by permuting the inputs into an unpack instruction.
14631 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14632 Mask, Subtarget, DAG))
14633 return Unpack;
14634
14635 // If we can't directly blend but can use PSHUFB, that will be better as it
14636 // can both shuffle and set up the inefficient blend.
14637 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14638 bool V1InUse, V2InUse;
14639 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14640 Zeroable, DAG, V1InUse, V2InUse);
14641 }
14642
14643 // We can always bit-blend if we have to, so the fallback strategy is to
14644 // decompose into single-input permutes and blends/unpacks.
14645 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14646 Zeroable, Subtarget, DAG);
14647}
14648
14649/// Lower 8-lane 16-bit floating point shuffles.
14650static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14651 const APInt &Zeroable, SDValue V1, SDValue V2,
14652 const X86Subtarget &Subtarget,
14653 SelectionDAG &DAG) {
14654 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14655 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14656 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14657 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14658
14659 if (Subtarget.hasFP16()) {
14660 if (NumV2Elements == 0) {
14661 // Check for being able to broadcast a single element.
14662 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14663 Mask, Subtarget, DAG))
14664 return Broadcast;
14665 }
14666 if (NumV2Elements == 1 && Mask[0] >= 8)
14667 if (SDValue V = lowerShuffleAsElementInsertion(
14668 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14669 return V;
14670 }
14671
14672 V1 = DAG.getBitcast(MVT::v8i16, V1);
14673 V2 = DAG.getBitcast(MVT::v8i16, V2);
14674 return DAG.getBitcast(MVT::v8f16,
14675 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14676}
14677
14678// Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
14679// sub-512-bit shuffles are padded to 512 bits for the shuffle and then
14680// the active subvector is extracted.
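// For example, a v16i8 shuffle on an AVX512VBMI target without VLX widens
// V1/V2 to v64i8, rebases mask entries that refer to V2 (>= 16) by 48 so they
// index the second widened operand, performs the shuffle as a VPERMV3 node
// (VPERMI2B/VPERMT2B), and extracts the low 128 bits of the result.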
14681static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14682 ArrayRef<int> OriginalMask, SDValue V1,
14683 SDValue V2, const X86Subtarget &Subtarget,
14684 SelectionDAG &DAG) {
14685 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14686 SmallVector<int, 32> Mask(OriginalMask);
14687 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14688 !isShuffleFoldableLoad(V2)) {
14689 ShuffleVectorSDNode::commuteMask(Mask);
14690 std::swap(V1, V2);
14691 }
14692
14693 MVT MaskVT = VT.changeTypeToInteger();
14694 SDValue MaskNode;
14695 MVT ShuffleVT = VT;
14696 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14697 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14698 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14699 ShuffleVT = V1.getSimpleValueType();
14700
14701 // Adjust mask to correct indices for the second input.
14702 int NumElts = VT.getVectorNumElements();
14703 unsigned Scale = 512 / VT.getSizeInBits();
14704 SmallVector<int, 32> AdjustedMask(Mask);
14705 for (int &M : AdjustedMask)
14706 if (NumElts <= M)
14707 M += (Scale - 1) * NumElts;
14708 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14709 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14710 } else {
14711 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14712 }
14713
14714 SDValue Result;
14715 if (V2.isUndef())
14716 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14717 else
14718 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14719
14720 if (VT != ShuffleVT)
14721 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14722
14723 return Result;
14724}
14725
14726/// Generic lowering of v16i8 shuffles.
14727///
14728/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14729/// detect any complexity reducing interleaving. If that doesn't help, it uses
14730/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14731/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14732/// back together.
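/// For example, the interleaving mask <0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23>
/// is matched directly as PUNPCKLBW by the dedicated unpack lowering.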
14733static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14734 const APInt &Zeroable, SDValue V1, SDValue V2,
14735 const X86Subtarget &Subtarget,
14736 SelectionDAG &DAG) {
14737 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14738 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14739 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14740
14741 // Try to use shift instructions.
14742 if (SDValue Shift =
14743 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14744 DAG, /*BitwiseOnly*/ false))
14745 return Shift;
14746
14747 // Try to use byte rotation instructions.
14748 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14749 Subtarget, DAG))
14750 return Rotate;
14751
14752 // Use dedicated pack instructions for masks that match their pattern.
14753 if (SDValue V =
14754 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14755 return V;
14756
14757 // Try to use a zext lowering.
14758 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14759 Zeroable, Subtarget, DAG))
14760 return ZExt;
14761
14762 // Try to lower using a truncation.
14763 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14764 Subtarget, DAG))
14765 return V;
14766
14767 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14768 Subtarget, DAG))
14769 return V;
14770
14771 // See if we can use SSE4A Extraction / Insertion.
14772 if (Subtarget.hasSSE4A())
14773 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14774 Zeroable, DAG))
14775 return V;
14776
14777 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14778
14779 // For single-input shuffles, there are some nicer lowering tricks we can use.
14780 if (NumV2Elements == 0) {
14781 // Check for being able to broadcast a single element.
14782 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14783 Mask, Subtarget, DAG))
14784 return Broadcast;
14785
14786 // Try to use bit rotation instructions.
14787 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14788 Subtarget, DAG))
14789 return Rotate;
14790
14791 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14792 return V;
14793
14794 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14795 // Notably, this handles splat and partial-splat shuffles more efficiently.
14796 // However, it only makes sense if the pre-duplication shuffle simplifies
14797 // things significantly. Currently, this means we need to be able to
14798 // express the pre-duplication shuffle as an i16 shuffle.
14799 //
14800 // FIXME: We should check for other patterns which can be widened into an
14801 // i16 shuffle as well.
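// For example, the byte-duplication mask <0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7>
// needs no pre- or post-duplication i16 shuffle at all and becomes a single
// PUNPCKLBW of V1 with itself.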
14802 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14803 for (int i = 0; i < 16; i += 2)
14804 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14805 return false;
14806
14807 return true;
14808 };
14809 auto tryToWidenViaDuplication = [&]() -> SDValue {
14810 if (!canWidenViaDuplication(Mask))
14811 return SDValue();
14812 SmallVector<int, 4> LoInputs;
14813 copy_if(Mask, std::back_inserter(LoInputs),
14814 [](int M) { return M >= 0 && M < 8; });
14815 array_pod_sort(LoInputs.begin(), LoInputs.end());
14816 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14817 SmallVector<int, 4> HiInputs;
14818 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14819 array_pod_sort(HiInputs.begin(), HiInputs.end());
14820 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14821
14822 bool TargetLo = LoInputs.size() >= HiInputs.size();
14823 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14824 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14825
14826 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14827 SmallDenseMap<int, int, 8> LaneMap;
14828 for (int I : InPlaceInputs) {
14829 PreDupI16Shuffle[I/2] = I/2;
14830 LaneMap[I] = I;
14831 }
14832 int j = TargetLo ? 0 : 4, je = j + 4;
14833 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14834 // Check if j is already a shuffle of this input. This happens when
14835 // there are two adjacent bytes after we move the low one.
14836 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14837 // If we haven't yet mapped the input, search for a slot into which
14838 // we can map it.
14839 while (j < je && PreDupI16Shuffle[j] >= 0)
14840 ++j;
14841
14842 if (j == je)
14843 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14844 return SDValue();
14845
14846 // Map this input with the i16 shuffle.
14847 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14848 }
14849
14850 // Update the lane map based on the mapping we ended up with.
14851 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14852 }
14853 V1 = DAG.getBitcast(
14854 MVT::v16i8,
14855 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14856 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14857
14858 // Unpack the bytes to form the i16s that will be shuffled into place.
14859 bool EvenInUse = false, OddInUse = false;
14860 for (int i = 0; i < 16; i += 2) {
14861 EvenInUse |= (Mask[i + 0] >= 0);
14862 OddInUse |= (Mask[i + 1] >= 0);
14863 if (EvenInUse && OddInUse)
14864 break;
14865 }
14866 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14867 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14868 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14869
14870 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14871 for (int i = 0; i < 16; ++i)
14872 if (Mask[i] >= 0) {
14873 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14874 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14875 if (PostDupI16Shuffle[i / 2] < 0)
14876 PostDupI16Shuffle[i / 2] = MappedMask;
14877 else
14878 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14879 "Conflicting entries in the original shuffle!");
14880 }
14881 return DAG.getBitcast(
14882 MVT::v16i8,
14883 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14884 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14885 };
14886 if (SDValue V = tryToWidenViaDuplication())
14887 return V;
14888 }
14889
14890 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14891 Zeroable, Subtarget, DAG))
14892 return Masked;
14893
14894 // Use dedicated unpack instructions for masks that match their pattern.
14895 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14896 return V;
14897
14898 // Try to use byte shift instructions to mask.
14899 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14900 Zeroable, Subtarget, DAG))
14901 return V;
14902
14903 // Check for compaction patterns.
14904 bool IsSingleInput = V2.isUndef();
14905 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14906
14907 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14908 // with PSHUFB. It is important to do this before we attempt to generate any
14909 // blends but after all of the single-input lowerings. If the single input
14910 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14911 // want to preserve that and we can DAG combine any longer sequences into
14912 // a PSHUFB in the end. But once we start blending from multiple inputs,
14913 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14914 // and there are *very* few patterns that would actually be faster than the
14915 // PSHUFB approach because of its ability to zero lanes.
14916 //
14917 // If the mask is a binary compaction, we can more efficiently perform this
14918 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14919 //
14920 // FIXME: The only exceptions to the above are blends which are exact
14921 // interleavings with direct instructions supporting them. We currently don't
14922 // handle those well here.
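// E.g. the two-input byte compaction <0,2,4,...,30> is *not* lowered with
// PSHUFB here; it falls through and becomes PACKUSWB(AND(V1), AND(V2)) below.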
14923 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14924 bool V1InUse = false;
14925 bool V2InUse = false;
14926
14927 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14928 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14929
14930 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14931 // do so. This avoids using them to handle blends-with-zero which is
14932 // important as a single pshufb is significantly faster for that.
14933 if (V1InUse && V2InUse) {
14934 if (Subtarget.hasSSE41())
14935 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14936 Zeroable, Subtarget, DAG))
14937 return Blend;
14938
14939 // We can use an unpack to do the blending rather than an or in some
14940 // cases. Even though the OR may be (very slightly) more efficient, we
14941 // prefer this lowering because there are common cases where part of
14942 // the complexity of the shuffles goes away when we do the final blend as
14943 // an unpack.
14944 // FIXME: It might be worth trying to detect if the unpack-feeding
14945 // shuffles will both be pshufb, in which case we shouldn't bother with
14946 // this.
14947 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14948 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14949 return Unpack;
14950
14951 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14952 if (Subtarget.hasVBMI())
14953 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14954 DAG);
14955
14956 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14957 if (Subtarget.hasXOP()) {
14958 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14959 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14960 }
14961
14962 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14963 // PALIGNR will be cheaper than the second PSHUFB+OR.
14964 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14965 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14966 return V;
14967 }
14968
14969 return PSHUFB;
14970 }
14971
14972 // There are special ways we can lower some single-element blends.
14973 if (NumV2Elements == 1)
14974 if (SDValue V = lowerShuffleAsElementInsertion(
14975 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14976 return V;
14977
14978 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14979 return Blend;
14980
14981 // Check whether a compaction lowering can be done. This handles shuffles
14982 // which take every Nth element for some even N. See the helper function for
14983 // details.
14984 //
14985 // We special case these as they can be particularly efficiently handled with
14986 // the PACKUSWB instruction on x86 and they show up in common patterns of
14987 // rearranging bytes to truncate wide elements.
14988 if (NumEvenDrops) {
14989 // NumEvenDrops is the power of two stride of the elements. Another way of
14990 // thinking about it is that we need to drop the even elements this many
14991 // times to get the original input.
14992
14993 // First we need to zero all the dropped bytes.
14994 assert(NumEvenDrops <= 3 &&
14995 "No support for dropping even elements more than 3 times.");
14996 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14997 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14998 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14999 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15000 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15001 WordClearMask);
15002 if (!IsSingleInput)
15003 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15004 WordClearMask);
15005
15006 // Now pack things back together.
15007 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15008 IsSingleInput ? V1 : V2);
15009 for (int i = 1; i < NumEvenDrops; ++i) {
15010 Result = DAG.getBitcast(MVT::v8i16, Result);
15011 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15012 }
15013 return Result;
15014 }
15015
15016 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
15017 if (NumOddDrops == 1) {
15018 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15019 DAG.getBitcast(MVT::v8i16, V1),
15020 DAG.getTargetConstant(8, DL, MVT::i8));
15021 if (!IsSingleInput)
15022 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15023 DAG.getBitcast(MVT::v8i16, V2),
15024 DAG.getTargetConstant(8, DL, MVT::i8));
15025 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15026 IsSingleInput ? V1 : V2);
15027 }
15028
15029 // Handle multi-input cases by blending/unpacking single-input shuffles.
15030 if (NumV2Elements > 0)
15031 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15032 Zeroable, Subtarget, DAG);
15033
15034 // The fallback path for single-input shuffles widens this into two v8i16
15035 // vectors with unpacks, shuffles those, and then pulls them back together
15036 // with a pack.
15037 SDValue V = V1;
15038
15039 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15040 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15041 for (int i = 0; i < 16; ++i)
15042 if (Mask[i] >= 0)
15043 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15044
15045 SDValue VLoHalf, VHiHalf;
15046 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15047 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15048 // i16s.
15049 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15050 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15051 // Use a mask to drop the high bytes.
15052 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15053 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15054 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15055
15056 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15057 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15058
15059 // Squash the masks to point directly into VLoHalf.
15060 for (int &M : LoBlendMask)
15061 if (M >= 0)
15062 M /= 2;
15063 for (int &M : HiBlendMask)
15064 if (M >= 0)
15065 M /= 2;
15066 } else {
15067 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15068 // VHiHalf so that we can blend them as i16s.
15069 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15070
15071 VLoHalf = DAG.getBitcast(
15072 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15073 VHiHalf = DAG.getBitcast(
15074 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15075 }
15076
15077 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15078 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15079
15080 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15081}
15082
15083/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15084///
15085/// This routine breaks down the specific type of 128-bit shuffle and
15086/// dispatches to the lowering routines accordingly.
15087static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15088 MVT VT, SDValue V1, SDValue V2,
15089 const APInt &Zeroable,
15090 const X86Subtarget &Subtarget,
15091 SelectionDAG &DAG) {
15092 if (VT == MVT::v8bf16) {
15093 V1 = DAG.getBitcast(MVT::v8i16, V1);
15094 V2 = DAG.getBitcast(MVT::v8i16, V2);
15095 return DAG.getBitcast(VT,
15096 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15097 }
15098
15099 switch (VT.SimpleTy) {
15100 case MVT::v2i64:
15101 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15102 case MVT::v2f64:
15103 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15104 case MVT::v4i32:
15105 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15106 case MVT::v4f32:
15107 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15108 case MVT::v8i16:
15109 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15110 case MVT::v8f16:
15111 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15112 case MVT::v16i8:
15113 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15114
15115 default:
15116 llvm_unreachable("Unimplemented!");
15117 }
15118}
15119
15120/// Generic routine to split vector shuffle into half-sized shuffles.
15121///
15122/// This routine just extracts two subvectors, shuffles them independently, and
15123/// then concatenates them back together. This should work effectively with all
15124/// AVX vector shuffle types.
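/// For example, a v8f32 mask <0,8,1,9,4,12,5,13> splits into the two v4f32
/// shuffles <0,4,1,5> (over the low halves of V1/V2) and <0,4,1,5> (over the
/// high halves), whose results are concatenated back together.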
15125static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15126 SDValue V2, ArrayRef<int> Mask,
15127 SelectionDAG &DAG, bool SimpleOnly) {
15128 assert(VT.getSizeInBits() >= 256 &&
15129 "Only for 256-bit or wider vector shuffles!");
15130 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15131 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15132
15133 // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
15134 if (VT == MVT::v8f32) {
15135 SDValue BC1 = peekThroughBitcasts(V1);
15136 SDValue BC2 = peekThroughBitcasts(V2);
15137 if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
15138 if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
15139 DAG, SimpleOnly))
15140 return DAG.getBitcast(VT, Split);
15141 }
15142 }
15143
15144 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15145 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15146
15147 int NumElements = VT.getVectorNumElements();
15148 int SplitNumElements = NumElements / 2;
15149 MVT ScalarVT = VT.getVectorElementType();
15150 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15151
15152 // Use splitVector/extractSubVector so that split build-vectors just build two
15153 // narrower build vectors. This helps shuffling with splats and zeros.
15154 auto SplitVector = [&](SDValue V) {
15155 SDValue LoV, HiV;
15156 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15157 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15158 DAG.getBitcast(SplitVT, HiV));
15159 };
15160
15161 SDValue LoV1, HiV1, LoV2, HiV2;
15162 std::tie(LoV1, HiV1) = SplitVector(V1);
15163 std::tie(LoV2, HiV2) = SplitVector(V2);
15164
15165 // Now create two 4-way blends of these half-width vectors.
15166 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
15167 bool &UseHiV1, bool &UseLoV2,
15168 bool &UseHiV2) {
15169 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
15170 for (int i = 0; i < SplitNumElements; ++i) {
15171 int M = HalfMask[i];
15172 if (M >= NumElements) {
15173 if (M >= NumElements + SplitNumElements)
15174 UseHiV2 = true;
15175 else
15176 UseLoV2 = true;
15177 } else if (M >= 0) {
15178 if (M >= SplitNumElements)
15179 UseHiV1 = true;
15180 else
15181 UseLoV1 = true;
15182 }
15183 }
15184 };
15185
15186 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
15187 if (!SimpleOnly)
15188 return true;
15189
15190 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15191 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15192
15193 return !(UseHiV1 || UseHiV2);
15194 };
15195
15196 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15197 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15198 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15199 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15200 for (int i = 0; i < SplitNumElements; ++i) {
15201 int M = HalfMask[i];
15202 if (M >= NumElements) {
15203 V2BlendMask[i] = M - NumElements;
15204 BlendMask[i] = SplitNumElements + i;
15205 } else if (M >= 0) {
15206 V1BlendMask[i] = M;
15207 BlendMask[i] = i;
15208 }
15209 }
15210
15211 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15212 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15213
15214 // Because the lowering happens after all combining takes place, we need to
15215 // manually combine these blend masks as much as possible so that we create
15216 // a minimal number of high-level vector shuffle nodes.
15217 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
15218
15219 // First try just blending the halves of V1 or V2.
15220 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15221 return DAG.getUNDEF(SplitVT);
15222 if (!UseLoV2 && !UseHiV2)
15223 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15224 if (!UseLoV1 && !UseHiV1)
15225 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15226
15227 SDValue V1Blend, V2Blend;
15228 if (UseLoV1 && UseHiV1) {
15229 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15230 } else {
15231 // We only use half of V1 so map the usage down into the final blend mask.
15232 V1Blend = UseLoV1 ? LoV1 : HiV1;
15233 for (int i = 0; i < SplitNumElements; ++i)
15234 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15235 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15236 }
15237 if (UseLoV2 && UseHiV2) {
15238 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15239 } else {
15240 // We only use half of V2 so map the usage down into the final blend mask.
15241 V2Blend = UseLoV2 ? LoV2 : HiV2;
15242 for (int i = 0; i < SplitNumElements; ++i)
15243 if (BlendMask[i] >= SplitNumElements)
15244 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15245 }
15246 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15247 };
15248
15249 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
15250 return SDValue();
15251
15252 SDValue Lo = HalfBlend(LoMask);
15253 SDValue Hi = HalfBlend(HiMask);
15254 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15255}
15256
15257/// Either split a vector in halves or decompose the shuffles and the
15258/// blend/unpack.
15259///
15260/// This is provided as a good fallback for many lowerings of non-single-input
15261/// shuffles with more than one 128-bit lane. In those cases, we want to select
15262/// between splitting the shuffle into 128-bit components and stitching those
15263/// back together vs. extracting the single-input shuffles and blending those
15264/// results.
15265static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15266 SDValue V2, ArrayRef<int> Mask,
15267 const APInt &Zeroable,
15268 const X86Subtarget &Subtarget,
15269 SelectionDAG &DAG) {
15270 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15271 "shuffles as it could then recurse on itself.");
15272 int Size = Mask.size();
15273
15274 // If this can be modeled as a broadcast of two elements followed by a blend,
15275 // prefer that lowering. This is especially important because broadcasts can
15276 // often fold with memory operands.
15277 auto DoBothBroadcast = [&] {
15278 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15279 for (int M : Mask)
15280 if (M >= Size) {
15281 if (V2BroadcastIdx < 0)
15282 V2BroadcastIdx = M - Size;
15283 else if ((M - Size) != V2BroadcastIdx &&
15284 !IsElementEquivalent(Size, V2, V2, M - Size, V2BroadcastIdx))
15285 return false;
15286 } else if (M >= 0) {
15287 if (V1BroadcastIdx < 0)
15288 V1BroadcastIdx = M;
15289 else if (M != V1BroadcastIdx &&
15290 !IsElementEquivalent(Size, V1, V1, M, V1BroadcastIdx))
15291 return false;
15292 }
15293 return true;
15294 };
15295 if (DoBothBroadcast())
15296 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15297 Subtarget, DAG);
15298
15299 // If the inputs all stem from a single 128-bit lane of each input, then we
15300 // split them rather than blending because the split will decompose to
15301 // unusually few instructions.
15302 int LaneCount = VT.getSizeInBits() / 128;
15303 int LaneSize = Size / LaneCount;
15304 SmallBitVector LaneInputs[2];
15305 LaneInputs[0].resize(LaneCount, false);
15306 LaneInputs[1].resize(LaneCount, false);
15307 for (int i = 0; i < Size; ++i)
15308 if (Mask[i] >= 0)
15309 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15310 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15311 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15312 /*SimpleOnly*/ false);
15313
15314 // Without AVX2, if we can freely split the subvectors then we're better off
15315 // performing half width shuffles.
15316 if (!Subtarget.hasAVX2()) {
15317 SDValue BC1 = peekThroughBitcasts(V1);
15318 SDValue BC2 = peekThroughBitcasts(V2);
15319 bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
15320 DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
15321 bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
15322 DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
15323 if (SplatOrSplitV1 && SplatOrSplitV2)
15324 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15325 /*SimpleOnly*/ false);
15326 }
15327
15328 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15329 // requires that the decomposed single-input shuffles don't end up here.
15330 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15331 Subtarget, DAG);
15332}
15333
15334// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15335// TODO: Extend to support v8f32 (+ 512-bit shuffles).
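// E.g. the v4f64 mask <1,5,2,7> builds LHS = shuffle<u,1,2,u>(V1,V2) and
// RHS = shuffle<u,5,u,7>(V1,V2), then emits SHUFPD(LHS, RHS) with the
// immediate bits {1,1,0,1}.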
15336static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15337 SDValue V1, SDValue V2,
15338 ArrayRef<int> Mask,
15339 SelectionDAG &DAG) {
15340 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15341
15342 int LHSMask[4] = {-1, -1, -1, -1};
15343 int RHSMask[4] = {-1, -1, -1, -1};
15344 int SHUFPDMask[4] = {-1, -1, -1, -1};
15345
15346 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15347 // perform the shuffle once the lanes have been shuffled in place.
15348 for (int i = 0; i != 4; ++i) {
15349 int M = Mask[i];
15350 if (M < 0)
15351 continue;
15352 int LaneBase = i & ~1;
15353 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15354 LaneMask[LaneBase + (M & 1)] = M;
15355 SHUFPDMask[i] = M & 1;
15356 }
15357
15358 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15359 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15360 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15361 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15362}
15363
15364/// Lower a vector shuffle crossing multiple 128-bit lanes as
15365/// a lane permutation followed by a per-lane permutation.
15366///
15367/// This is mainly for cases where we can have non-repeating permutes
15368/// in each lane.
15369///
15370/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15371/// we should investigate merging them.
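/// For example, this lowers the single-input v8i32 reversal <7,6,5,4,3,2,1,0>
/// as the lane permute <4,5,6,7,0,1,2,3> followed by the per-lane permute
/// <3,2,1,0,7,6,5,4>.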
15372static SDValue lowerShuffleAsLanePermuteAndPermute(
15373 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15374 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15375 int NumElts = VT.getVectorNumElements();
15376 int NumLanes = VT.getSizeInBits() / 128;
15377 int NumEltsPerLane = NumElts / NumLanes;
15378 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15379
15380 /// Attempts to find a sublane permute with the given size
15381 /// that gets all elements into their target lanes.
15382 ///
15383 /// If successful, returns the lowered shuffle (a cross-lane permute
15384 /// followed by an in-lane permute); otherwise returns an empty SDValue.
15385 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15386 int NumSublanesPerLane = NumSublanes / NumLanes;
15387 int NumEltsPerSublane = NumElts / NumSublanes;
15388
15389 SmallVector<int, 16> CrossLaneMask;
15390 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15391 // Like CrossLaneMask, but with one entry per sublane.
15392 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15393 APInt DemandedCrossLane = APInt::getZero(NumElts);
15394
15395 for (int i = 0; i != NumElts; ++i) {
15396 int M = Mask[i];
15397 if (M < 0)
15398 continue;
15399
15400 int SrcSublane = M / NumEltsPerSublane;
15401 int DstLane = i / NumEltsPerLane;
15402
15403 // We only need to get the elements into the right lane, not sublane.
15404 // So search all sublanes that make up the destination lane.
15405 bool Found = false;
15406 int DstSubStart = DstLane * NumSublanesPerLane;
15407 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15408 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15409 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15410 continue;
15411
15412 Found = true;
15413 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15414 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15415 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15416 DemandedCrossLane.setBit(InLaneMask[i]);
15417 break;
15418 }
15419 if (!Found)
15420 return SDValue();
15421 }
15422
15423 // Fill CrossLaneMask using CrossLaneMaskLarge.
15424 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15425
15426 if (!CanUseSublanes) {
15427 // If we're only shuffling a single lowest lane and the rest are identity
15428 // then don't bother.
15429 // TODO - isShuffleMaskInputInPlace could be extended to something like
15430 // this.
15431 int NumIdentityLanes = 0;
15432 bool OnlyShuffleLowestLane = true;
15433 for (int i = 0; i != NumLanes; ++i) {
15434 int LaneOffset = i * NumEltsPerLane;
15435 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15436 i * NumEltsPerLane))
15437 NumIdentityLanes++;
15438 else if (CrossLaneMask[LaneOffset] != 0)
15439 OnlyShuffleLowestLane = false;
15440 }
15441 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15442 return SDValue();
15443 }
15444
15445 // Simplify CrossLaneMask based on the actual demanded elements.
15446 if (V1.hasOneUse())
15447 for (int i = 0; i != NumElts; ++i)
15448 if (!DemandedCrossLane[i])
15449 CrossLaneMask[i] = SM_SentinelUndef;
15450
15451 // Avoid returning the same shuffle operation. For example,
15452 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15453 // undef:v16i16
15454 if (CrossLaneMask == Mask || InLaneMask == Mask)
15455 return SDValue();
15456
15457 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15458 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15459 InLaneMask);
15460 };
15461
15462 // First attempt a solution with full lanes.
15463 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15464 return V;
15465
15466 // The rest of the solutions use sublanes.
15467 if (!CanUseSublanes)
15468 return SDValue();
15469
15470 // Then attempt a solution with 64-bit sublanes (vpermq).
15471 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15472 return V;
15473
15474 // If that doesn't work and we have fast variable cross-lane shuffle,
15475 // attempt 32-bit sublanes (vpermd).
15476 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15477 return SDValue();
15478
15479 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15480}
15481
15482/// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
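/// E.g. for the v4f64 mask <3,2,1,0> with LaneSize == 2, every element crosses
/// lanes, so the in-lane mask becomes <5,4,7,6>: each entry is rewritten to
/// take the matching element from the second shuffle operand (the lane-swapped
/// copy in the lowering below).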
15483static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15484 SmallVector<int> &InLaneMask) {
15485 int Size = Mask.size();
15486 InLaneMask.assign(Mask.begin(), Mask.end());
15487 for (int i = 0; i < Size; ++i) {
15488 int &M = InLaneMask[i];
15489 if (M < 0)
15490 continue;
15491 if (((M % Size) / LaneSize) != (i / LaneSize))
15492 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15493 }
15494}
15495
15496/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15497/// source with a lane permutation.
15498///
15499/// This lowering strategy results in four instructions in the worst case for a
15500/// single-input cross lane shuffle which is lower than any other fully general
15501/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15502/// shuffle pattern should be handled prior to trying this lowering.
15503 static SDValue lowerShuffleAsLanePermuteAndShuffle(
15504    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15505 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15506 // FIXME: This should probably be generalized for 512-bit vectors as well.
15507 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15508 int Size = Mask.size();
15509 int LaneSize = Size / 2;
15510
15511 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15512 // Only do this if the elements aren't all from the lower lane,
15513 // otherwise we're (probably) better off doing a split.
15514 if (VT == MVT::v4f64 &&
15515 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15516 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
15517
15518 // If there are only inputs from one 128-bit lane, splitting will in fact be
15519 // less expensive. The flags track whether the given lane contains an element
15520 // that crosses to another lane.
15521 bool AllLanes;
15522 if (!Subtarget.hasAVX2()) {
15523 bool LaneCrossing[2] = {false, false};
15524 for (int i = 0; i < Size; ++i)
15525 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15526 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15527 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15528 } else {
15529 bool LaneUsed[2] = {false, false};
15530 for (int i = 0; i < Size; ++i)
15531 if (Mask[i] >= 0)
15532 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15533 AllLanes = LaneUsed[0] && LaneUsed[1];
15534 }
15535
15536 // TODO - we could support shuffling V2 in the Flipped input.
15537 assert(V2.isUndef() &&
15538 "This last part of this routine only works on single input shuffles");
15539
15540 SmallVector<int> InLaneMask;
15541 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15542
15543 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15544 "In-lane shuffle mask expected");
15545
15546   // If we're not using both input lanes and the in-lane mask is not
15547   // repeating, then we're better off splitting.
15548 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
15549 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15550 /*SimpleOnly*/ false);
15551
15552 // Flip the lanes, and shuffle the results which should now be in-lane.
15553 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15554 SDValue Flipped = DAG.getBitcast(PVT, V1);
15555 Flipped =
15556 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15557 Flipped = DAG.getBitcast(VT, Flipped);
15558 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15559}
15560
15561/// Handle lowering 2-lane 128-bit shuffles.
15562 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15563                                  SDValue V2, ArrayRef<int> Mask,
15564 const APInt &Zeroable,
15565 const X86Subtarget &Subtarget,
15566 SelectionDAG &DAG) {
15567 if (V2.isUndef()) {
15568 // Attempt to match VBROADCAST*128 subvector broadcast load.
15569 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
15570 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
15571 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
15573 MVT MemVT = VT.getHalfNumVectorElementsVT();
15574 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
15577 VT, MemVT, Ld, Ofs, DAG))
15578 return BcstLd;
15579 }
15580
15581 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15582 if (Subtarget.hasAVX2())
15583 return SDValue();
15584 }
15585
15586 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15587
15588 SmallVector<int, 4> WidenedMask;
15589 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15590 return SDValue();
15591
15592 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15593 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15594
15595 // Try to use an insert into a zero vector.
15596 if (WidenedMask[0] == 0 && IsHighZero) {
15597 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15598 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15599 DAG.getVectorIdxConstant(0, DL));
15600 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15601 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15602 DAG.getVectorIdxConstant(0, DL));
15603 }
15604
15605 // TODO: If minimizing size and one of the inputs is a zero vector and the
15606   // zero vector has only one use, we could use a VPERM2X128 to save the
15607 // instruction bytes needed to explicitly generate the zero vector.
15608
15609 // Blends are faster and handle all the non-lane-crossing cases.
15610 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15611 Subtarget, DAG))
15612 return Blend;
15613
15614 // If either input operand is a zero vector, use VPERM2X128 because its mask
15615 // allows us to replace the zero input with an implicit zero.
15616 if (!IsLowZero && !IsHighZero) {
15617 // Check for patterns which can be matched with a single insert of a 128-bit
15618 // subvector.
15619 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15620 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15621
15622 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15623 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15625 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15626 SDValue SubVec =
15627 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
15628 DAG.getVectorIdxConstant(0, DL));
15629 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15630 DAG.getVectorIdxConstant(2, DL));
15631 }
15632 }
15633
15634 // Try to use SHUF128 if possible.
15635 if (Subtarget.hasVLX()) {
15636 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15637 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15638 ((WidenedMask[1] % 2) << 1);
15639 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15640 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15641 }
15642 }
15643 }
15644
15645 // Otherwise form a 128-bit permutation. After accounting for undefs,
15646 // convert the 64-bit shuffle mask selection values into 128-bit
15647 // selection bits by dividing the indexes by 2 and shifting into positions
15648 // defined by a vperm2*128 instruction's immediate control byte.
15649
15650 // The immediate permute control byte looks like this:
15651 // [1:0] - select 128 bits from sources for low half of destination
15652 // [2] - ignore
15653 // [3] - zero low half of destination
15654 // [5:4] - select 128 bits from sources for high half of destination
15655 // [6] - ignore
15656 // [7] - zero high half of destination
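  // For example, a widened mask of <1, 2> (low half taken from the upper 128
  // bits of V1, high half from the lower 128 bits of V2) encodes to
  // PermMask = 0x21, while <3, u> with a zeroable high half encodes to
  // PermMask = 0x83.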
15657
15658 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15659 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15660
15661 unsigned PermMask = 0;
15662 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15663 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15664
15665 // Check the immediate mask and replace unused sources with undef.
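  // (Selection values 0/1 pick a half of V1 and 2/3 pick a half of V2, so a
  //  destination half reads from V1 exactly when its source-select bit (0x02
  //  or 0x20) and its zero bit (0x08 or 0x80) are both clear, and from V2 when
  //  only the source-select bit is set.)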
15666 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15667 V1 = DAG.getUNDEF(VT);
15668 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15669 V2 = DAG.getUNDEF(VT);
15670
15671 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15672 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15673}
15674
15675/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15676/// shuffling each lane.
15677///
15678/// This attempts to create a repeated lane shuffle where each lane uses one
15679/// or two of the lanes of the inputs. The lanes of the input vectors are
15680/// shuffled in one or two independent shuffles to get the lanes into the
15681/// position needed by the final shuffle.
15682 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15683     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15684 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15685 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15686
15687 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15688 return SDValue();
15689
15690 int NumElts = Mask.size();
15691 int NumLanes = VT.getSizeInBits() / 128;
15692 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15693 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15694 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15695
15696 // First pass will try to fill in the RepeatMask from lanes that need two
15697 // sources.
15698 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15699 int Srcs[2] = {-1, -1};
15700 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15701 for (int i = 0; i != NumLaneElts; ++i) {
15702 int M = Mask[(Lane * NumLaneElts) + i];
15703 if (M < 0)
15704 continue;
15705 // Determine which of the possible input lanes (NumLanes from each source)
15706 // this element comes from. Assign that as one of the sources for this
15707      // lane. We can assign up to 2 sources for this lane. If we run out of
15708      // sources, we can't do anything.
15709 int LaneSrc = M / NumLaneElts;
15710 int Src;
15711 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15712 Src = 0;
15713 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15714 Src = 1;
15715 else
15716 return SDValue();
15717
15718 Srcs[Src] = LaneSrc;
15719 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15720 }
15721
15722 // If this lane has two sources, see if it fits with the repeat mask so far.
15723 if (Srcs[1] < 0)
15724 continue;
15725
15726 LaneSrcs[Lane][0] = Srcs[0];
15727 LaneSrcs[Lane][1] = Srcs[1];
15728
15729 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15730 assert(M1.size() == M2.size() && "Unexpected mask size");
15731 for (int i = 0, e = M1.size(); i != e; ++i)
15732 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15733 return false;
15734 return true;
15735 };
15736
15737 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15738 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15739 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15740 int M = Mask[i];
15741 if (M < 0)
15742 continue;
15743 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15744 "Unexpected mask element");
15745 MergedMask[i] = M;
15746 }
15747 };
15748
15749 if (MatchMasks(InLaneMask, RepeatMask)) {
15750 // Merge this lane mask into the final repeat mask.
15751 MergeMasks(InLaneMask, RepeatMask);
15752 continue;
15753 }
15754
15755 // Didn't find a match. Swap the operands and try again.
15756 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15757     ShuffleVectorSDNode::commuteShuffleMask(InLaneMask, InLaneMask.size());
15758
15759 if (MatchMasks(InLaneMask, RepeatMask)) {
15760 // Merge this lane mask into the final repeat mask.
15761 MergeMasks(InLaneMask, RepeatMask);
15762 continue;
15763 }
15764
15765 // Couldn't find a match with the operands in either order.
15766 return SDValue();
15767 }
15768
15769 // Now handle any lanes with only one source.
15770 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15771 // If this lane has already been processed, skip it.
15772 if (LaneSrcs[Lane][0] >= 0)
15773 continue;
15774
15775 for (int i = 0; i != NumLaneElts; ++i) {
15776 int M = Mask[(Lane * NumLaneElts) + i];
15777 if (M < 0)
15778 continue;
15779
15780       // If RepeatMask isn't defined yet, we can define it ourselves.
15781 if (RepeatMask[i] < 0)
15782 RepeatMask[i] = M % NumLaneElts;
15783
15784 if (RepeatMask[i] < NumElts) {
15785 if (RepeatMask[i] != M % NumLaneElts)
15786 return SDValue();
15787 LaneSrcs[Lane][0] = M / NumLaneElts;
15788 } else {
15789 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15790 return SDValue();
15791 LaneSrcs[Lane][1] = M / NumLaneElts;
15792 }
15793 }
15794
15795 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15796 return SDValue();
15797 }
15798
15799 SmallVector<int, 16> NewMask(NumElts, -1);
15800 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15801 int Src = LaneSrcs[Lane][0];
15802 for (int i = 0; i != NumLaneElts; ++i) {
15803 int M = -1;
15804 if (Src >= 0)
15805 M = Src * NumLaneElts + i;
15806 NewMask[Lane * NumLaneElts + i] = M;
15807 }
15808 }
15809 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15810 // Ensure we didn't get back the shuffle we started with.
15811 // FIXME: This is a hack to make up for some splat handling code in
15812 // getVectorShuffle.
15813 if (isa<ShuffleVectorSDNode>(NewV1) &&
15814 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15815 return SDValue();
15816
15817 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15818 int Src = LaneSrcs[Lane][1];
15819 for (int i = 0; i != NumLaneElts; ++i) {
15820 int M = -1;
15821 if (Src >= 0)
15822 M = Src * NumLaneElts + i;
15823 NewMask[Lane * NumLaneElts + i] = M;
15824 }
15825 }
15826 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15827 // Ensure we didn't get back the shuffle we started with.
15828 // FIXME: This is a hack to make up for some splat handling code in
15829 // getVectorShuffle.
15830 if (isa<ShuffleVectorSDNode>(NewV2) &&
15831 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15832 return SDValue();
15833
15834 for (int i = 0; i != NumElts; ++i) {
15835 if (Mask[i] < 0) {
15836 NewMask[i] = -1;
15837 continue;
15838 }
15839 NewMask[i] = RepeatMask[i % NumLaneElts];
15840 if (NewMask[i] < 0)
15841 continue;
15842
15843 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15844 }
15845 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15846}
15847
15848/// If the input shuffle mask results in a vector that is undefined in all upper
15849/// or lower half elements and that mask accesses only 2 halves of the
15850/// shuffle's operands, return true. A mask of half the width with mask indexes
15851/// adjusted to access the extracted halves of the original shuffle operands is
15852/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15853/// lower half of each input operand is accessed.
15854static bool
15855 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15856                                 int &HalfIdx1, int &HalfIdx2) {
15857 assert((Mask.size() == HalfMask.size() * 2) &&
15858 "Expected input mask to be twice as long as output");
15859
15860 // Exactly one half of the result must be undef to allow narrowing.
15861 bool UndefLower = isUndefLowerHalf(Mask);
15862 bool UndefUpper = isUndefUpperHalf(Mask);
15863 if (UndefLower == UndefUpper)
15864 return false;
15865
15866 unsigned HalfNumElts = HalfMask.size();
15867 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15868 HalfIdx1 = -1;
15869 HalfIdx2 = -1;
15870 for (unsigned i = 0; i != HalfNumElts; ++i) {
15871 int M = Mask[i + MaskIndexOffset];
15872 if (M < 0) {
15873 HalfMask[i] = M;
15874 continue;
15875 }
15876
15877 // Determine which of the 4 half vectors this element is from.
15878 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15879 int HalfIdx = M / HalfNumElts;
15880
15881 // Determine the element index into its half vector source.
15882 int HalfElt = M % HalfNumElts;
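    // e.g. with HalfNumElts = 4, M = 6 refers to element 2 of the upper half
    // of V1 (HalfIdx = 1, HalfElt = 2), and M = 13 refers to element 1 of the
    // upper half of V2 (HalfIdx = 3, HalfElt = 1).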
15883
15884 // We can shuffle with up to 2 half vectors, set the new 'half'
15885 // shuffle mask accordingly.
15886 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15887 HalfMask[i] = HalfElt;
15888 HalfIdx1 = HalfIdx;
15889 continue;
15890 }
15891 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15892 HalfMask[i] = HalfElt + HalfNumElts;
15893 HalfIdx2 = HalfIdx;
15894 continue;
15895 }
15896
15897 // Too many half vectors referenced.
15898 return false;
15899 }
15900
15901 return true;
15902}
15903
15904/// Given the output values from getHalfShuffleMask(), create a half width
15905/// shuffle of extracted vectors followed by an insert back to full width.
15906 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15907                                      ArrayRef<int> HalfMask, int HalfIdx1,
15908 int HalfIdx2, bool UndefLower,
15909 SelectionDAG &DAG, bool UseConcat = false) {
15910 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15911 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15912
15913 MVT VT = V1.getSimpleValueType();
15914 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15915 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15916
15917 auto getHalfVector = [&](int HalfIdx) {
15918 if (HalfIdx < 0)
15919 return DAG.getUNDEF(HalfVT);
15920 SDValue V = (HalfIdx < 2 ? V1 : V2);
15921 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15922 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15923 DAG.getVectorIdxConstant(HalfIdx, DL));
15924 };
15925
15926 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15927 SDValue Half1 = getHalfVector(HalfIdx1);
15928 SDValue Half2 = getHalfVector(HalfIdx2);
15929 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15930 if (UseConcat) {
15931 SDValue Op0 = V;
15932 SDValue Op1 = DAG.getUNDEF(HalfVT);
15933 if (UndefLower)
15934 std::swap(Op0, Op1);
15935 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15936 }
15937
15938 unsigned Offset = UndefLower ? HalfNumElts : 0;
15939 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15941}
15942
15943/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15944/// This allows for fast cases such as subvector extraction/insertion
15945/// or shuffling smaller vector types which can lower more efficiently.
15946 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15947                                    SDValue V2, ArrayRef<int> Mask,
15948 const X86Subtarget &Subtarget,
15949 SelectionDAG &DAG) {
15950 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15951 "Expected 256-bit or 512-bit vector");
15952
15953 bool UndefLower = isUndefLowerHalf(Mask);
15954 if (!UndefLower && !isUndefUpperHalf(Mask))
15955 return SDValue();
15956
15957 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15958 "Completely undef shuffle mask should have been simplified already");
15959
15960 // Upper half is undef and lower half is whole upper subvector.
15961 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15962 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15963 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15964 if (!UndefLower &&
15965 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15966 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15967 DAG.getVectorIdxConstant(HalfNumElts, DL));
15968 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15969 DAG.getVectorIdxConstant(0, DL));
15970 }
15971
15972 // Lower half is undef and upper half is whole lower subvector.
15973 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15974 if (UndefLower &&
15975 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15976 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15977 DAG.getVectorIdxConstant(0, DL));
15978 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15979 DAG.getVectorIdxConstant(HalfNumElts, DL));
15980 }
15981
15982 int HalfIdx1, HalfIdx2;
15983 SmallVector<int, 8> HalfMask(HalfNumElts);
15984 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15985 return SDValue();
15986
15987 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15988
15989 // Only shuffle the halves of the inputs when useful.
15990 unsigned NumLowerHalves =
15991 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15992 unsigned NumUpperHalves =
15993 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15994 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15995
15996 // Determine the larger pattern of undef/halves, then decide if it's worth
15997 // splitting the shuffle based on subtarget capabilities and types.
15998 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15999 if (!UndefLower) {
16000 // XXXXuuuu: no insert is needed.
16001 // Always extract lowers when setting lower - these are all free subreg ops.
16002 if (NumUpperHalves == 0)
16003 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16004 UndefLower, DAG);
16005
16006 if (NumUpperHalves == 1) {
16007 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16008 if (Subtarget.hasAVX2()) {
16010       // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
16010 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16011 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
16012 (!isSingleSHUFPSMask(HalfMask) ||
16013 Subtarget.hasFastVariableCrossLaneShuffle()))
16014 return SDValue();
16015       // If this is a unary shuffle (assume that the 2nd operand is
16016 // canonicalized to undef), then we can use vpermpd. Otherwise, we
16017 // are better off extracting the upper half of 1 operand and using a
16018 // narrow shuffle.
16019 if (EltWidth == 64 && V2.isUndef())
16020 return SDValue();
16021       // If this is a unary vXi8 shuffle with in-place halves, then perform a
16022       // full-width pshufb, and then merge.
16023 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
16024 return SDValue();
16025 }
16026 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16027 if (Subtarget.hasAVX512() && VT.is512BitVector())
16028 return SDValue();
16029 // Extract + narrow shuffle is better than the wide alternative.
16030 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16031 UndefLower, DAG);
16032 }
16033
16034 // Don't extract both uppers, instead shuffle and then extract.
16035 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16036 return SDValue();
16037 }
16038
16039 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16040 if (NumUpperHalves == 0) {
16041 // AVX2 has efficient 64-bit element cross-lane shuffles.
16042 // TODO: Refine to account for unary shuffle, splat, and other masks?
16043 if (Subtarget.hasAVX2() && EltWidth == 64)
16044 return SDValue();
16045 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16046 if (Subtarget.hasAVX512() && VT.is512BitVector())
16047 return SDValue();
16048 // Narrow shuffle + insert is better than the wide alternative.
16049 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16050 UndefLower, DAG);
16051 }
16052
16053 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16054 return SDValue();
16055}
16056
16057/// Handle case where shuffle sources are coming from the same 128-bit lane and
16058/// every lane can be represented as the same repeating mask - allowing us to
16059/// shuffle the sources with the repeating shuffle and then permute the result
16060/// to the destination lanes.
16061 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16062     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16063 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16064 int NumElts = VT.getVectorNumElements();
16065 int NumLanes = VT.getSizeInBits() / 128;
16066 int NumLaneElts = NumElts / NumLanes;
16067
16068 // On AVX2 we may be able to just shuffle the lowest elements and then
16069 // broadcast the result.
16070 if (Subtarget.hasAVX2()) {
16071 for (unsigned BroadcastSize : {16, 32, 64}) {
16072 if (BroadcastSize <= VT.getScalarSizeInBits())
16073 continue;
16074 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16075
16076 // Attempt to match a repeating pattern every NumBroadcastElts,
16077       // accounting for UNDEFs but only referencing the lowest 128-bit
16078 // lane of the inputs.
16079 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16080 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16081 for (int j = 0; j != NumBroadcastElts; ++j) {
16082 int M = Mask[i + j];
16083 if (M < 0)
16084 continue;
16085 int &R = RepeatMask[j];
16086 if (0 != ((M % NumElts) / NumLaneElts))
16087 return false;
16088 if (0 <= R && R != M)
16089 return false;
16090 R = M;
16091 }
16092 return true;
16093 };
16094
16095 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16096 if (!FindRepeatingBroadcastMask(RepeatMask))
16097 continue;
16098
16099 // Shuffle the (lowest) repeated elements in place for broadcast.
16100 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16101
16102 // Shuffle the actual broadcast.
16103 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16104 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16105 for (int j = 0; j != NumBroadcastElts; ++j)
16106 BroadcastMask[i + j] = j;
16107
16108 // Avoid returning the same shuffle operation. For example,
16109 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
16110 if (BroadcastMask == Mask)
16111 return SDValue();
16112
16113 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16114 BroadcastMask);
16115 }
16116 }
16117
16118 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16119 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16120 return SDValue();
16121
16122 // Bail if we already have a repeated lane shuffle mask.
16123 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16124 return SDValue();
16125
16126   // Helper to look for a repeated mask in each split sublane, and to check
16127   // that those sublanes can then be permuted into place.
16128 auto ShuffleSubLanes = [&](int SubLaneScale) {
16129 int NumSubLanes = NumLanes * SubLaneScale;
16130 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16131
16132 // Check that all the sources are coming from the same lane and see if we
16133 // can form a repeating shuffle mask (local to each sub-lane). At the same
16134 // time, determine the source sub-lane for each destination sub-lane.
16135 int TopSrcSubLane = -1;
16136 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16137 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
16138 SubLaneScale,
16139 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
16140
16141 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16142 // Extract the sub-lane mask, check that it all comes from the same lane
16143 // and normalize the mask entries to come from the first lane.
16144 int SrcLane = -1;
16145 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16146 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16147 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16148 if (M < 0)
16149 continue;
16150 int Lane = (M % NumElts) / NumLaneElts;
16151 if ((0 <= SrcLane) && (SrcLane != Lane))
16152 return SDValue();
16153 SrcLane = Lane;
16154 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16155 SubLaneMask[Elt] = LocalM;
16156 }
16157
16158 // Whole sub-lane is UNDEF.
16159 if (SrcLane < 0)
16160 continue;
16161
16162 // Attempt to match against the candidate repeated sub-lane masks.
16163 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16164 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16165 for (int i = 0; i != NumSubLaneElts; ++i) {
16166 if (M1[i] < 0 || M2[i] < 0)
16167 continue;
16168 if (M1[i] != M2[i])
16169 return false;
16170 }
16171 return true;
16172 };
16173
16174 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16175 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16176 continue;
16177
16178 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16179 for (int i = 0; i != NumSubLaneElts; ++i) {
16180 int M = SubLaneMask[i];
16181 if (M < 0)
16182 continue;
16183 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16184 "Unexpected mask element");
16185 RepeatedSubLaneMask[i] = M;
16186 }
16187
16188 // Track the top most source sub-lane - by setting the remaining to
16189 // UNDEF we can greatly simplify shuffle matching.
16190 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16191 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16192 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16193 break;
16194 }
16195
16196 // Bail if we failed to find a matching repeated sub-lane mask.
16197 if (Dst2SrcSubLanes[DstSubLane] < 0)
16198 return SDValue();
16199 }
16200 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16201 "Unexpected source lane");
16202
16203 // Create a repeating shuffle mask for the entire vector.
16204 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16205 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16206 int Lane = SubLane / SubLaneScale;
16207 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16208 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16209 int M = RepeatedSubLaneMask[Elt];
16210 if (M < 0)
16211 continue;
16212 int Idx = (SubLane * NumSubLaneElts) + Elt;
16213 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16214 }
16215 }
16216
16217 // Shuffle each source sub-lane to its destination.
16218 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16219 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16220 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16221 if (SrcSubLane < 0)
16222 continue;
16223 for (int j = 0; j != NumSubLaneElts; ++j)
16224 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16225 }
16226
16227 // Avoid returning the same shuffle operation.
16228 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
16229 if (RepeatedMask == Mask || SubLaneMask == Mask)
16230 return SDValue();
16231
16232 SDValue RepeatedShuffle =
16233 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16234
16235 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16236 SubLaneMask);
16237 };
16238
16239 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16240 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
16241 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
16242 // Otherwise we can only permute whole 128-bit lanes.
16243 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16244 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
16245 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
16246 MinSubLaneScale = 2;
16247 MaxSubLaneScale =
16248 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
16249 }
16250 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16251 MinSubLaneScale = MaxSubLaneScale = 4;
16252
16253 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16254 if (SDValue Shuffle = ShuffleSubLanes(Scale))
16255 return Shuffle;
16256
16257 return SDValue();
16258}
16259
16260 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16261                                 bool &ForceV1Zero, bool &ForceV2Zero,
16262 unsigned &ShuffleImm, ArrayRef<int> Mask,
16263 const APInt &Zeroable) {
16264 int NumElts = VT.getVectorNumElements();
16265 assert(VT.getScalarSizeInBits() == 64 &&
16266 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16267 "Unexpected data type for VSHUFPD");
16268 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16269 "Illegal shuffle mask");
16270
16271 bool ZeroLane[2] = { true, true };
16272 for (int i = 0; i < NumElts; ++i)
16273 ZeroLane[i & 1] &= Zeroable[i];
16274
16275 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
16276   // Mask for V4F64: 0/1,  4/5,  2/3,  6/7..
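  // e.g. for V4F64, element i = 2 must take Mask[2] from {2, 3} (the high pair
  // of V1) for a direct SHUFPD, or from {6, 7} (the high pair of V2) for the
  // commuted form; SHUFPDMask[i] = Mask[i] % 2 then records which element of
  // that pair was chosen.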
16277 bool IsSHUFPD = true;
16278 bool IsCommutable = true;
16279 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
16280 for (int i = 0; i < NumElts; ++i) {
16281 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16282 continue;
16283 if (Mask[i] < 0)
16284 return false;
16285 int Val = (i & 6) + NumElts * (i & 1);
16286 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16287 if (Mask[i] < Val || Mask[i] > Val + 1)
16288 IsSHUFPD = false;
16289 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16290 IsCommutable = false;
16291 SHUFPDMask[i] = Mask[i] % 2;
16292 }
16293
16294 if (!IsSHUFPD && !IsCommutable)
16295 return false;
16296
16297 if (!IsSHUFPD && IsCommutable)
16298 std::swap(V1, V2);
16299
16300 ForceV1Zero = ZeroLane[0];
16301 ForceV2Zero = ZeroLane[1];
16302 ShuffleImm = getSHUFPDImm(SHUFPDMask);
16303 return true;
16304}
16305
16306 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16307                                   SDValue V2, ArrayRef<int> Mask,
16308 const APInt &Zeroable,
16309 const X86Subtarget &Subtarget,
16310 SelectionDAG &DAG) {
16311 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16312 "Unexpected data type for VSHUFPD");
16313
16314 unsigned Immediate = 0;
16315 bool ForceV1Zero = false, ForceV2Zero = false;
16316 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16317 Mask, Zeroable))
16318 return SDValue();
16319
16320 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16321 if (ForceV1Zero)
16322 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16323 if (ForceV2Zero)
16324 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16325
16326 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16327 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16328}
16329
16330// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
16331// by zeroable elements in the remaining 24 elements. Turn this into two
16332// vmovqb instructions shuffled together.
16333 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16334                                        SDValue V1, SDValue V2,
16335 ArrayRef<int> Mask,
16336 const APInt &Zeroable,
16337 SelectionDAG &DAG) {
16338 assert(VT == MVT::v32i8 && "Unexpected type!");
16339
16340 // The first 8 indices should be every 8th element.
16341 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16342 return SDValue();
16343
16344 // Remaining elements need to be zeroable.
16345 if (Zeroable.countl_one() < (Mask.size() - 8))
16346 return SDValue();
16347
16348 V1 = DAG.getBitcast(MVT::v4i64, V1);
16349 V2 = DAG.getBitcast(MVT::v4i64, V2);
16350
16351 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16352 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16353
16354 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16355 // the upper bits of the result using an unpckldq.
16356 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16357 { 0, 1, 2, 3, 16, 17, 18, 19,
16358 4, 5, 6, 7, 20, 21, 22, 23 });
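  // Bytes 0-3 of each truncated vector hold byte 0 of quadwords 0-3 of the
  // corresponding source, so the first eight lanes of Unpack are exactly the
  // requested <0, 8, 16, 24, 32, 40, 48, 56> elements and the remaining lanes
  // are zero.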
16359 // Insert the unpckldq into a zero vector to widen to v32i8.
16360 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16361 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16362 DAG.getVectorIdxConstant(0, DL));
16363}
16364
16365// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16366// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16367// =>
16368// ul = unpckl v1, v2
16369// uh = unpckh v1, v2
16370// a = vperm ul, uh
16371// b = vperm ul, uh
16372//
16373// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16374// and permute. We cannot directly match v3 because it is split into two
16375// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16376// pair of 256-bit shuffles and makes sure the masks are consecutive.
16377//
16378// Once unpck and permute nodes are created, the permute corresponding to this
16379// shuffle is returned, while the other permute replaces the other half of the
16380// shuffle in the selection dag.
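// For v8i32, for instance, the two smaller shuffles are expected to carry the
// masks <0, 8, 1, 9, 2, 10, 3, 11> and <4, 12, 5, 13, 6, 14, 7, 15>; they are
// rebuilt as vperm2x128(unpckl(V1, V2), unpckh(V1, V2)) with immediates 0x20
// and 0x31 respectively.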
16381 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
16382                                                 SDValue V1, SDValue V2,
16383 ArrayRef<int> Mask,
16384 SelectionDAG &DAG) {
16385 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16386 VT != MVT::v32i8)
16387 return SDValue();
16388 // <B0, B1, B0+1, B1+1, ..., >
16389 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16390 unsigned Begin1) {
16391 size_t Size = Mask.size();
16392 assert(Size % 2 == 0 && "Expected even mask size");
16393 for (unsigned I = 0; I < Size; I += 2) {
16394 if (Mask[I] != (int)(Begin0 + I / 2) ||
16395 Mask[I + 1] != (int)(Begin1 + I / 2))
16396 return false;
16397 }
16398 return true;
16399 };
16400   // Check which half of the interleave this shuffle node is.
16401 int NumElts = VT.getVectorNumElements();
16402 size_t FirstQtr = NumElts / 2;
16403 size_t ThirdQtr = NumElts + NumElts / 2;
16404 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16405 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16406 if (!IsFirstHalf && !IsSecondHalf)
16407 return SDValue();
16408
16409 // Find the intersection between shuffle users of V1 and V2.
16410 SmallVector<SDNode *, 2> Shuffles;
16411 for (SDNode *User : V1->users())
16412 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16413 User->getOperand(1) == V2)
16414 Shuffles.push_back(User);
16415 // Limit user size to two for now.
16416 if (Shuffles.size() != 2)
16417 return SDValue();
16418   // Find out which half of the 512-bit shuffle each smaller shuffle is.
16419 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16420 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16421 SDNode *FirstHalf;
16422 SDNode *SecondHalf;
16423 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16424 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16425 FirstHalf = Shuffles[0];
16426 SecondHalf = Shuffles[1];
16427 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16428 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16429 FirstHalf = Shuffles[1];
16430 SecondHalf = Shuffles[0];
16431 } else {
16432 return SDValue();
16433 }
16434 // Lower into unpck and perm. Return the perm of this shuffle and replace
16435 // the other.
16436 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16437 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16438 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16439 DAG.getTargetConstant(0x20, DL, MVT::i8));
16440 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16441 DAG.getTargetConstant(0x31, DL, MVT::i8));
16442 if (IsFirstHalf) {
16443 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16444 return Perm1;
16445 }
16446 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16447 return Perm2;
16448}
16449
16450/// Handle lowering of 4-lane 64-bit floating point shuffles.
16451///
16452/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16453/// isn't available.
16454 static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16455                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16456 const X86Subtarget &Subtarget,
16457 SelectionDAG &DAG) {
16458 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16459 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16460 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16461
16462 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16463 Subtarget, DAG))
16464 return V;
16465
16466 if (V2.isUndef()) {
16467 // Check for being able to broadcast a single element.
16468 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16469 Mask, Subtarget, DAG))
16470 return Broadcast;
16471
16472 // Use low duplicate instructions for masks that match their pattern.
16473 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16474 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16475
16476 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16477 // Non-half-crossing single input shuffles can be lowered with an
16478 // interleaved permutation.
16479 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16480 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
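      // e.g. Mask = <1, 0, 3, 2> gives VPERMILPMask = 0b0101, i.e. swap the two
      // doubles within each 128-bit lane.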
16481 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16482 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16483 }
16484
16485 // With AVX2 we have direct support for this permutation.
16486 if (Subtarget.hasAVX2())
16487 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16488 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16489
16490 // Try to create an in-lane repeating shuffle mask and then shuffle the
16491 // results into the target lanes.
16493 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16494 return V;
16495
16496 // Try to permute the lanes and then use a per-lane permute.
16497 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16498 Mask, DAG, Subtarget))
16499 return V;
16500
16501 // Otherwise, fall back.
16502 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16503 DAG, Subtarget);
16504 }
16505
16506 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16507 Zeroable, Subtarget, DAG))
16508 return Blend;
16509
16510 // Use dedicated unpack instructions for masks that match their pattern.
16511 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16512 return V;
16513
16514 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16515 Zeroable, Subtarget, DAG))
16516 return Op;
16517
16518 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16519 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16520 bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
16521 bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
16522
16523 // If we have lane crossing shuffles AND they don't all come from the lower
16524 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16525 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16526 // canonicalize to a blend of splat which isn't necessary for this combine.
16527 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16528 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16529 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16530 (V2.getOpcode() != ISD::BUILD_VECTOR) &&
16531 (!Subtarget.hasAVX2() ||
16532 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
16533 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16534
16535 // If we have one input in place, then we can permute the other input and
16536 // blend the result.
16537 if (V1IsInPlace || V2IsInPlace)
16538 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16539 Zeroable, Subtarget, DAG);
16540
16541 // Try to create an in-lane repeating shuffle mask and then shuffle the
16542 // results into the target lanes.
16544 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16545 return V;
16546
16547 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16548   // shuffle. However, if we have AVX2 and either input is already in place,
16549   // we will be able to shuffle the other input even across lanes in a single
16550   // instruction, so skip this pattern.
16551 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16553 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16554 return V;
16555
16556 // If we have VLX support, we can use VEXPAND.
16557 if (Subtarget.hasVLX())
16558 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16559 Zeroable, Subtarget, DAG))
16560 return V;
16561
16562   // If we have AVX2 then we always want to lower with a blend because at v4 we
16563 // can fully permute the elements.
16564 if (Subtarget.hasAVX2())
16565 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16566 Zeroable, Subtarget, DAG);
16567
16568 // Otherwise fall back on generic lowering.
16569 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16570 Subtarget, DAG);
16571}
16572
16573/// Handle lowering of 4-lane 64-bit integer shuffles.
16574///
16575/// This routine is only called when we have AVX2 and thus a reasonable
16576 /// instruction set for v4i64 shuffling.
16577 static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16578                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16579 const X86Subtarget &Subtarget,
16580 SelectionDAG &DAG) {
16581 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16582 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16583 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16584 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16585
16586 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16587 Subtarget, DAG))
16588 return V;
16589
16590 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16591 Zeroable, Subtarget, DAG))
16592 return Blend;
16593
16594 // Check for being able to broadcast a single element.
16595 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16596 Subtarget, DAG))
16597 return Broadcast;
16598
16599 // Try to use shift instructions if fast.
16600 if (Subtarget.preferLowerShuffleAsShift())
16601 if (SDValue Shift =
16602 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16603 Subtarget, DAG, /*BitwiseOnly*/ true))
16604 return Shift;
16605
16606 if (V2.isUndef()) {
16607 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16608 // can use lower latency instructions that will operate on both lanes.
16609 SmallVector<int, 2> RepeatedMask;
16610 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16611 SmallVector<int, 4> PSHUFDMask;
16612 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
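      // e.g. a repeated 64-bit mask of <1, 0> becomes the 32-bit PSHUFD mask
      // <2, 3, 0, 1>, swapping the two quadwords within each 128-bit lane.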
16613 return DAG.getBitcast(
16614 MVT::v4i64,
16615 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16616 DAG.getBitcast(MVT::v8i32, V1),
16617 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16618 }
16619
16620 // AVX2 provides a direct instruction for permuting a single input across
16621 // lanes.
16622 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16623 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16624 }
16625
16626 // Try to use shift instructions.
16627 if (SDValue Shift =
16628 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16629 DAG, /*BitwiseOnly*/ false))
16630 return Shift;
16631
16632 // If we have VLX support, we can use VALIGN or VEXPAND.
16633 if (Subtarget.hasVLX()) {
16634 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16635 Zeroable, Subtarget, DAG))
16636 return Rotate;
16637
16638 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16639 Zeroable, Subtarget, DAG))
16640 return V;
16641 }
16642
16643 // Try to use PALIGNR.
16644 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16645 Subtarget, DAG))
16646 return Rotate;
16647
16648 // Use dedicated unpack instructions for masks that match their pattern.
16649 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16650 return V;
16651
16652 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16653 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16654
16655 // If we have one input in place, then we can permute the other input and
16656 // blend the result.
16657 if (V1IsInPlace || V2IsInPlace)
16658 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16659 Zeroable, Subtarget, DAG);
16660
16661 // Try to create an in-lane repeating shuffle mask and then shuffle the
16662 // results into the target lanes.
16664 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16665 return V;
16666
16667 // Try to lower to PERMQ(BLENDD(V1,V2)).
16668 if (SDValue V =
16669 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16670 return V;
16671
16672 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16673   // shuffle. However, if we have AVX2 and either input is already in place,
16674   // we will be able to shuffle the other input even across lanes in a single
16675   // instruction, so skip this pattern.
16676 if (!V1IsInPlace && !V2IsInPlace)
16678 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16679 return Result;
16680
16681 // Otherwise fall back on generic blend lowering.
16682 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16683 Zeroable, Subtarget, DAG);
16684}
16685
16686/// Handle lowering of 8-lane 32-bit floating point shuffles.
16687///
16688/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16689/// isn't available.
16690 static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16691                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16692 const X86Subtarget &Subtarget,
16693 SelectionDAG &DAG) {
16694 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16695 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16696 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16697
16698 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16699 Zeroable, Subtarget, DAG))
16700 return Blend;
16701
16702 // Check for being able to broadcast a single element.
16703 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16704 Subtarget, DAG))
16705 return Broadcast;
16706
16707 if (!Subtarget.hasAVX2()) {
16708 SmallVector<int> InLaneMask;
16709 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16710
16711 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16712 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16713 /*SimpleOnly*/ true))
16714 return R;
16715 }
16716 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16717 Zeroable, Subtarget, DAG))
16718 return DAG.getBitcast(MVT::v8f32, ZExt);
16719
16720 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16721 // options to efficiently lower the shuffle.
16722 SmallVector<int, 4> RepeatedMask;
16723 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16724 assert(RepeatedMask.size() == 4 &&
16725 "Repeated masks must be half the mask width!");
16726
16727 // Use even/odd duplicate instructions for masks that match their pattern.
16728 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16729 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16730 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16731 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16732
16733 if (V2.isUndef())
16734 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16735 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16736
16737 // Use dedicated unpack instructions for masks that match their pattern.
16738 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16739 return V;
16740
16741 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16742 // have already handled any direct blends.
16743 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16744 }
16745
16746 // Try to create an in-lane repeating shuffle mask and then shuffle the
16747 // results into the target lanes.
16749 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16750 return V;
16751
16752 // If we have a single input shuffle with different shuffle patterns in the
16753 // two 128-bit lanes use the variable mask to VPERMILPS.
16754 if (V2.isUndef()) {
16755 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16756 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16757 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16758 }
16759 if (Subtarget.hasAVX2()) {
16760 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16761 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16762 }
16763 // Otherwise, fall back.
16764 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16765 DAG, Subtarget);
16766 }
16767
16768 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16769 // shuffle.
16771 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16772 return Result;
16773
16774 // If we have VLX support, we can use VEXPAND.
16775 if (Subtarget.hasVLX())
16776 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
16777 Zeroable, Subtarget, DAG))
16778 return V;
16779
16780 // Try to match an interleave of two v8f32s and lower them as unpck and
16781 // permutes using ymms. This needs to go before we try to split the vectors.
16782 // Don't attempt on AVX1 if we're likely to split vectors anyway.
16783 if ((Subtarget.hasAVX2() ||
16786 !Subtarget.hasAVX512())
16787 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16788 Mask, DAG))
16789 return V;
16790
16791   // For non-AVX512, if the Mask uses 16-bit elements within each lane, try to
16792   // split, since after splitting we get more efficient code using vpunpcklwd
16793   // and vpunpckhwd instructions than with vblend.
16794 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16795 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16796 Subtarget, DAG);
16797
16798 // If we have AVX2 then we always want to lower with a blend because at v8 we
16799 // can fully permute the elements.
16800 if (Subtarget.hasAVX2())
16801 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16802 Zeroable, Subtarget, DAG);
16803
16804 // Otherwise fall back on generic lowering.
16805 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16806 Subtarget, DAG);
16807}
16808
16809/// Handle lowering of 8-lane 32-bit integer shuffles.
16810///
16811/// This routine is only called when we have AVX2 and thus a reasonable
16812 /// instruction set for v8i32 shuffling.
16813 static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16814                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16815 const X86Subtarget &Subtarget,
16816 SelectionDAG &DAG) {
16817 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16818 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16819 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16820 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16821
16822 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16823
16824 // Whenever we can lower this as a zext, that instruction is strictly faster
16825 // than any alternative. It also allows us to fold memory operands into the
16826 // shuffle in many cases.
16827 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16828 Zeroable, Subtarget, DAG))
16829 return ZExt;
16830
16831 // Try to match an interleave of two v8i32s and lower them as unpck and
16832 // permutes using ymms. This needs to go before we try to split the vectors.
16833 if (!Subtarget.hasAVX512())
16834 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16835 Mask, DAG))
16836 return V;
16837
16838   // For non-AVX512, if the Mask uses 16-bit elements within each lane, try to
16839   // split, since after splitting we get more efficient code by using
16840   // vpunpcklwd and vpunpckhwd instructions instead of vblend.
16841 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16842 !Subtarget.hasAVX512())
16843 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16844 Subtarget, DAG);
16845
16846 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16847 Zeroable, Subtarget, DAG))
16848 return Blend;
16849
16850 // Check for being able to broadcast a single element.
16851 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16852 Subtarget, DAG))
16853 return Broadcast;
16854
16855 // Try to use shift instructions if fast.
16856 if (Subtarget.preferLowerShuffleAsShift()) {
16857 if (SDValue Shift =
16858 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16859 Subtarget, DAG, /*BitwiseOnly*/ true))
16860 return Shift;
16861 if (NumV2Elements == 0)
16862 if (SDValue Rotate =
16863 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16864 return Rotate;
16865 }
16866
16867 // If the shuffle mask is repeated in each 128-bit lane we can use more
16868 // efficient instructions that mirror the shuffles across the two 128-bit
16869 // lanes.
16870 SmallVector<int, 4> RepeatedMask;
16871 bool Is128BitLaneRepeatedShuffle =
16872 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16873 if (Is128BitLaneRepeatedShuffle) {
16874 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16875 if (V2.isUndef())
16876 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16877 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16878
16879 // Use dedicated unpack instructions for masks that match their pattern.
16880 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
16881 return V;
16882 }
16883
16884 // Try to use shift instructions.
16885 if (SDValue Shift =
16886 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16887 DAG, /*BitwiseOnly*/ false))
16888 return Shift;
16889
16890 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16891 if (SDValue Rotate =
16892 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16893 return Rotate;
16894
16895 // If we have VLX support, we can use VALIGN or EXPAND.
16896 if (Subtarget.hasVLX()) {
16897 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16898 Zeroable, Subtarget, DAG))
16899 return Rotate;
16900
16901 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
16902 Zeroable, Subtarget, DAG))
16903 return V;
16904 }
16905
16906 // Try to use byte rotation instructions.
16907 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16908 Subtarget, DAG))
16909 return Rotate;
16910
16911 // Try to create an in-lane repeating shuffle mask and then shuffle the
16912 // results into the target lanes.
16914 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16915 return V;
16916
16917 if (V2.isUndef()) {
16918 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16919 // because that should be faster than the variable permute alternatives.
16920 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
16921 return V;
16922
16923 // If the shuffle patterns aren't repeated but it's a single input, directly
16924 // generate a cross-lane VPERMD instruction.
16925 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16926 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16927 }
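// Illustrative example (editorial sketch): a single-input mask that moves data
// across 128-bit lanes, e.g. <0,2,4,6,1,3,5,7>, can end up in the VPERMD
// fallback above if none of the cheaper single-input strategies matched; the
// mask is materialized as a constant v8i32 vector and used as the index operand.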
16928
16929 // Assume that a single SHUFPS is faster than an alternative sequence of
16930 // multiple instructions (even if the CPU has a domain penalty).
16931 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16932 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16933 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16934 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16935 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16936 CastV1, CastV2, DAG);
16937 return DAG.getBitcast(MVT::v8i32, ShufPS);
16938 }
16939
16940 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16941 // shuffle.
16942 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16943 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16944 return Result;
16945
16946 // Otherwise fall back on generic blend lowering.
16947 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16948 Zeroable, Subtarget, DAG);
16949}
16950
16951/// Handle lowering of 16-lane 16-bit integer shuffles.
16952///
16953/// This routine is only called when we have AVX2 and thus a reasonable
16954 /// instruction set for v16i16 shuffling.
16955 static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16956 const APInt &Zeroable, SDValue V1, SDValue V2,
16957 const X86Subtarget &Subtarget,
16958 SelectionDAG &DAG) {
16959 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16960 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16961 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16962 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16963
16964 // Whenever we can lower this as a zext, that instruction is strictly faster
16965 // than any alternative. It also allows us to fold memory operands into the
16966 // shuffle in many cases.
16967 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16968 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16969 return ZExt;
16970
16971 // Check for being able to broadcast a single element.
16972 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16973 Subtarget, DAG))
16974 return Broadcast;
16975
16976 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16977 Zeroable, Subtarget, DAG))
16978 return Blend;
16979
16980 // Use dedicated unpack instructions for masks that match their pattern.
16981 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
16982 return V;
16983
16984 // Use dedicated pack instructions for masks that match their pattern.
16985 if (SDValue V =
16986 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16987 return V;
16988
16989 // Try to lower using a truncation.
16990 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16991 Subtarget, DAG))
16992 return V;
16993
16994 // Try to use shift instructions.
16995 if (SDValue Shift =
16996 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16997 Subtarget, DAG, /*BitwiseOnly*/ false))
16998 return Shift;
16999
17000 // Try to use byte rotation instructions.
17001 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17002 Subtarget, DAG))
17003 return Rotate;
17004
17005 // Try to create an in-lane repeating shuffle mask and then shuffle the
17006 // results into the target lanes.
17007 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17008 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17009 return V;
17010
17011 if (V2.isUndef()) {
17012 // Try to use bit rotation instructions.
17013 if (SDValue Rotate =
17014 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17015 return Rotate;
17016
17017 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17018 // because that should be faster than the variable permute alternatives.
17019 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
17020 return V;
17021
17022 // There are no generalized cross-lane shuffle operations available on i16
17023 // element types.
17024 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17025 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17026 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17027 return V;
17028
17029 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17030 DAG, Subtarget);
17031 }
17032
17033 SmallVector<int, 8> RepeatedMask;
17034 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17035 // As this is a single-input shuffle, the repeated mask should be
17036 // a strictly valid v8i16 mask that we can pass through to the v8i16
17037 // lowering to handle even the v16 case.
17038 return lowerV8I16GeneralSingleInputShuffle(
17039 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17040 }
17041 }
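// Illustrative example (editorial sketch): a single-input v16i16 mask such as
// <2,1,0,3,6,5,4,7,10,9,8,11,14,13,12,15> repeats the v8i16 pattern
// <2,1,0,3,6,5,4,7> in both 128-bit lanes, so the block above hands it to the
// general v8i16 single-input lowering, which then works per 128-bit lane.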
17042
17043 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17044 Zeroable, Subtarget, DAG))
17045 return PSHUFB;
17046
17047 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17048 if (Subtarget.hasBWI())
17049 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17050
17051 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17052 // shuffle.
17053 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17054 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17055 return Result;
17056
17057 // Try to permute the lanes and then use a per-lane permute.
17058 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17059 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17060 return V;
17061
17062 // Try to match an interleave of two v16i16s and lower them as unpck and
17063 // permutes using ymms.
17064 if (!Subtarget.hasAVX512())
17065 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
17066 Mask, DAG))
17067 return V;
17068
17069 // Otherwise fall back on generic lowering.
17070 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17071 Subtarget, DAG);
17072}
17073
17074/// Handle lowering of 32-lane 8-bit integer shuffles.
17075///
17076/// This routine is only called when we have AVX2 and thus a reasonable
17077 /// instruction set for v32i8 shuffling.
17078 static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17079 const APInt &Zeroable, SDValue V1, SDValue V2,
17080 const X86Subtarget &Subtarget,
17081 SelectionDAG &DAG) {
17082 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17083 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17084 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17085 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17086
17087 // Whenever we can lower this as a zext, that instruction is strictly faster
17088 // than any alternative. It also allows us to fold memory operands into the
17089 // shuffle in many cases.
17090 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17091 Zeroable, Subtarget, DAG))
17092 return ZExt;
17093
17094 // Check for being able to broadcast a single element.
17095 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17096 Subtarget, DAG))
17097 return Broadcast;
17098
17099 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17100 Zeroable, Subtarget, DAG))
17101 return Blend;
17102
17103 // Use dedicated unpack instructions for masks that match their pattern.
17104 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
17105 return V;
17106
17107 // Use dedicated pack instructions for masks that match their pattern.
17108 if (SDValue V =
17109 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17110 return V;
17111
17112 // Try to lower using a truncation.
17113 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17114 Subtarget, DAG))
17115 return V;
17116
17117 // Try to use shift instructions.
17118 if (SDValue Shift =
17119 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
17120 DAG, /*BitwiseOnly*/ false))
17121 return Shift;
17122
17123 // Try to use byte rotation instructions.
17124 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17125 Subtarget, DAG))
17126 return Rotate;
17127
17128 // Try to use bit rotation instructions.
17129 if (V2.isUndef())
17130 if (SDValue Rotate =
17131 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17132 return Rotate;
17133
17134 // Try to create an in-lane repeating shuffle mask and then shuffle the
17135 // results into the target lanes.
17136 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17137 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17138 return V;
17139
17140 // There are no generalized cross-lane shuffle operations available on i8
17141 // element types.
17142 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17143 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17144 // because that should be faster than the variable permute alternatives.
17145 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
17146 return V;
17147
17148 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17149 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17150 return V;
17151
17152 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17153 DAG, Subtarget);
17154 }
17155
17156 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17157 Zeroable, Subtarget, DAG))
17158 return PSHUFB;
17159
17160 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17161 if (Subtarget.hasVBMI())
17162 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17163
17164 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17165 // shuffle.
17166 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17167 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17168 return Result;
17169
17170 // Try to permute the lanes and then use a per-lane permute.
17171 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17172 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17173 return V;
17174
17175 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17176 // by zeroable elements in the remaining 24 elements. Turn this into two
17177 // vmovqb instructions shuffled together.
17178 if (Subtarget.hasVLX())
17179 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17180 Mask, Zeroable, DAG))
17181 return V;
17182
17183 // Try to match an interleave of two v32i8s and lower them as unpck and
17184 // permutes using ymms.
17185 if (!Subtarget.hasAVX512())
17186 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
17187 Mask, DAG))
17188 return V;
17189
17190 // Otherwise fall back on generic lowering.
17191 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17192 Subtarget, DAG);
17193}
17194
17195/// High-level routine to lower various 256-bit x86 vector shuffles.
17196///
17197/// This routine either breaks down the specific type of a 256-bit x86 vector
17198/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17199/// together based on the available instructions.
17200 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17201 SDValue V1, SDValue V2, const APInt &Zeroable,
17202 const X86Subtarget &Subtarget,
17203 SelectionDAG &DAG) {
17204 // If we have a single input to the zero element, insert that into V1 if we
17205 // can do so cheaply.
17206 int NumElts = VT.getVectorNumElements();
17207 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17208
17209 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17210 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17211 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17212 return Insertion;
17213
17214 // Handle special cases where the lower or upper half is UNDEF.
17215 if (SDValue V =
17216 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17217 return V;
17218
17219 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17220 // can check for those subtargets here and avoid much of the subtarget
17221 // querying in the per-vector-type lowering routines. With AVX1 we have
17222 // essentially *zero* ability to manipulate a 256-bit vector with integer
17223 // types. Since we'll use floating point types there eventually, just
17224 // immediately cast everything to a float and operate entirely in that domain.
17225 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17226 int ElementBits = VT.getScalarSizeInBits();
17227 if (ElementBits < 32) {
17228 // No floating point type is available; if we can't use the bit operations
17229 // for masking/blending, then decompose into 128-bit vectors.
17230 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17231 Subtarget, DAG))
17232 return V;
17233 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17234 return V;
17235 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17236 }
17237
17238 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17239 VT.getVectorNumElements());
17240 V1 = DAG.getBitcast(FpVT, V1);
17241 V2 = DAG.getBitcast(FpVT, V2);
17242 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17243 }
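// Illustrative example (editorial sketch): on an AVX1-only target a v8i32
// shuffle such as <0,8,1,9,4,12,5,13> is bitcast to v8f32, lowered by the
// floating point path (here it typically becomes a single VUNPCKLPS), and
// bitcast back, since 256-bit integer shuffles are unavailable without AVX2.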
17244
17245 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17246 V1 = DAG.getBitcast(MVT::v16i16, V1);
17247 V2 = DAG.getBitcast(MVT::v16i16, V2);
17248 return DAG.getBitcast(VT,
17249 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17250 }
17251
17252 switch (VT.SimpleTy) {
17253 case MVT::v4f64:
17254 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17255 case MVT::v4i64:
17256 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17257 case MVT::v8f32:
17258 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17259 case MVT::v8i32:
17260 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17261 case MVT::v16i16:
17262 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17263 case MVT::v32i8:
17264 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17265
17266 default:
17267 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17268 }
17269}
17270
17271/// Try to lower a vector shuffle as a 128-bit shuffles.
17272 static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17273 const APInt &Zeroable, SDValue V1, SDValue V2,
17274 const X86Subtarget &Subtarget,
17275 SelectionDAG &DAG) {
17276 assert(VT.getScalarSizeInBits() == 64 &&
17277 "Unexpected element type size for 128bit shuffle.");
17278
17279 // Handling a 256-bit vector requires VLX, and the function
17280 // lowerV2X128VectorShuffle() is most probably the better solution for it.
17281 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17282
17283 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17284 SmallVector<int, 4> Widened128Mask;
17285 if (!canWidenShuffleElements(Mask, Widened128Mask))
17286 return SDValue();
17287 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17288
17289 // Try to use an insert into a zero vector.
17290 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17291 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17292 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17293 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17294 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17295 DAG.getVectorIdxConstant(0, DL));
17296 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17297 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17298 DAG.getVectorIdxConstant(0, DL));
17299 }
17300
17301 // Check for patterns which can be matched with a single insert of a 256-bit
17302 // subvector.
17303 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17304 if (OnlyUsesV1 ||
17305 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17306 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17307 SDValue SubVec =
17308 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17309 DAG.getVectorIdxConstant(0, DL));
17310 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17311 DAG.getVectorIdxConstant(4, DL));
17312 }
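// Illustrative example (editorial sketch): a v8i64 mask of {0,1,2,3,8,9,10,11}
// takes the low 256 bits of V1 and the low 256 bits of V2, so the code above
// extracts V2's low half and inserts it at element 4 of V1 (typically a single
// VINSERTI64x4 / VINSERTF64x4).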
17313
17314 // See if this is an insertion of the lower 128-bits of V2 into V1.
17315 bool IsInsert = true;
17316 int V2Index = -1;
17317 for (int i = 0; i < 4; ++i) {
17318 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17319 if (Widened128Mask[i] < 0)
17320 continue;
17321
17322 // Make sure all V1 subvectors are in place.
17323 if (Widened128Mask[i] < 4) {
17324 if (Widened128Mask[i] != i) {
17325 IsInsert = false;
17326 break;
17327 }
17328 } else {
17329 // Make sure we only have a single V2 index and it's the lowest 128-bits.
17330 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17331 IsInsert = false;
17332 break;
17333 }
17334 V2Index = i;
17335 }
17336 }
17337 if (IsInsert && V2Index >= 0) {
17338 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17339 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17340 DAG.getVectorIdxConstant(0, DL));
17341 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17342 }
17343
17344 // See if we can widen to a 256-bit lane shuffle; we're going to lose the
17345 // 128-bit lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by
17346 // widening where possible we at least ensure the lanes stay sequential to
17347 // help later combines.
17348 SmallVector<int, 2> Widened256Mask;
17349 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17350 Widened128Mask.clear();
17351 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17352 }
17353
17354 // Try to lower to vshuf64x2/vshuf32x4.
17355 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17356 int PermMask[4] = {-1, -1, -1, -1};
17357 // Ensure elements came from the same Op.
17358 for (int i = 0; i < 4; ++i) {
17359 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17360 if (Widened128Mask[i] < 0)
17361 continue;
17362
17363 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17364 unsigned OpIndex = i / 2;
17365 if (Ops[OpIndex].isUndef())
17366 Ops[OpIndex] = Op;
17367 else if (Ops[OpIndex] != Op)
17368 return SDValue();
17369
17370 PermMask[i] = Widened128Mask[i] % 4;
17371 }
17372
17373 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17374 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17375}
17376
17377/// Handle lowering of 8-lane 64-bit floating point shuffles.
17378 static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17379 const APInt &Zeroable, SDValue V1, SDValue V2,
17380 const X86Subtarget &Subtarget,
17381 SelectionDAG &DAG) {
17382 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17383 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17384 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17385
17386 if (V2.isUndef()) {
17387 // Use low duplicate instructions for masks that match their pattern.
17388 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17389 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17390
17391 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17392 // Non-half-crossing single input shuffles can be lowered with an
17393 // interleaved permutation.
17394 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17395 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17396 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17397 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17398 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17399 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17400 }
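// Illustrative example (editorial sketch): each immediate bit selects which of
// the two doubles inside a 128-bit lane lands in that result position, so the
// in-lane swap mask <1,0,3,2,5,4,7,6> sets bits 0, 2, 4 and 6 and produces the
// VPERMILPD immediate 0b01010101 (0x55).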
17401
17402 SmallVector<int, 4> RepeatedMask;
17403 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17404 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17405 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17406 }
17407
17408 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17409 V2, Subtarget, DAG))
17410 return Shuf128;
17411
17412 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17413 return Unpck;
17414
17415 // Check if the blend happens to exactly fit that of SHUFPD.
17416 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17417 Zeroable, Subtarget, DAG))
17418 return Op;
17419
17420 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17421 Subtarget, DAG))
17422 return V;
17423
17424 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17425 Zeroable, Subtarget, DAG))
17426 return Blend;
17427
17428 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17429}
17430
17431/// Handle lowering of 16-lane 32-bit floating point shuffles.
17432 static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17433 const APInt &Zeroable, SDValue V1, SDValue V2,
17434 const X86Subtarget &Subtarget,
17435 SelectionDAG &DAG) {
17436 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17437 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17438 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17439
17440 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17441 // options to efficiently lower the shuffle.
17442 SmallVector<int, 4> RepeatedMask;
17443 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17444 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17445
17446 // Use even/odd duplicate instructions for masks that match their pattern.
17447 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17448 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17449 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17450 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17451
17452 if (V2.isUndef())
17453 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17454 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17455
17456 // Use dedicated unpack instructions for masks that match their pattern.
17457 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17458 return V;
17459
17460 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17461 Zeroable, Subtarget, DAG))
17462 return Blend;
17463
17464 // Otherwise, fall back to a SHUFPS sequence.
17465 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17466 }
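// Illustrative example (editorial sketch): a v16f32 mask whose per-lane
// pattern is {0,0,2,2} (i.e. <0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14>) is
// matched above to a single VMOVSLDUP; the odd-element pattern {1,1,3,3} maps
// to VMOVSHDUP in the same way.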
17467
17468 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17469 Zeroable, Subtarget, DAG))
17470 return Blend;
17471
17472 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17473 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17474 return DAG.getBitcast(MVT::v16f32, ZExt);
17475
17476 // Try to create an in-lane repeating shuffle mask and then shuffle the
17477 // results into the target lanes.
17478 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17479 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17480 return V;
17481
17482 // If we have a single input shuffle with different shuffle patterns in the
17483 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17484 if (V2.isUndef() &&
17485 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17486 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17487 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17488 }
17489
17490 // If we have AVX512F support, we can use VEXPAND.
17491 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17492 Zeroable, Subtarget, DAG))
17493 return V;
17494
17495 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17496}
17497
17498/// Handle lowering of 8-lane 64-bit integer shuffles.
17499 static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17500 const APInt &Zeroable, SDValue V1, SDValue V2,
17501 const X86Subtarget &Subtarget,
17502 SelectionDAG &DAG) {
17503 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17504 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17505 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17506
17507 // Try to use shift instructions if fast.
17508 if (Subtarget.preferLowerShuffleAsShift())
17509 if (SDValue Shift =
17510 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17511 Subtarget, DAG, /*BitwiseOnly*/ true))
17512 return Shift;
17513
17514 if (V2.isUndef()) {
17515 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17516 // can use lower latency instructions that will operate on all four
17517 // 128-bit lanes.
17518 SmallVector<int, 2> Repeated128Mask;
17519 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17520 SmallVector<int, 4> PSHUFDMask;
17521 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17522 return DAG.getBitcast(
17523 MVT::v8i64,
17524 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17525 DAG.getBitcast(MVT::v16i32, V1),
17526 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17527 }
17528
17529 SmallVector<int, 4> Repeated256Mask;
17530 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17531 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17532 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17533 }
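// Illustrative example (editorial sketch): a single-input v8i64 mask such as
// <1,0,3,2,5,4,7,6> has the per-128-bit-lane pattern <1,0>, which narrows to
// the v16i32 PSHUFD mask {2,3,0,1}, i.e. immediate 0b01001110 (0x4E), applied
// after bitcasting V1 to v16i32.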
17534
17535 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17536 V2, Subtarget, DAG))
17537 return Shuf128;
17538
17539 // Try to use shift instructions.
17540 if (SDValue Shift =
17541 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17542 DAG, /*BitwiseOnly*/ false))
17543 return Shift;
17544
17545 // Try to use VALIGN.
17546 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17547 Zeroable, Subtarget, DAG))
17548 return Rotate;
17549
17550 // Try to use PALIGNR.
17551 if (Subtarget.hasBWI())
17552 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17553 Subtarget, DAG))
17554 return Rotate;
17555
17556 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17557 return Unpck;
17558
17559 // If we have AVX512F support, we can use VEXPAND.
17560 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17561 Subtarget, DAG))
17562 return V;
17563
17564 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17565 Zeroable, Subtarget, DAG))
17566 return Blend;
17567
17568 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17569}
17570
17571/// Handle lowering of 16-lane 32-bit integer shuffles.
17572 static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17573 const APInt &Zeroable, SDValue V1, SDValue V2,
17574 const X86Subtarget &Subtarget,
17575 SelectionDAG &DAG) {
17576 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17577 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17578 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17579
17580 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17581
17582 // Whenever we can lower this as a zext, that instruction is strictly faster
17583 // than any alternative. It also allows us to fold memory operands into the
17584 // shuffle in many cases.
17585 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17586 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17587 return ZExt;
17588
17589 // Try to use shift instructions if fast.
17590 if (Subtarget.preferLowerShuffleAsShift()) {
17591 if (SDValue Shift =
17592 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17593 Subtarget, DAG, /*BitwiseOnly*/ true))
17594 return Shift;
17595 if (NumV2Elements == 0)
17596 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17597 Subtarget, DAG))
17598 return Rotate;
17599 }
17600
17601 // If the shuffle mask is repeated in each 128-bit lane we can use more
17602 // efficient instructions that mirror the shuffles across the four 128-bit
17603 // lanes.
17604 SmallVector<int, 4> RepeatedMask;
17605 bool Is128BitLaneRepeatedShuffle =
17606 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17607 if (Is128BitLaneRepeatedShuffle) {
17608 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17609 if (V2.isUndef())
17610 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17611 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17612
17613 // Use dedicated unpack instructions for masks that match their pattern.
17614 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17615 return V;
17616 }
17617
17618 // Try to use shift instructions.
17619 if (SDValue Shift =
17620 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17621 Subtarget, DAG, /*BitwiseOnly*/ false))
17622 return Shift;
17623
17624 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17625 if (SDValue Rotate =
17626 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17627 return Rotate;
17628
17629 // Try to use VALIGN.
17630 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17631 Zeroable, Subtarget, DAG))
17632 return Rotate;
17633
17634 // Try to use byte rotation instructions.
17635 if (Subtarget.hasBWI())
17636 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17637 Subtarget, DAG))
17638 return Rotate;
17639
17640 // Assume that a single SHUFPS is faster than using a permv shuffle.
17641 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17642 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17643 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17644 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17645 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17646 CastV1, CastV2, DAG);
17647 return DAG.getBitcast(MVT::v16i32, ShufPS);
17648 }
17649
17650 // Try to create an in-lane repeating shuffle mask and then shuffle the
17651 // results into the target lanes.
17652 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17653 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17654 return V;
17655
17656 // If we have AVX512F support, we can use VEXPAND.
17657 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17658 Zeroable, Subtarget, DAG))
17659 return V;
17660
17661 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17662 Zeroable, Subtarget, DAG))
17663 return Blend;
17664
17665 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17666}
17667
17668/// Handle lowering of 32-lane 16-bit integer shuffles.
17669 static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17670 const APInt &Zeroable, SDValue V1, SDValue V2,
17671 const X86Subtarget &Subtarget,
17672 SelectionDAG &DAG) {
17673 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17674 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17675 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17676 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17677
17678 // Whenever we can lower this as a zext, that instruction is strictly faster
17679 // than any alternative. It also allows us to fold memory operands into the
17680 // shuffle in many cases.
17681 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17682 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17683 return ZExt;
17684
17685 // Use dedicated unpack instructions for masks that match their pattern.
17686 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17687 return V;
17688
17689 // Use dedicated pack instructions for masks that match their pattern.
17690 if (SDValue V =
17691 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17692 return V;
17693
17694 // Try to use shift instructions.
17695 if (SDValue Shift =
17696 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17697 Subtarget, DAG, /*BitwiseOnly*/ false))
17698 return Shift;
17699
17700 // Try to use byte rotation instructions.
17701 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17702 Subtarget, DAG))
17703 return Rotate;
17704
17705 if (V2.isUndef()) {
17706 // Try to use bit rotation instructions.
17707 if (SDValue Rotate =
17708 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17709 return Rotate;
17710
17711 SmallVector<int, 8> RepeatedMask;
17712 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17713 // As this is a single-input shuffle, the repeated mask should be
17714 // a strictly valid v8i16 mask that we can pass through to the v8i16
17715 // lowering to handle even the v32 case.
17716 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17717 RepeatedMask, Subtarget, DAG);
17718 }
17719 }
17720
17721 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17722 Zeroable, Subtarget, DAG))
17723 return Blend;
17724
17725 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17726 Zeroable, Subtarget, DAG))
17727 return PSHUFB;
17728
17729 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17730 // shuffle.
17731 if (!V2.isUndef())
17732 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17733 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17734 return Result;
17735
17736 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17737}
17738
17739/// Handle lowering of 64-lane 8-bit integer shuffles.
17740 static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17741 const APInt &Zeroable, SDValue V1, SDValue V2,
17742 const X86Subtarget &Subtarget,
17743 SelectionDAG &DAG) {
17744 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17745 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17746 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17747 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17748
17749 // Whenever we can lower this as a zext, that instruction is strictly faster
17750 // than any alternative. It also allows us to fold memory operands into the
17751 // shuffle in many cases.
17752 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17753 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17754 return ZExt;
17755
17756 // Use dedicated unpack instructions for masks that match their pattern.
17757 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17758 return V;
17759
17760 // Use dedicated pack instructions for masks that match their pattern.
17761 if (SDValue V =
17762 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17763 return V;
17764
17765 // Try to use shift instructions.
17766 if (SDValue Shift =
17767 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17768 DAG, /*BitwiseOnly*/ false))
17769 return Shift;
17770
17771 // Try to use byte rotation instructions.
17772 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17773 Subtarget, DAG))
17774 return Rotate;
17775
17776 // Try to use bit rotation instructions.
17777 if (V2.isUndef())
17778 if (SDValue Rotate =
17779 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17780 return Rotate;
17781
17782 // Lower as AND if possible.
17783 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17784 Zeroable, Subtarget, DAG))
17785 return Masked;
17786
17787 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17788 Zeroable, Subtarget, DAG))
17789 return PSHUFB;
17790
17791 // Try to create an in-lane repeating shuffle mask and then shuffle the
17792 // results into the target lanes.
17793 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17794 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17795 return V;
17796
17797 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17798 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17799 return Result;
17800
17801 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17802 Zeroable, Subtarget, DAG))
17803 return Blend;
17804
17805 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17806 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17807 // PALIGNR will be cheaper than the second PSHUFB+OR.
17808 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17809 Mask, Subtarget, DAG))
17810 return V;
17811
17812 // If we can't directly blend but can use PSHUFB, that will be better as it
17813 // can both shuffle and set up the inefficient blend.
17814 bool V1InUse, V2InUse;
17815 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17816 DAG, V1InUse, V2InUse);
17817 }
17818
17819 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17820 // shuffle.
17821 if (!V2.isUndef())
17822 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17823 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17824 return Result;
17825
17826 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17827 if (Subtarget.hasVBMI())
17828 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17829
17830 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17831}
17832
17833/// High-level routine to lower various 512-bit x86 vector shuffles.
17834///
17835/// This routine either breaks down the specific type of a 512-bit x86 vector
17836/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17837/// together based on the available instructions.
17838 static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17839 MVT VT, SDValue V1, SDValue V2,
17840 const APInt &Zeroable,
17841 const X86Subtarget &Subtarget,
17842 SelectionDAG &DAG) {
17843 assert(Subtarget.hasAVX512() &&
17844 "Cannot lower 512-bit vectors w/ basic ISA!");
17845
17846 // If we have a single input to the zero element, insert that into V1 if we
17847 // can do so cheaply.
17848 int NumElts = Mask.size();
17849 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17850
17851 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17852 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17853 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17854 return Insertion;
17855
17856 // Handle special cases where the lower or upper half is UNDEF.
17857 if (SDValue V =
17858 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17859 return V;
17860
17861 // Check for being able to broadcast a single element.
17862 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17863 Subtarget, DAG))
17864 return Broadcast;
17865
17866 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17867 // Try using bit ops for masking and blending before falling back to
17868 // splitting.
17869 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17870 Subtarget, DAG))
17871 return V;
17872 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17873 return V;
17874
17875 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17876 }
17877
17878 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17879 if (!Subtarget.hasBWI())
17880 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17881 /*SimpleOnly*/ false);
17882
17883 V1 = DAG.getBitcast(MVT::v32i16, V1);
17884 V2 = DAG.getBitcast(MVT::v32i16, V2);
17885 return DAG.getBitcast(VT,
17886 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17887 }
17888
17889 // Dispatch to each element type for lowering. If we don't have support for
17890 // specific element type shuffles at 512 bits, immediately split them and
17891 // lower them. Each lowering routine of a given type is allowed to assume that
17892 // the requisite ISA extensions for that element type are available.
17893 switch (VT.SimpleTy) {
17894 case MVT::v8f64:
17895 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17896 case MVT::v16f32:
17897 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17898 case MVT::v8i64:
17899 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17900 case MVT::v16i32:
17901 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17902 case MVT::v32i16:
17903 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17904 case MVT::v64i8:
17905 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17906
17907 default:
17908 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17909 }
17910}
17911
17912 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17913 MVT VT, SDValue V1, SDValue V2,
17914 const X86Subtarget &Subtarget,
17915 SelectionDAG &DAG) {
17916 // Shuffle should be unary.
17917 if (!V2.isUndef())
17918 return SDValue();
17919
17920 int ShiftAmt = -1;
17921 int NumElts = Mask.size();
17922 for (int i = 0; i != NumElts; ++i) {
17923 int M = Mask[i];
17924 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17925 "Unexpected mask index.");
17926 if (M < 0)
17927 continue;
17928
17929 // The first non-undef element determines our shift amount.
17930 if (ShiftAmt < 0) {
17931 ShiftAmt = M - i;
17932 // Need to be shifting right.
17933 if (ShiftAmt <= 0)
17934 return SDValue();
17935 }
17936 // All non-undef elements must shift by the same amount.
17937 if (ShiftAmt != M - i)
17938 return SDValue();
17939 }
17940 assert(ShiftAmt >= 0 && "All undef?");
17941
17942 // Great we found a shift right.
17943 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17944 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17945 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17946 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17947 DAG.getVectorIdxConstant(0, DL));
17948}
17949
17950// Determine if this shuffle can be implemented with a KSHIFT instruction.
17951// Returns the shift amount if possible or -1 if not. This is a simplified
17952// version of matchShuffleAsShift.
17953static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17954 int MaskOffset, const APInt &Zeroable) {
17955 int Size = Mask.size();
17956
17957 auto CheckZeros = [&](int Shift, bool Left) {
17958 for (int j = 0; j < Shift; ++j)
17959 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17960 return false;
17961
17962 return true;
17963 };
17964
17965 auto MatchShift = [&](int Shift, bool Left) {
17966 unsigned Pos = Left ? Shift : 0;
17967 unsigned Low = Left ? 0 : Shift;
17968 unsigned Len = Size - Shift;
17969 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17970 };
17971
17972 for (int Shift = 1; Shift != Size; ++Shift)
17973 for (bool Left : {true, false})
17974 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17975 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17976 return Shift;
17977 }
17978
17979 return -1;
17980}
17981
17982
17983// Lower vXi1 vector shuffles.
17984 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17985 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17986 // vector, shuffle, and then truncate it back.
17987 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17988 MVT VT, SDValue V1, SDValue V2,
17989 const APInt &Zeroable,
17990 const X86Subtarget &Subtarget,
17991 SelectionDAG &DAG) {
17992 assert(Subtarget.hasAVX512() &&
17993 "Cannot lower 512-bit vectors w/o basic ISA!");
17994
17995 int NumElts = Mask.size();
17996 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17997
17998 // Try to recognize shuffles that are just padding a subvector with zeros.
17999 int SubvecElts = 0;
18000 int Src = -1;
18001 for (int i = 0; i != NumElts; ++i) {
18002 if (Mask[i] >= 0) {
18003 // Grab the source from the first valid mask. All subsequent elements need
18004 // to use this same source.
18005 if (Src < 0)
18006 Src = Mask[i] / NumElts;
18007 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18008 break;
18009 }
18010
18011 ++SubvecElts;
18012 }
18013 assert(SubvecElts != NumElts && "Identity shuffle?");
18014
18015 // Clip to a power of 2.
18016 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
18017
18018 // Make sure the number of zeroable bits in the top at least covers the bits
18019 // not covered by the subvector.
18020 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
18021 assert(Src >= 0 && "Expected a source!");
18022 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18023 SDValue Extract =
18024 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
18025 DAG.getVectorIdxConstant(0, DL));
18026 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18027 DAG.getConstant(0, DL, VT), Extract,
18028 DAG.getVectorIdxConstant(0, DL));
18029 }
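// Illustrative example (editorial sketch): a v8i1 shuffle that takes elements
// 0..3 from V1 and whose upper four result elements are known zero keeps a
// 4-element identity run, so the code above extracts the low v4i1 of V1 and
// inserts it into an all-zeros v8i1 instead of doing any real mask shuffle.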
18030
18031 // Try a simple shift right with undef elements. Later we'll try with zeros.
18032 if (SDValue Shift =
18033 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
18034 return Shift;
18035
18036 // Try to match KSHIFTs.
18037 unsigned Offset = 0;
18038 for (SDValue V : {V1, V2}) {
18039 unsigned Opcode;
18040 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18041 if (ShiftAmt >= 0) {
18042 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
18043 MVT WideVT = Res.getSimpleValueType();
18044 // Widened right shifts need two shifts to ensure we shift in zeroes.
18045 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18046 int WideElts = WideVT.getVectorNumElements();
18047 // Shift left to put the original vector in the MSBs of the new size.
18048 Res =
18049 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18050 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18051 // Increase the shift amount to account for the left shift.
18052 ShiftAmt += WideElts - NumElts;
18053 }
18054
18055 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18056 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18057 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18058 DAG.getVectorIdxConstant(0, DL));
18059 }
18060 Offset += NumElts; // Increment for next iteration.
18061 }
18062
18063 // If we're performing an unary shuffle on a SETCC result, try to shuffle the
18064 // ops instead.
18065 // TODO: What other unary shuffles would benefit from this?
18066 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
18067 SDValue Op0 = V1.getOperand(0);
18068 SDValue Op1 = V1.getOperand(1);
18069 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
18070 EVT OpVT = Op0.getValueType();
18071 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
18072 return DAG.getSetCC(
18073 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18074 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18075 }
18076
18077 MVT ExtVT;
18078 switch (VT.SimpleTy) {
18079 default:
18080 llvm_unreachable("Expected a vector of i1 elements");
18081 case MVT::v2i1:
18082 ExtVT = MVT::v2i64;
18083 break;
18084 case MVT::v4i1:
18085 ExtVT = MVT::v4i32;
18086 break;
18087 case MVT::v8i1:
18088 // Take a 512-bit type since more shuffles are available on KNL. If we have
18089 // VLX, use a 256-bit shuffle.
18090 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18091 break;
18092 case MVT::v16i1:
18093 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18094 // 256-bit operation available.
18095 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18096 break;
18097 case MVT::v32i1:
18098 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18099 // 256-bit operation available.
18100 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18101 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18102 break;
18103 case MVT::v64i1:
18104 // Fall back to scalarization. FIXME: We can do better if the shuffle
18105 // can be partitioned cleanly.
18106 if (!Subtarget.useBWIRegs())
18107 return SDValue();
18108 ExtVT = MVT::v64i8;
18109 break;
18110 }
18111
18112 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18113 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18114
18115 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18116 // i1 was sign-extended, so we can use X86ISD::CVT2MASK.
18117 int NumElems = VT.getVectorNumElements();
18118 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18119 (Subtarget.hasDQI() && (NumElems < 32)))
18120 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18121 Shuffle, ISD::SETGT);
18122
18123 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18124}
18125
18126/// Helper function that returns true if the shuffle mask should be
18127/// commuted to improve canonicalization.
18128 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18129 int NumElements = Mask.size();
18130
18131 int NumV1Elements = 0, NumV2Elements = 0;
18132 for (int M : Mask)
18133 if (M < 0)
18134 continue;
18135 else if (M < NumElements)
18136 ++NumV1Elements;
18137 else
18138 ++NumV2Elements;
18139
18140 // Commute the shuffle as needed such that more elements come from V1 than
18141 // V2. This allows us to match the shuffle pattern strictly on how many
18142 // elements come from V1 without handling the symmetric cases.
18143 if (NumV2Elements > NumV1Elements)
18144 return true;
18145
18146 assert(NumV1Elements > 0 && "No V1 indices");
18147
18148 if (NumV2Elements == 0)
18149 return false;
18150
18151 // When the number of V1 and V2 elements are the same, try to minimize the
18152 // number of uses of V2 in the low half of the vector. When that is tied,
18153 // ensure that the sum of indices for V1 is equal to or lower than the sum of
18154 // indices for V2. When those are equal, try to ensure that the number of odd
18155 // indices for V1 is lower than the number of odd indices for V2.
18156 if (NumV1Elements == NumV2Elements) {
18157 int LowV1Elements = 0, LowV2Elements = 0;
18158 for (int M : Mask.slice(0, NumElements / 2))
18159 if (M >= NumElements)
18160 ++LowV2Elements;
18161 else if (M >= 0)
18162 ++LowV1Elements;
18163 if (LowV2Elements > LowV1Elements)
18164 return true;
18165 if (LowV2Elements == LowV1Elements) {
18166 int SumV1Indices = 0, SumV2Indices = 0;
18167 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18168 if (Mask[i] >= NumElements)
18169 SumV2Indices += i;
18170 else if (Mask[i] >= 0)
18171 SumV1Indices += i;
18172 if (SumV2Indices < SumV1Indices)
18173 return true;
18174 if (SumV2Indices == SumV1Indices) {
18175 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18176 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18177 if (Mask[i] >= NumElements)
18178 NumV2OddIndices += i % 2;
18179 else if (Mask[i] >= 0)
18180 NumV1OddIndices += i % 2;
18181 if (NumV2OddIndices < NumV1OddIndices)
18182 return true;
18183 }
18184 }
18185 }
18186
18187 return false;
18188}
18189
18190 static bool canCombineAsMaskOperation(SDValue V,
18191 const X86Subtarget &Subtarget) {
18192 if (!Subtarget.hasAVX512())
18193 return false;
18194
18195 if (!V.getValueType().isSimple())
18196 return false;
18197
18198 MVT VT = V.getSimpleValueType().getScalarType();
18199 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
18200 return false;
18201
18202 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
18203 // are preferable to blendw/blendvb/masked-mov.
18204 if ((VT == MVT::i16 || VT == MVT::i8) &&
18205 V.getSimpleValueType().getSizeInBits() < 512)
18206 return false;
18207
18208 auto HasMaskOperation = [&](SDValue V) {
18209 // TODO: Currently we only check a limited set of opcodes. We could probably
18210 // extend this to all binary operations by checking TLI.isBinOp().
18211 switch (V->getOpcode()) {
18212 default:
18213 return false;
18214 case ISD::ADD:
18215 case ISD::SUB:
18216 case ISD::AND:
18217 case ISD::XOR:
18218 case ISD::OR:
18219 case ISD::SMAX:
18220 case ISD::SMIN:
18221 case ISD::UMAX:
18222 case ISD::UMIN:
18223 case ISD::ABS:
18224 case ISD::SHL:
18225 case ISD::SRL:
18226 case ISD::SRA:
18227 case ISD::MUL:
18228 break;
18229 }
18230 if (!V->hasOneUse())
18231 return false;
18232
18233 return true;
18234 };
18235
18236 if (HasMaskOperation(V))
18237 return true;
18238
18239 return false;
18240}
18241
18242// Forward declaration.
18243 static SDValue canonicalizeShuffleMaskWithHorizOp(
18244 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
18245 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18246 const X86Subtarget &Subtarget);
18247
18248 /// Top-level lowering for x86 vector shuffles.
18249///
18250/// This handles decomposition, canonicalization, and lowering of all x86
18251/// vector shuffles. Most of the specific lowering strategies are encapsulated
18252/// above in helper routines. The canonicalization attempts to widen shuffles
18253/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18254/// s.t. only one of the two inputs needs to be tested, etc.
18255 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18256 SelectionDAG &DAG) {
18257 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18258 ArrayRef<int> OrigMask = SVOp->getMask();
18259 SDValue V1 = Op.getOperand(0);
18260 SDValue V2 = Op.getOperand(1);
18261 MVT VT = Op.getSimpleValueType();
18262 int NumElements = VT.getVectorNumElements();
18263 SDLoc DL(Op);
18264 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18265
18266 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18267 "Can't lower MMX shuffles");
18268
18269 bool V1IsUndef = V1.isUndef();
18270 bool V2IsUndef = V2.isUndef();
18271 if (V1IsUndef && V2IsUndef)
18272 return DAG.getUNDEF(VT);
18273
18274 // When we create a shuffle node we put the UNDEF node as the second operand,
18275 // but in some cases the first operand may be transformed to UNDEF.
18276 // In this case we should just commute the node.
18277 if (V1IsUndef)
18278 return DAG.getCommutedVectorShuffle(*SVOp);
18279
18280 // Check for non-undef masks pointing at an undef vector and make the masks
18281 // undef as well. This makes it easier to match the shuffle based solely on
18282 // the mask.
18283 if (V2IsUndef &&
18284 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18285 SmallVector<int, 8> NewMask(OrigMask);
18286 for (int &M : NewMask)
18287 if (M >= NumElements)
18288 M = -1;
18289 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18290 }
18291
18292 // Check for illegal shuffle mask element index values.
18293 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18294 (void)MaskUpperLimit;
18295 assert(llvm::all_of(OrigMask,
18296 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18297 "Out of bounds shuffle index");
18298
18299 // We actually see shuffles that are entirely re-arrangements of a set of
18300 // zero inputs. This mostly happens while decomposing complex shuffles into
18301 // simple ones. Directly lower these as a buildvector of zeros.
18302 APInt KnownUndef, KnownZero;
18303 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18304
18305 APInt Zeroable = KnownUndef | KnownZero;
18306 if (Zeroable.isAllOnes())
18307 return getZeroVector(VT, Subtarget, DAG, DL);
18308
18309 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18310
18311 // Try to collapse shuffles into using a vector type with fewer elements but
18312 // wider element types. We cap this to not form integers or floating point
18313 // elements wider than 64 bits. It does not seem beneficial to form i128
18314 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
18315 SmallVector<int, 16> WidenedMask;
18316 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18317 !canCombineAsMaskOperation(V1, Subtarget) &&
18318 !canCombineAsMaskOperation(V2, Subtarget) &&
18319 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18320 // Shuffle mask widening should not interfere with a broadcast opportunity
18321 // by obfuscating the operands with bitcasts.
18322 // TODO: Avoid lowering directly from this top-level function: make this
18323 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18324 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18325 Subtarget, DAG))
18326 return Broadcast;
18327
18328 MVT NewEltVT = VT.isFloatingPoint()
18329 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18330 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18331 int NewNumElts = NumElements / 2;
18332 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18333 // Make sure that the new vector type is legal. For example, v2f64 isn't
18334 // legal on SSE1.
18335 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18336 if (V2IsZero) {
18337 // Modify the new Mask to take all zeros from the all-zero vector.
18338 // Choose indices that are blend-friendly.
18339 bool UsedZeroVector = false;
18340 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18341 "V2's non-undef elements are used?!");
18342 for (int i = 0; i != NewNumElts; ++i)
18343 if (WidenedMask[i] == SM_SentinelZero) {
18344 WidenedMask[i] = i + NewNumElts;
18345 UsedZeroVector = true;
18346 }
18347 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18348 // some elements to be undef.
18349 if (UsedZeroVector)
18350 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18351 }
18352 V1 = DAG.getBitcast(NewVT, V1);
18353 V2 = DAG.getBitcast(NewVT, V2);
18354 return DAG.getBitcast(
18355 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18356 }
18357 }
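// Illustrative example (editorial sketch): widening halves the element count
// whenever each even/odd pair of mask entries addresses one natural wider
// element, e.g. a v8i16 mask <0,1,2,3,8,9,10,11> becomes the v4i32 mask
// <0,1,4,5>, which the 32-bit lowering can typically handle with a single
// unpack or blend.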
18358
18359 SmallVector<SDValue> Ops = {V1, V2};
18360 SmallVector<int> Mask(OrigMask);
18361
18362 // Canonicalize the shuffle with any horizontal ops inputs.
18363 // NOTE: This may update Ops and Mask.
18364 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18365 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18366 return DAG.getBitcast(VT, HOp);
18367
18368 V1 = DAG.getBitcast(VT, Ops[0]);
18369 V2 = DAG.getBitcast(VT, Ops[1]);
18370 assert(NumElements == (int)Mask.size() &&
18371 "canonicalizeShuffleMaskWithHorizOp "
18372 "shouldn't alter the shuffle mask size");
18373
18374 // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
18375 // These will be materialized uniformly anyway, so make splat matching easier.
18376 // TODO: Allow all int constants?
18377 auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
18378 if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
18379 BitVector Undefs;
18380 if (SDValue Splat = BV->getSplatValue(&Undefs)) {
18381 if (Undefs.any() &&
18384 V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
18385 }
18386 }
18387 }
18388 return V;
18389 };
18390 V1 = CanonicalizeConstant(V1);
18391 V2 = CanonicalizeConstant(V2);
18392
18393 // Commute the shuffle if it will improve canonicalization.
18394 if (canonicalizeShuffleMaskWithCommute(Mask, NumElements, NumElements)) {
18395 ShuffleVectorSDNode::commuteMask(Mask);
18396 std::swap(V1, V2);
18397 }
18398
18399 // For each vector width, delegate to a specialized lowering routine.
18400 if (VT.is128BitVector())
18401 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18402
18403 if (VT.is256BitVector())
18404 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18405
18406 if (VT.is512BitVector())
18407 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18408
18409 if (Is1BitVector)
18410 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18411
18412 llvm_unreachable("Unimplemented!");
18413}
18414
18415// As legal vpcompress instructions depend on various AVX512 extensions, try to
18416// convert illegal vector sizes to legal ones to avoid expansion.
18417 static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget,
18418 SelectionDAG &DAG) {
18419 assert(Subtarget.hasAVX512() &&
18420 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18421
18422 SDLoc DL(Op);
18423 SDValue Vec = Op.getOperand(0);
18424 SDValue Mask = Op.getOperand(1);
18425 SDValue Passthru = Op.getOperand(2);
18426
18427 EVT VecVT = Vec.getValueType();
18428 EVT ElementVT = VecVT.getVectorElementType();
18429 unsigned NumElements = VecVT.getVectorNumElements();
18430 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18431 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18432
18433 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18434 // compressed as 512-bit vectors in AVX512F.
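// For example, a v4i32 VECTOR_COMPRESS is widened to v16i32 (with the mask
// widened to v16i1 and the new lanes zeroed), compressed as a 512-bit
// operation, and the low 128 bits are then extracted back out.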
18435 if (NumVecBits != 128 && NumVecBits != 256)
18436 return SDValue();
18437
18438 if (NumElementBits == 32 || NumElementBits == 64) {
18439 unsigned NumLargeElements = 512 / NumElementBits;
18440 MVT LargeVecVT =
18441 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18442 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18443
18444 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18445 DAG, DL);
18446 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18447 Subtarget, DAG, DL);
18448 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18449 : widenSubVector(LargeVecVT, Passthru,
18450 /*ZeroNewElements=*/false,
18451 Subtarget, DAG, DL);
18452
18453 SDValue Compressed =
18454 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18455 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18456 DAG.getConstant(0, DL, MVT::i64));
18457 }
18458
18459 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18460 VecVT == MVT::v16i16) {
18461 MVT LargeElementVT = MVT::getIntegerVT(512 / NumElements);
18462 EVT LargeVecVT = MVT::getVectorVT(LargeElementVT, NumElements);
18463
18464 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18465 Passthru = Passthru.isUndef()
18466 ? DAG.getUNDEF(LargeVecVT)
18467 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18468
18469 SDValue Compressed =
18470 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18471 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18472 }
18473
18474 return SDValue();
18475}
18476
18477/// Try to lower a VSELECT instruction to a vector shuffle.
18478 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18479 const X86Subtarget &Subtarget,
18480 SelectionDAG &DAG) {
18481 SDValue Cond = Op.getOperand(0);
18482 SDValue LHS = Op.getOperand(1);
18483 SDValue RHS = Op.getOperand(2);
18484 MVT VT = Op.getSimpleValueType();
18485
18486 // Only non-legal VSELECTs reach this lowering, convert those into generic
18487 // shuffles and re-use the shuffle lowering path for blends.
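// For example, a v4i32 (vselect <-1,0,-1,0>, %a, %b) becomes the shuffle
// (shuffle %a, %b, <0,5,2,7>): true lanes pick the element of %a, false
// lanes pick the corresponding element of %b.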
18488 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18489 SmallVector<int, 32> Mask;
18490 if (createShuffleMaskFromVSELECT(Mask, Cond))
18491 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18492 }
18493
18494 return SDValue();
18495}
18496
18497SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18498 SDValue Cond = Op.getOperand(0);
18499 SDValue LHS = Op.getOperand(1);
18500 SDValue RHS = Op.getOperand(2);
18501
18502 SDLoc dl(Op);
18503 MVT VT = Op.getSimpleValueType();
18504 if (isSoftF16(VT, Subtarget)) {
18505 MVT NVT = VT.changeVectorElementTypeToInteger();
18506 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18507 DAG.getBitcast(NVT, LHS),
18508 DAG.getBitcast(NVT, RHS)));
18509 }
18510
18511 // A vselect where all conditions and data are constants can be optimized into
18512 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18513 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18514 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18515 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18516 return SDValue();
18517
18518 // Try to lower this to a blend-style vector shuffle. This can handle all
18519 // constant condition cases.
18520 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18521 return BlendOp;
18522
18523 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18524 // with patterns on the mask registers on AVX-512.
18525 MVT CondVT = Cond.getSimpleValueType();
18526 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18527 if (CondEltSize == 1)
18528 return Op;
18529
18530 // Variable blends are only legal from SSE4.1 onward.
18531 if (!Subtarget.hasSSE41())
18532 return SDValue();
18533
18534 unsigned EltSize = VT.getScalarSizeInBits();
18535 unsigned NumElts = VT.getVectorNumElements();
18536
18537 // Expand v32i16/v64i8 without BWI.
18538 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18539 return SDValue();
18540
18541 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18542 // into an i1 condition so that we can use the mask-based 512-bit blend
18543 // instructions.
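// For example, a v16i32 condition is turned into a v16i1 mask by comparing
// it SETNE against zero, so the AVX512 masked blend patterns can be used.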
18544 if (VT.getSizeInBits() == 512) {
18545 // Build a mask by testing the condition against zero.
18546 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18547 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18548 DAG.getConstant(0, dl, CondVT),
18549 ISD::SETNE);
18550 // Now return a new VSELECT using the mask.
18551 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18552 }
18553
18554 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18555 if (CondEltSize != EltSize) {
18556 // If we don't have a sign splat, rely on the expansion.
18557 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18558 return SDValue();
18559
18560 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18561 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18562 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18563 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18564 }
18565
18566 // v16i16/v32i8 selects without AVX2, if the condition and another operand
18567 // are free to split, then better to split before expanding the
18568 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18569 // TODO: This is very similar to narrowVectorSelect.
18570 // TODO: Add Load splitting to isFreeToSplitVector ?
18571 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18572 !Subtarget.hasXOP()) {
18573 bool FreeCond = isFreeToSplitVector(Cond, DAG);
18574 bool FreeLHS = isFreeToSplitVector(LHS, DAG) ||
18575 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18576 bool FreeRHS = isFreeToSplitVector(RHS, DAG) ||
18577 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18578 if (FreeCond && (FreeLHS || FreeRHS))
18579 return splitVectorOp(Op, DAG, dl);
18580 }
18581
18582 // Only some types will be legal on some subtargets. If we can emit a legal
18583 // VSELECT-matching blend, return Op; if we need to expand, return a null
18584 // value.
18585 switch (VT.SimpleTy) {
18586 default:
18587 // Most of the vector types have blends past SSE4.1.
18588 return Op;
18589
18590 case MVT::v32i8:
18591 // The byte blends for AVX vectors were introduced only in AVX2.
18592 if (Subtarget.hasAVX2())
18593 return Op;
18594
18595 return SDValue();
18596
18597 case MVT::v8i16:
18598 case MVT::v16i16:
18599 case MVT::v8f16:
18600 case MVT::v16f16: {
18601 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18602 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18603 Cond = DAG.getBitcast(CastVT, Cond);
18604 LHS = DAG.getBitcast(CastVT, LHS);
18605 RHS = DAG.getBitcast(CastVT, RHS);
18606 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18607 return DAG.getBitcast(VT, Select);
18608 }
18609 }
18610}
18611
18612 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18613 MVT VT = Op.getSimpleValueType();
18614 SDValue Vec = Op.getOperand(0);
18615 SDValue Idx = Op.getOperand(1);
18616 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18617 SDLoc dl(Op);
18618
18619 if (!Vec.getSimpleValueType().is128BitVector())
18620 return SDValue();
18621
18622 if (VT.getSizeInBits() == 8) {
18623 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18624 // we're going to zero extend the register or fold the store.
18625 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
18626 !X86::mayFoldIntoStore(Op))
18627 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18628 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18629 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18630
18631 unsigned IdxVal = Idx->getAsZExtVal();
18632 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18633 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18634 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18635 }
18636
18637 if (VT == MVT::f32) {
18638 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18639 // the result back to FR32 register. It's only worth matching if the
18640 // result has a single use which is a store or a bitcast to i32. And in
18641 // the case of a store, it's not worth it if the index is a constant 0,
18642 // because a MOVSSmr can be used instead, which is smaller and faster.
18643 if (!Op.hasOneUse())
18644 return SDValue();
18645 SDNode *User = *Op.getNode()->user_begin();
18646 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18647 (User->getOpcode() != ISD::BITCAST ||
18648 User->getValueType(0) != MVT::i32))
18649 return SDValue();
18650 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18651 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18652 return DAG.getBitcast(MVT::f32, Extract);
18653 }
18654
18655 if (VT == MVT::i32 || VT == MVT::i64)
18656 return Op;
18657
18658 return SDValue();
18659}
18660
18661/// Extract one bit from mask vector, like v16i1 or v8i1.
18662/// AVX-512 feature.
18663 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18664 const X86Subtarget &Subtarget) {
18665 SDValue Vec = Op.getOperand(0);
18666 SDLoc dl(Vec);
18667 MVT VecVT = Vec.getSimpleValueType();
18668 SDValue Idx = Op.getOperand(1);
18669 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18670 MVT EltVT = Op.getSimpleValueType();
18671
18672 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18673 "Unexpected vector type in ExtractBitFromMaskVector");
18674
18675 // A variable index can't be handled in mask registers, so extend the
18676 // vector to VR512/VR128.
18677 if (!IdxC) {
18678 unsigned NumElts = VecVT.getVectorNumElements();
18679 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
18680 // than extending to 128/256-bit.
18681 if (NumElts == 1) {
18682 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18683 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
18684 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18685 }
18686 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18687 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18688 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18689 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18690 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18691 }
18692
18693 unsigned IdxVal = IdxC->getZExtValue();
18694 if (IdxVal == 0) // the operation is legal
18695 return Op;
18696
18697 // Extend to natively supported kshift.
18698 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18699
18700 // Use kshiftr instruction to move to the lower element.
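// For example, extracting bit 3 of a v16i1 mask becomes KSHIFTR k, 3 to move
// bit 3 down to bit 0, followed by an extract of element 0.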
18701 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18702 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18703
18704 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18705 DAG.getVectorIdxConstant(0, dl));
18706}
18707
18708// Helper to find all the extracted elements from a vector.
18709 static APInt getExtractedDemandedElts(SDNode *N) {
18710 MVT VT = N->getSimpleValueType(0);
18711 unsigned NumElts = VT.getVectorNumElements();
18712 APInt DemandedElts = APInt::getZero(NumElts);
18713 for (SDNode *User : N->users()) {
18714 switch (User->getOpcode()) {
18715 case X86ISD::PEXTRB:
18716 case X86ISD::PEXTRW:
18717 case ISD::EXTRACT_VECTOR_ELT:
18718 if (!isa<ConstantSDNode>(User->getOperand(1))) {
18719 DemandedElts.setAllBits();
18720 return DemandedElts;
18721 }
18722 DemandedElts.setBit(User->getConstantOperandVal(1));
18723 break;
18724 case ISD::BITCAST: {
18725 if (!User->getValueType(0).isSimple() ||
18726 !User->getValueType(0).isVector()) {
18727 DemandedElts.setAllBits();
18728 return DemandedElts;
18729 }
18730 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18731 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18732 break;
18733 }
18734 default:
18735 DemandedElts.setAllBits();
18736 return DemandedElts;
18737 }
18738 }
18739 return DemandedElts;
18740}
18741
18742SDValue
18743X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18744 SelectionDAG &DAG) const {
18745 SDLoc dl(Op);
18746 SDValue Vec = Op.getOperand(0);
18747 MVT VecVT = Vec.getSimpleValueType();
18748 SDValue Idx = Op.getOperand(1);
18749 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18750
18751 if (VecVT.getVectorElementType() == MVT::i1)
18752 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18753
18754 if (!IdxC) {
18755 // It's more profitable to go through memory (1 cycle throughput)
18756 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
18757 // IACA tool was used to get performance estimation
18758 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18759 //
18760 // example : extractelement <16 x i8> %a, i32 %i
18761 //
18762 // Block Throughput: 3.00 Cycles
18763 // Throughput Bottleneck: Port5
18764 //
18765 // | Num Of | Ports pressure in cycles | |
18766 // | Uops | 0 - DV | 5 | 6 | 7 | |
18767 // ---------------------------------------------
18768 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18769 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18770 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18771 // Total Num Of Uops: 4
18772 //
18773 //
18774 // Block Throughput: 1.00 Cycles
18775 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18776 //
18777 // | | Ports pressure in cycles | |
18778 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18779 // ---------------------------------------------------------
18780 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18781 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18782 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18783 // Total Num Of Uops: 4
18784
18785 return SDValue();
18786 }
18787
18788 unsigned IdxVal = IdxC->getZExtValue();
18789
18790 // If this is a 256-bit vector result, first extract the 128-bit vector and
18791 // then extract the element from the 128-bit vector.
18792 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18793 // Get the 128-bit vector.
18794 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18795 MVT EltVT = VecVT.getVectorElementType();
18796
18797 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18798 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18799
18800 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18801 // this can be done with a mask.
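// For example, extracting element 11 of a v16i16 first extracts the upper
// 128-bit half and then extracts element 11 & 7 == 3 from it.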
18802 IdxVal &= ElemsPerChunk - 1;
18803 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18804 DAG.getVectorIdxConstant(IdxVal, dl));
18805 }
18806
18807 assert(VecVT.is128BitVector() && "Unexpected vector length");
18808
18809 MVT VT = Op.getSimpleValueType();
18810
18811 if (VT == MVT::i16) {
18812 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18813 // we're going to zero extend the register or fold the store (SSE41 only).
18814 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18815 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18816 if (Subtarget.hasFP16())
18817 return Op;
18818
18819 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18820 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18821 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18822 }
18823
18824 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18825 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18826 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18827 }
18828
18829 if (Subtarget.hasSSE41())
18830 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18831 return Res;
18832
18833 // Only extract a single element from a v16i8 source - determine the common
18834 // DWORD/WORD that all extractions share, and extract the sub-byte.
18835 // TODO: Add QWORD MOVQ extraction?
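// For example, if only element 5 of a v16i8 is extracted, it is read as word
// 2 (bytes 4-5) of the equivalent v8i16 and shifted right by 8 to isolate
// the byte.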
18836 if (VT == MVT::i8) {
18837 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18838 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18839
18840 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18841 int DWordIdx = IdxVal / 4;
18842 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18843 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18844 DAG.getBitcast(MVT::v4i32, Vec),
18845 DAG.getVectorIdxConstant(DWordIdx, dl));
18846 int ShiftVal = (IdxVal % 4) * 8;
18847 if (ShiftVal != 0)
18848 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18849 DAG.getConstant(ShiftVal, dl, MVT::i8));
18850 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18851 }
18852
18853 int WordIdx = IdxVal / 2;
18854 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18855 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18856 DAG.getBitcast(MVT::v8i16, Vec),
18857 DAG.getVectorIdxConstant(WordIdx, dl));
18858 int ShiftVal = (IdxVal % 2) * 8;
18859 if (ShiftVal != 0)
18860 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18861 DAG.getConstant(ShiftVal, dl, MVT::i8));
18862 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18863 }
18864 }
18865
18866 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18867 if (IdxVal == 0)
18868 return Op;
18869
18870 // Shuffle the element to the lowest element, then movss or movsh.
18871 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18872 Mask[0] = static_cast<int>(IdxVal);
18873 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18874 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18875 DAG.getVectorIdxConstant(0, dl));
18876 }
18877
18878 if (VT.getSizeInBits() == 64) {
18879 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18880 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18881 // to match extract_elt for f64.
18882 if (IdxVal == 0)
18883 return Op;
18884
18885 // UNPCKHPD the element to the lowest double word, then movsd.
18886 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18887 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18888 int Mask[2] = { 1, -1 };
18889 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18890 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18891 DAG.getVectorIdxConstant(0, dl));
18892 }
18893
18894 return SDValue();
18895}
18896
18897/// Insert one bit to mask vector, like v16i1 or v8i1.
18898/// AVX-512 feature.
18899 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18900 const X86Subtarget &Subtarget) {
18901 SDLoc dl(Op);
18902 SDValue Vec = Op.getOperand(0);
18903 SDValue Elt = Op.getOperand(1);
18904 SDValue Idx = Op.getOperand(2);
18905 MVT VecVT = Vec.getSimpleValueType();
18906
18907 if (!isa<ConstantSDNode>(Idx)) {
18908 // Non-constant index. Extend the source and destination,
18909 // insert element and then truncate the result.
18910 unsigned NumElts = VecVT.getVectorNumElements();
18911 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18912 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18913 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18914 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18915 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18916 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18917 }
18918
18919 // Copy into a k-register, extract to v1i1 and insert_subvector.
18920 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18921 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18922}
18923
18924SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18925 SelectionDAG &DAG) const {
18926 MVT VT = Op.getSimpleValueType();
18927 MVT EltVT = VT.getVectorElementType();
18928 unsigned NumElts = VT.getVectorNumElements();
18929 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18930
18931 if (EltVT == MVT::i1)
18932 return InsertBitToMaskVector(Op, DAG, Subtarget);
18933
18934 SDLoc dl(Op);
18935 SDValue N0 = Op.getOperand(0);
18936 SDValue N1 = Op.getOperand(1);
18937 SDValue N2 = Op.getOperand(2);
18938 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18939
18940 if (EltVT == MVT::bf16) {
18941 MVT IVT = VT.changeVectorElementTypeToInteger();
18942 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18943 DAG.getBitcast(IVT, N0),
18944 DAG.getBitcast(MVT::i16, N1), N2);
18945 return DAG.getBitcast(VT, Res);
18946 }
18947
18948 if (!N2C) {
18949 // Variable insertion indices, usually we're better off spilling to stack,
18950 // but AVX512 can use a variable compare+select by comparing against all
18951 // possible vector indices, and FP insertion has less gpr->simd traffic.
18952 if (!(Subtarget.hasBWI() ||
18953 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18954 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18955 return SDValue();
18956
18957 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18958 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18959 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18960 return SDValue();
18961
18962 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18963 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18964 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18965
18966 SmallVector<SDValue, 16> RawIndices;
18967 for (unsigned I = 0; I != NumElts; ++I)
18968 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18969 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18970
18971 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18972 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18973 ISD::CondCode::SETEQ);
18974 }
18975
18976 if (N2C->getAPIntValue().uge(NumElts))
18977 return SDValue();
18978 uint64_t IdxVal = N2C->getZExtValue();
18979
18980 bool IsZeroElt = X86::isZeroNode(N1);
18981 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18982
18983 if (IsZeroElt || IsAllOnesElt) {
18984 // Lower insertion of v16i8/v32i8/v64i16 -1 elts as an 'OR' blend.
18985 // We don't deal with i8 0 since it appears to be handled elsewhere.
18986 if (IsAllOnesElt &&
18987 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18988 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18989 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18990 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18991 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18992 CstVectorElts[IdxVal] = OnesCst;
18993 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18994 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18995 }
18996 // See if we can do this more efficiently with a blend shuffle with a
18997 // rematerializable vector.
18998 if (Subtarget.hasSSE41() &&
18999 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
19000 SmallVector<int, 8> BlendMask;
19001 for (unsigned i = 0; i != NumElts; ++i)
19002 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19003 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19004 : getOnesVector(VT, DAG, dl);
19005 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19006 }
19007 }
19008
19009 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19010 // into that, and then insert the subvector back into the result.
19011 if (VT.is256BitVector() || VT.is512BitVector()) {
19012 // With a 256-bit vector, we can insert into the zero element efficiently
19013 // using a blend if we have AVX or AVX2 and the right data type.
19014 if (VT.is256BitVector() && IdxVal == 0) {
19015 // TODO: It is worthwhile to cast integer to floating point and back
19016 // and incur a domain crossing penalty if that's what we'll end up
19017 // doing anyway after extracting to a 128-bit vector.
19018 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19019 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
19020 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19021 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19022 DAG.getTargetConstant(1, dl, MVT::i8));
19023 }
19024 }
19025
19026 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19027 assert(isPowerOf2_32(NumEltsIn128) &&
19028 "Vectors will always have power-of-two number of elements.");
19029
19030 // If we are not inserting into the low 128-bit vector chunk,
19031 // then prefer the broadcast+blend sequence.
19032 // FIXME: relax the profitability check iff all N1 uses are insertions.
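// For example, inserting a loaded f32 into lane 6 of a v8f32 with AVX is
// done by splatting the scalar and blending lane 6 from the splat, instead
// of extracting and reinserting the upper 128-bit half.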
19033 if (IdxVal >= NumEltsIn128 &&
19034 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19035 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19036 X86::mayFoldLoad(N1, Subtarget)))) {
19037 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19038 SmallVector<int, 8> BlendMask;
19039 for (unsigned i = 0; i != NumElts; ++i)
19040 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19041 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19042 }
19043
19044 // Get the desired 128-bit vector chunk.
19045 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19046
19047 // Insert the element into the desired chunk.
19048 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19049 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19050
19051 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19052 DAG.getVectorIdxConstant(IdxIn128, dl));
19053
19054 // Insert the changed part back into the bigger vector
19055 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19056 }
19057 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19058
19059 // This will be just movw/movd/movq/movsh/movss/movsd.
19060 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19061 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19062 EltVT == MVT::f16 || EltVT == MVT::i64) {
19063 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19064 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19065 }
19066
19067 // We can't directly insert an i8 or i16 into a vector, so zero extend
19068 // it to i32 first.
19069 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19070 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19071 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19072 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19073 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19074 return DAG.getBitcast(VT, N1);
19075 }
19076 }
19077
19078 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
19079 // argument. SSE41 required for pinsrb.
19080 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19081 unsigned Opc;
19082 if (VT == MVT::v8i16) {
19083 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19084 Opc = X86ISD::PINSRW;
19085 } else {
19086 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19087 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19088 Opc = X86ISD::PINSRB;
19089 }
19090
19091 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19092 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19093 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19094 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19095 }
19096
19097 if (Subtarget.hasSSE41()) {
19098 if (EltVT == MVT::f32) {
19099 // Bits [7:6] of the constant are the source select. This will always be
19100 // zero here. The DAG Combiner may combine an extract_elt index into
19101 // these bits. For example (insert (extract, 3), 2) could be matched by
19102 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19103 // Bits [5:4] of the constant are the destination select. This is the
19104 // value of the incoming immediate.
19105 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19106 // combine either bitwise AND or insert of float 0.0 to set these bits.
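// For example, inserting into lane 2 with no zero mask uses the immediate
// IdxVal << 4 == 0x20: destination select in bits [5:4], source select and
// zero mask left as zero.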
19107
19108 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19109 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19110 // If this is an insertion of 32-bits into the low 32-bits of
19111 // a vector, we prefer to generate a blend with immediate rather
19112 // than an insertps. Blends are simpler operations in hardware and so
19113 // will always have equal or better performance than insertps.
19114 // But if optimizing for size and there's a load folding opportunity,
19115 // generate insertps because blendps does not have a 32-bit memory
19116 // operand form.
19117 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19118 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19119 DAG.getTargetConstant(1, dl, MVT::i8));
19120 }
19121 // Create this as a scalar-to-vector.
19122 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19123 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19124 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19125 }
19126
19127 // PINSR* works with constant index.
19128 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19129 return Op;
19130 }
19131
19132 return SDValue();
19133}
19134
19135 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19136 SelectionDAG &DAG) {
19137 SDLoc dl(Op);
19138 MVT OpVT = Op.getSimpleValueType();
19139
19140 // It's always cheaper to replace a xor+movd with xorps and simplifies further
19141 // combines.
19142 if (X86::isZeroNode(Op.getOperand(0)))
19143 return getZeroVector(OpVT, Subtarget, DAG, dl);
19144
19145 // If this is a 256-bit vector result, first insert into a 128-bit
19146 // vector and then insert into the 256-bit vector.
19147 if (!OpVT.is128BitVector()) {
19148 // Insert into a 128-bit vector.
19149 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19150 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19151 OpVT.getVectorNumElements() / SizeFactor);
19152
19153 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19154
19155 // Insert the 128-bit vector.
19156 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19157 }
19158 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19159 "Expected an SSE type!");
19160
19161 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
19162 // tblgen.
19163 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19164 return Op;
19165
19166 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19167 return DAG.getBitcast(
19168 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19169}
19170
19171// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19172// simple superregister reference or explicit instructions to insert
19173// the upper bits of a vector.
19174 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19175 SelectionDAG &DAG) {
19176 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19177
19178 return insert1BitVector(Op, DAG, Subtarget);
19179}
19180
19181 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19182 SelectionDAG &DAG) {
19183 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19184 "Only vXi1 extract_subvectors need custom lowering");
19185
19186 SDLoc dl(Op);
19187 SDValue Vec = Op.getOperand(0);
19188 uint64_t IdxVal = Op.getConstantOperandVal(1);
19189
19190 if (IdxVal == 0) // the operation is legal
19191 return Op;
19192
19193 // Extend to natively supported kshift.
19194 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19195
19196 // Shift to the LSB.
19197 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
19198 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19199
19200 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19201 DAG.getVectorIdxConstant(0, dl));
19202}
19203
19204// Returns the appropriate wrapper opcode for a global reference.
19205unsigned X86TargetLowering::getGlobalWrapperKind(
19206 const GlobalValue *GV, const unsigned char OpFlags) const {
19207 // References to absolute symbols are never PC-relative.
19208 if (GV && GV->isAbsoluteSymbolRef())
19209 return X86ISD::Wrapper;
19210
19211 // The following OpFlags under RIP-rel PIC use RIP.
19212 if (Subtarget.isPICStyleRIPRel() &&
19213 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
19214 OpFlags == X86II::MO_DLLIMPORT))
19215 return X86ISD::WrapperRIP;
19216
19217 // GOTPCREL references must always use RIP.
19218 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19219 return X86ISD::WrapperRIP;
19220
19221 return X86ISD::Wrapper;
19222}
19223
19224// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19225// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
19226// one of the above mentioned nodes. It has to be wrapped because otherwise
19227// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19228// be used to form addressing mode. These wrapped nodes will be selected
19229// into MOV32ri.
19230SDValue
19231X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19232 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19233
19234 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19235 // global base reg.
19236 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19237
19238 auto PtrVT = getPointerTy(DAG.getDataLayout());
19239 SDValue Result = DAG.getTargetConstantPool(
19240 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19241 SDLoc DL(CP);
19242 Result =
19243 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19244 // With PIC, the address is actually $g + Offset.
19245 if (OpFlag) {
19246 Result =
19247 DAG.getNode(ISD::ADD, DL, PtrVT,
19248 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19249 }
19250
19251 return Result;
19252}
19253
19254SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19255 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19256
19257 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19258 // global base reg.
19259 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19260
19261 EVT PtrVT = Op.getValueType();
19262 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19263 SDLoc DL(JT);
19264 Result =
19265 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19266
19267 // With PIC, the address is actually $g + Offset.
19268 if (OpFlag)
19269 Result =
19270 DAG.getNode(ISD::ADD, DL, PtrVT,
19271 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19272
19273 return Result;
19274}
19275
19276SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19277 SelectionDAG &DAG) const {
19278 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19279}
19280
19281SDValue
19282X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19283 // Create the TargetBlockAddressAddress node.
19284 unsigned char OpFlags =
19285 Subtarget.classifyBlockAddressReference();
19286 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19287 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19288 SDLoc dl(Op);
19289 EVT PtrVT = Op.getValueType();
19290 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19291 Result =
19292 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
19293
19294 // With PIC, the address is actually $g + Offset.
19295 if (isGlobalRelativeToPICBase(OpFlags)) {
19296 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19297 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19298 }
19299
19300 return Result;
19301}
19302
19303/// Creates target global address or external symbol nodes for calls or
19304/// other uses.
19305SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19306 bool ForCall,
19307 bool *IsImpCall) const {
19308 // Unpack the global address or external symbol.
19309 SDLoc dl(Op);
19310 const GlobalValue *GV = nullptr;
19311 int64_t Offset = 0;
19312 const char *ExternalSym = nullptr;
19313 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19314 GV = G->getGlobal();
19315 Offset = G->getOffset();
19316 } else {
19317 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19318 ExternalSym = ES->getSymbol();
19319 }
19320
19321 // Calculate some flags for address lowering.
19322 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19323 unsigned char OpFlags;
19324 if (ForCall)
19325 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19326 else
19327 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19328 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19329 bool NeedsLoad = isGlobalStubReference(OpFlags);
19330
19331 CodeModel::Model M = DAG.getTarget().getCodeModel();
19332 EVT PtrVT = Op.getValueType();
19333 SDValue Result;
19334 
19335 if (GV) {
19336 // Create a target global address if this is a global. If possible, fold the
19337 // offset into the global address reference. Otherwise, ADD it on later.
19338 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19339 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19340 // relocation will compute to a negative value, which is invalid.
19341 int64_t GlobalOffset = 0;
19342 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19343 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
19344 std::swap(GlobalOffset, Offset);
19345 }
19346 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19347 } else {
19348 // If this is not a global address, this must be an external symbol.
19349 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19350 }
19351
19352 // If this is a direct call, avoid the wrapper if we don't need to do any
19353 // loads or adds. This allows SDAG ISel to match direct calls.
19354 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19355 return Result;
19356
19357 // If Import Call Optimization is enabled and this is an imported function
19358 // then make a note of it and return the global address without wrapping.
19359 if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
19360 Mod.getModuleFlag("import-call-optimization")) {
19361 assert(ForCall && "Should only enable import call optimization if we are "
19362 "lowering a call");
19363 *IsImpCall = true;
19364 return Result;
19365 }
19366
19367 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19368
19369 // With PIC, the address is actually $g + Offset.
19370 if (HasPICReg) {
19371 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19372 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19373 }
19374
19375 // For globals that require a load from a stub to get the address, emit the
19376 // load.
19377 if (NeedsLoad)
19378 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19379 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19380 
19381 // If there was a non-zero offset that we didn't fold, create an explicit
19382 // addition for it.
19383 if (Offset != 0)
19384 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19385 DAG.getSignedConstant(Offset, dl, PtrVT));
19386
19387 return Result;
19388}
19389
19390SDValue
19391X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19392 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19393}
19394
19395 static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
19396 const EVT PtrVT, unsigned ReturnReg,
19397 unsigned char OperandFlags,
19398 bool LoadGlobalBaseReg = false,
19399 bool LocalDynamic = false) {
19400 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19401 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19402 SDLoc dl(GA);
19403 SDValue TGA;
19404 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19405 SDValue Chain = DAG.getEntryNode();
19406 SDValue Ret;
19407 if (LocalDynamic && UseTLSDESC) {
19408 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19409 // Reuse existing GetTLSADDR node if we can find it.
19410 if (TGA->hasOneUse()) {
19411 // TLSDESC uses TGA.
19412 SDNode *TLSDescOp = *TGA->user_begin();
19413 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19414 "Unexpected TLSDESC DAG");
19415 // CALLSEQ_END uses TGA via a chain and glue.
19416 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19417 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19418 "Unexpected TLSDESC DAG");
19419 // CopyFromReg uses CALLSEQ_END via a chain and glue.
19420 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19421 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19422 "Unexpected TLSDESC DAG");
19423 Ret = SDValue(CopyFromRegOp, 0);
19424 }
19425 } else {
19426 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19427 GA->getOffset(), OperandFlags);
19428 }
19429
19430 if (!Ret) {
19431 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
19432 : LocalDynamic ? X86ISD::TLSBASEADDR
19433 : X86ISD::TLSADDR;
19434 
19435 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19436 if (LoadGlobalBaseReg) {
19437 SDValue InGlue;
19438 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19439 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19440 InGlue);
19441 InGlue = Chain.getValue(1);
19442 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19443 } else {
19444 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19445 }
19446 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19447
19448 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
19449 MFI.setHasCalls(true);
19450
19451 SDValue Glue = Chain.getValue(1);
19452 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19453 }
19454
19455 if (!UseTLSDESC)
19456 return Ret;
19457
19458 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19459 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
19460
19461 Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg));
19462 SDValue Offset =
19463 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19464 MachinePointerInfo(Ptr));
19465 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19466}
19467
19468// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19469static SDValue
19470 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19471 const EVT PtrVT) {
19472 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19473 /*LoadGlobalBaseReg=*/true);
19474}
19475
19476// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19477static SDValue
19478 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19479 const EVT PtrVT) {
19480 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19481}
19482
19483// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19484static SDValue
19485 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19486 const EVT PtrVT) {
19487 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19488}
19489
19490 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19491 SelectionDAG &DAG, const EVT PtrVT,
19492 bool Is64Bit, bool Is64BitLP64) {
19493 SDLoc dl(GA);
19494
19495 // Get the start address of the TLS block for this module.
19496 X86MachineFunctionInfo *MFI =
19497 DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
19498 MFI->incNumLocalDynamicTLSAccesses();
19499 
19500 SDValue Base;
19501 if (Is64Bit) {
19502 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19503 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19504 /*LoadGlobalBaseReg=*/false,
19505 /*LocalDynamic=*/true);
19506 } else {
19507 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19508 /*LoadGlobalBaseReg=*/true,
19509 /*LocalDynamic=*/true);
19510 }
19511
19512 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19513 // of Base.
19514
19515 // Build x@dtpoff.
19516 unsigned char OperandFlags = X86II::MO_DTPOFF;
19517 unsigned WrapperKind = X86ISD::Wrapper;
19518 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19519 GA->getValueType(0),
19520 GA->getOffset(), OperandFlags);
19521 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19522
19523 // Add x@dtpoff with the base.
19524 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19525}
19526
19527// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19528 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19529 const EVT PtrVT, TLSModel::Model model,
19530 bool is64Bit, bool isPIC) {
19531 SDLoc dl(GA);
19532
19533 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19534 Value *Ptr = Constant::getNullValue(
19535 PointerType::get(*DAG.getContext(), is64Bit ? X86AS::FS : X86AS::GS));
19536 
19537 SDValue ThreadPointer =
19538 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19539 MachinePointerInfo(Ptr));
19540 
19541 unsigned char OperandFlags = 0;
19542 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19543 // initialexec.
19544 unsigned WrapperKind = X86ISD::Wrapper;
19545 if (model == TLSModel::LocalExec) {
19546 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19547 } else if (model == TLSModel::InitialExec) {
19548 if (is64Bit) {
19549 OperandFlags = X86II::MO_GOTTPOFF;
19550 WrapperKind = X86ISD::WrapperRIP;
19551 } else {
19552 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19553 }
19554 } else {
19555 llvm_unreachable("Unexpected model");
19556 }
19557
19558 // emit "addl x@ntpoff,%eax" (local exec)
19559 // or "addl x@indntpoff,%eax" (initial exec)
19560 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19561 SDValue TGA =
19562 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19563 GA->getOffset(), OperandFlags);
19564 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19565
19566 if (model == TLSModel::InitialExec) {
19567 if (isPIC && !is64Bit) {
19568 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19569 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19570 Offset);
19571 }
19572
19573 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19574 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19575 }
19576
19577 // The address of the thread local variable is the add of the thread
19578 // pointer with the offset of the variable.
19579 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19580}
19581
19582SDValue
19583X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19584
19585 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19586
19587 if (DAG.getTarget().useEmulatedTLS())
19588 return LowerToTLSEmulatedModel(GA, DAG);
19589
19590 const GlobalValue *GV = GA->getGlobal();
19591 EVT PtrVT = Op.getValueType();
19592 bool PositionIndependent = isPositionIndependent();
19593
19594 if (Subtarget.isTargetELF()) {
19595 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19596 switch (model) {
19597 case TLSModel::GeneralDynamic:
19598 if (Subtarget.is64Bit()) {
19599 if (Subtarget.isTarget64BitLP64())
19600 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19601 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19602 }
19603 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19604 case TLSModel::LocalDynamic:
19605 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19606 Subtarget.isTarget64BitLP64());
19607 case TLSModel::InitialExec:
19608 case TLSModel::LocalExec:
19609 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19610 PositionIndependent);
19611 }
19612 llvm_unreachable("Unknown TLS model.");
19613 }
19614
19615 if (Subtarget.isTargetDarwin()) {
19616 // Darwin only has one model of TLS. Lower to that.
19617 unsigned char OpFlag = 0;
19618 unsigned WrapperKind = 0;
19619
19620 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19621 // global base reg.
19622 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19623 if (PIC32) {
19624 OpFlag = X86II::MO_TLVP_PIC_BASE;
19625 WrapperKind = X86ISD::Wrapper;
19626 } else {
19627 OpFlag = X86II::MO_TLVP;
19628 WrapperKind = X86ISD::WrapperRIP;
19629 }
19630 SDLoc DL(Op);
19631 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19632 GA->getValueType(0),
19633 GA->getOffset(), OpFlag);
19634 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19635
19636 // With PIC32, the address is actually $g + Offset.
19637 if (PIC32)
19638 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19639 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19640 Offset);
19641
19642 // Lowering the machine isd will make sure everything is in the right
19643 // location.
19644 SDValue Chain = DAG.getEntryNode();
19645 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19646 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19647 SDValue Args[] = { Chain, Offset };
19648 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19649 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19650
19651 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
19652 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19653 MFI.setAdjustsStack(true);
19654
19655 // And our return value (tls address) is in the standard call return value
19656 // location.
19657 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19658 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19659 }
19660
19661 if (Subtarget.isOSWindows()) {
19662 // Just use the implicit TLS architecture
19663 // Need to generate something similar to:
19664 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19665 // ; from TEB
19666 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19667 // mov rcx, qword [rdx+rcx*8]
19668 // mov eax, .tls$:tlsvar
19669 // [rax+rcx] contains the address
19670 // Windows 64bit: gs:0x58
19671 // Windows 32bit: fs:__tls_array
19672
19673 SDLoc dl(GA);
19674 SDValue Chain = DAG.getEntryNode();
19675
19676 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19677 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19678 // use its literal value of 0x2C.
19679 Value *Ptr = Constant::getNullValue(
19680 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
19681 : PointerType::get(*DAG.getContext(), X86AS::FS));
19682 
19683 SDValue TlsArray = Subtarget.is64Bit()
19684 ? DAG.getIntPtrConstant(0x58, dl)
19685 : (Subtarget.isTargetWindowsGNU()
19686 ? DAG.getIntPtrConstant(0x2C, dl)
19687 : DAG.getExternalSymbol("_tls_array", PtrVT));
19688
19690 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19691
19692 SDValue res;
19693 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19694 res = ThreadPointer;
19695 } else {
19696 // Load the _tls_index variable
19697 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19698 if (Subtarget.is64Bit())
19699 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19700 MachinePointerInfo(), MVT::i32);
19701 else
19702 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19703
19704 const DataLayout &DL = DAG.getDataLayout();
19705 SDValue Scale =
19706 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19707 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19708
19709 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19710 }
19711
19712 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19713
19714 // Get the offset of start of .tls section
19715 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19716 GA->getValueType(0),
19717 GA->getOffset(), X86II::MO_SECREL);
19718 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19719
19720 // The address of the thread local variable is the add of the thread
19721 // pointer with the offset of the variable.
19722 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19723 }
19724
19725 llvm_unreachable("TLS not implemented for this target.");
19726}
19727
19728 bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
19729 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19730 const TargetMachine &TM = getTargetMachine();
19731 TLSModel::Model Model = TM.getTLSModel(&GV);
19732 switch (Model) {
19733 case TLSModel::LocalExec:
19734 case TLSModel::InitialExec:
19735 // We can include the %fs segment register in addressing modes.
19736 return true;
19737 case TLSModel::GeneralDynamic:
19738 case TLSModel::LocalDynamic:
19739 // These models do not result in %fs relative addresses unless
19740 // TLS descriptors are used.
19741 //
19742 // Even in the case of TLS descriptors we currently have no way to model
19743 // the difference between %fs access and the computations needed for the
19744 // offset and returning `true` for TLS-desc currently duplicates both
19745 // which is detrimental :-/
19746 return false;
19747 }
19748 }
19749 return false;
19750}
19751
19752/// Lower SRA_PARTS and friends, which return two i32 values
19753/// and take a 2 x i32 value to shift plus a shift amount.
19754/// TODO: Can this be moved to general expansion code?
19755 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19756 SDValue Lo, Hi;
19757 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19758 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19759}
19760
19761// Try to use a packed vector operation to handle i64 on 32-bit targets when
19762// AVX512DQ is enabled.
19763 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
19764 SelectionDAG &DAG,
19765 const X86Subtarget &Subtarget) {
19766 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19767 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19768 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19769 Op.getOpcode() == ISD::UINT_TO_FP) &&
19770 "Unexpected opcode!");
19771 bool IsStrict = Op->isStrictFPOpcode();
19772 unsigned OpNo = IsStrict ? 1 : 0;
19773 SDValue Src = Op.getOperand(OpNo);
19774 MVT SrcVT = Src.getSimpleValueType();
19775 MVT VT = Op.getSimpleValueType();
19776
19777 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19778 (VT != MVT::f32 && VT != MVT::f64))
19779 return SDValue();
19780
19781 // Pack the i64 into a vector, do the operation and extract.
19782
19783 // Using 256-bit to ensure result is 128-bits for f32 case.
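// For example, with AVX512DQ+VLX an i64 source is placed in a v4i64, converted
// (typically matching vcvtqq2ps for the f32 case), and element 0 of the result
// vector is extracted; without VLX the same is done at 512-bit width.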
19784 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19785 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19786 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19787
19788 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19789 if (IsStrict) {
19790 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19791 {Op.getOperand(0), InVec});
19792 SDValue Chain = CvtVec.getValue(1);
19793 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19794 DAG.getVectorIdxConstant(0, dl));
19795 return DAG.getMergeValues({Value, Chain}, dl);
19796 }
19797
19798 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19799
19800 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19801 DAG.getVectorIdxConstant(0, dl));
19802}
19803
19804// Try to use a packed vector operation to handle i64 on 32-bit targets.
19805 static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19806 const X86Subtarget &Subtarget) {
19807 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19808 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19809 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19810 Op.getOpcode() == ISD::UINT_TO_FP) &&
19811 "Unexpected opcode!");
19812 bool IsStrict = Op->isStrictFPOpcode();
19813 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19814 MVT SrcVT = Src.getSimpleValueType();
19815 MVT VT = Op.getSimpleValueType();
19816
19817 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19818 return SDValue();
19819
19820 // Pack the i64 into a vector, do the operation and extract.
19821
19822 assert(Subtarget.hasFP16() && "Expected FP16");
19823
19824 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19825 if (IsStrict) {
19826 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19827 {Op.getOperand(0), InVec});
19828 SDValue Chain = CvtVec.getValue(1);
19829 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19830 DAG.getVectorIdxConstant(0, dl));
19831 return DAG.getMergeValues({Value, Chain}, dl);
19832 }
19833
19834 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19835
19836 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19837 DAG.getVectorIdxConstant(0, dl));
19838}
19839
19840static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19841 const X86Subtarget &Subtarget) {
19842 switch (Opcode) {
19843 case ISD::SINT_TO_FP:
19844 // TODO: Handle wider types with AVX/AVX512.
19845 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19846 return false;
19847 // CVTDQ2PS or (V)CVTDQ2PD
19848 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19849
19850 case ISD::UINT_TO_FP:
19851 // TODO: Handle wider types and i64 elements.
19852 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19853 return false;
19854 // VCVTUDQ2PS or VCVTUDQ2PD
19855 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19856
19857 default:
19858 return false;
19859 }
19860}
19861
19862/// Given a scalar cast operation that is extracted from a vector, try to
19863/// vectorize the cast op followed by extraction. This will avoid an expensive
19864/// round-trip between XMM and GPR.
19865 static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19866 SelectionDAG &DAG,
19867 const X86Subtarget &Subtarget) {
19868 // TODO: This could be enhanced to handle smaller integer types by peeking
19869 // through an extend.
19870 SDValue Extract = Cast.getOperand(0);
19871 MVT DestVT = Cast.getSimpleValueType();
19872 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19873 !isa<ConstantSDNode>(Extract.getOperand(1)))
19874 return SDValue();
19875
19876 // See if we have a 128-bit vector cast op for this type of cast.
19877 SDValue VecOp = Extract.getOperand(0);
19878 MVT FromVT = VecOp.getSimpleValueType();
19879 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19880 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19881 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19882 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19883 return SDValue();
19884
19885 // If we are extracting from a non-zero element, first shuffle the source
19886 // vector to allow extracting from element zero.
19887 if (!isNullConstant(Extract.getOperand(1))) {
19888 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19889 Mask[0] = Extract.getConstantOperandVal(1);
19890 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19891 }
19892 // If the source vector is wider than 128-bits, extract the low part. Do not
19893 // create an unnecessarily wide vector cast op.
19894 if (FromVT != Vec128VT)
19895 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19896
19897 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19898 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19899 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19900 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19901 DAG.getVectorIdxConstant(0, DL));
19902}
19903
19904/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19905/// try to vectorize the cast ops. This will avoid an expensive round-trip
19906/// between XMM and GPR.
19907static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19908 SelectionDAG &DAG,
19909 const X86Subtarget &Subtarget) {
19910 // TODO: Allow FP_TO_UINT.
19911 SDValue CastToInt = CastToFP.getOperand(0);
19912 MVT VT = CastToFP.getSimpleValueType();
19913 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19914 return SDValue();
19915
19916 MVT IntVT = CastToInt.getSimpleValueType();
19917 SDValue X = CastToInt.getOperand(0);
19918 MVT SrcVT = X.getSimpleValueType();
19919 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19920 return SDValue();
19921
19922 // See if we have 128-bit vector cast instructions for this type of cast.
19923 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19924 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19925 IntVT != MVT::i32)
19926 return SDValue();
19927
19928 unsigned SrcSize = SrcVT.getSizeInBits();
19929 unsigned IntSize = IntVT.getSizeInBits();
19930 unsigned VTSize = VT.getSizeInBits();
19931 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19932 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19933 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19934
19935 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19936 unsigned ToIntOpcode =
19937 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19938 unsigned ToFPOpcode =
19939 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19940
19941 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19942 //
19943 // We are not defining the high elements (by zeroing them, for example) because
19944 // that could nullify any performance advantage that we hoped to gain from
19945 // this vector op hack. We do not expect any adverse effects (like denorm
19946 // penalties) with cast ops.
19947 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19948 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19949 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19950 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19951 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19952}
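// Illustrative sketch of the transform above (instruction names follow the
// cvttpd2dq/cvtdq2pd pairs named in the comments; this is not verbatim output):
//   double r = (double)(int)x;   // x in %xmm0
// becomes, entirely within the XMM domain:
//   cvttpd2dq %xmm0, %xmm0       // v2f64 -> v4i32 (X86ISD::CVTTP2SI)
//   cvtdq2pd  %xmm0, %xmm0       // v4i32 -> v2f64 (X86ISD::CVTSI2P)
// followed by an extract of element 0, avoiding a GPR round-trip.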
19953
19954static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19955 SelectionDAG &DAG,
19956 const X86Subtarget &Subtarget) {
19957 bool IsStrict = Op->isStrictFPOpcode();
19958 MVT VT = Op->getSimpleValueType(0);
19959 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19960
19961 if (Subtarget.hasDQI()) {
19962 assert(!Subtarget.hasVLX() && "Unexpected features");
19963
19964 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19965 Src.getSimpleValueType() == MVT::v4i64) &&
19966 "Unsupported custom type");
19967
19968 // With AVX512DQ but not VLX, we need to widen to get a 512-bit result type.
19969 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19970 "Unexpected VT!");
19971 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19972
19973 // Need to concat with zero vector for strict fp to avoid spurious
19974 // exceptions.
19975 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19976 : DAG.getUNDEF(MVT::v8i64);
19977 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19978 DAG.getVectorIdxConstant(0, DL));
19979 SDValue Res, Chain;
19980 if (IsStrict) {
19981 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19982 {Op->getOperand(0), Src});
19983 Chain = Res.getValue(1);
19984 } else {
19985 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19986 }
19987
19988 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19989 DAG.getVectorIdxConstant(0, DL));
19990
19991 if (IsStrict)
19992 return DAG.getMergeValues({Res, Chain}, DL);
19993 return Res;
19994 }
19995
19996 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19997 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19998 if (VT != MVT::v4f32 || IsSigned)
19999 return SDValue();
20000
20001 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20002 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
20003 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20004 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20005 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20006 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20007 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
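 // Added explanation (not in the original source): for lanes with the MSB set,
 // Sign = (Src >> 1) | (Src & 1) halves the value with a round-to-odd sticky
 // bit so the signed per-element conversion below stays in range; the FADD of
 // the converted value with itself (the "Slow" path) doubles it back, and the
 // sticky bit keeps the final f32 rounding correct.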
20008 SmallVector<SDValue, 4> SignCvts(4);
20009 SmallVector<SDValue, 4> Chains(4);
20010 for (int i = 0; i != 4; ++i) {
20011 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20012 DAG.getVectorIdxConstant(i, DL));
20013 if (IsStrict) {
20014 SignCvts[i] =
20015 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20016 {Op.getOperand(0), Elt});
20017 Chains[i] = SignCvts[i].getValue(1);
20018 } else {
20019 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20020 }
20021 }
20022 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20023
20024 SDValue Slow, Chain;
20025 if (IsStrict) {
20026 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20027 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20028 {Chain, SignCvt, SignCvt});
20029 Chain = Slow.getValue(1);
20030 } else {
20031 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20032 }
20033
20034 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20035 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20036
20037 if (IsStrict)
20038 return DAG.getMergeValues({Cvt, Chain}, DL);
20039
20040 return Cvt;
20041}
20042
20043static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
20044 SelectionDAG &DAG) {
20045 bool IsStrict = Op->isStrictFPOpcode();
20046 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20047 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20048 MVT VT = Op.getSimpleValueType();
20049 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20050
20051 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
20052 if (IsStrict)
20053 return DAG.getNode(
20054 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
20055 {Chain,
20056 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20057 Rnd});
20058 return DAG.getNode(ISD::FP_ROUND, dl, VT,
20059 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
20060}
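// For example (illustrative only): with soft f16, "uint_to_fp i32 %x to half"
// is emitted here as fp_round(uint_to_fp i32 %x to float), and the strict
// variants thread the chain through both nodes.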
20061
20062static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
20063 const X86Subtarget &Subtarget) {
20064 if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20065 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
20066 return true;
20067 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
20068 return true;
20069 }
20070 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20071 return true;
20072 if (Subtarget.useAVX512Regs()) {
20073 if (VT == MVT::v16i32)
20074 return true;
20075 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20076 return true;
20077 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20078 return true;
20079 }
20080 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20081 (VT == MVT::v2i64 || VT == MVT::v4i64))
20082 return true;
20083 return false;
20084}
20085
20086SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20087 SelectionDAG &DAG) const {
20088 bool IsStrict = Op->isStrictFPOpcode();
20089 unsigned OpNo = IsStrict ? 1 : 0;
20090 SDValue Src = Op.getOperand(OpNo);
20091 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20092 MVT SrcVT = Src.getSimpleValueType();
20093 MVT VT = Op.getSimpleValueType();
20094 SDLoc dl(Op);
20095
20096 if (isSoftF16(VT, Subtarget))
20097 return promoteXINT_TO_FP(Op, dl, DAG);
20098 else if (isLegalConversion(SrcVT, VT, true, Subtarget))
20099 return Op;
20100
20101 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20102 return LowerWin64_INT128_TO_FP(Op, DAG);
20103
20104 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20105 return Extract;
20106
20107 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20108 return R;
20109
20110 if (SrcVT.isVector()) {
20111 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20112 // Note: Since v2f64 is a legal type, we don't need to zero extend the
20113 // source for strict FP.
20114 if (IsStrict)
20115 return DAG.getNode(
20116 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20117 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20118 DAG.getUNDEF(SrcVT))});
20119 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20120 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20121 DAG.getUNDEF(SrcVT)));
20122 }
20123 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20124 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20125
20126 return SDValue();
20127 }
20128
20129 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20130 "Unknown SINT_TO_FP to lower!");
20131
20132 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20133
20134 // These are really Legal; return the operand so the caller accepts it as
20135 // Legal.
20136 if (SrcVT == MVT::i32 && UseSSEReg)
20137 return Op;
20138 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20139 return Op;
20140
20141 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20142 return V;
20143 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20144 return V;
20145
20146 // SSE doesn't have an i16 conversion so we need to promote.
20147 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20148 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20149 if (IsStrict)
20150 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20151 {Chain, Ext});
20152
20153 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20154 }
20155
20156 if (VT == MVT::f128 || !Subtarget.hasX87())
20157 return SDValue();
20158
20159 SDValue ValueToStore = Src;
20160 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20161 // Bitcasting to f64 here allows us to do a single 64-bit store from
20162 // an SSE register, avoiding the store forwarding penalty that would come
20163 // with two 32-bit stores.
20164 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20165
20166 unsigned Size = SrcVT.getStoreSize();
20167 Align Alignment(Size);
20168 MachineFunction &MF = DAG.getMachineFunction();
20169 auto PtrVT = getPointerTy(MF.getDataLayout());
20170 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20171 MachinePointerInfo MPI =
20172 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20173 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20174 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20175 std::pair<SDValue, SDValue> Tmp =
20176 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20177
20178 if (IsStrict)
20179 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20180
20181 return Tmp.first;
20182}
20183
20184std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20185 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20186 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20187 // Build the FILD
20188 SDVTList Tys;
20189 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20190 if (useSSE)
20191 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20192 else
20193 Tys = DAG.getVTList(DstVT, MVT::Other);
20194
20195 SDValue FILDOps[] = {Chain, Pointer};
20196 SDValue Result =
20197 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20198 Alignment, MachineMemOperand::MOLoad);
20199 Chain = Result.getValue(1);
20200
20201 if (useSSE) {
20202 MachineFunction &MF = DAG.getMachineFunction();
20203 unsigned SSFISize = DstVT.getStoreSize();
20204 int SSFI =
20205 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20206 auto PtrVT = getPointerTy(MF.getDataLayout());
20207 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20208 Tys = DAG.getVTList(MVT::Other);
20209 SDValue FSTOps[] = {Chain, Result, StackSlot};
20210 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
20211 MachinePointerInfo::getFixedStack(MF, SSFI),
20212 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20213
20214 Chain =
20215 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20216 Result = DAG.getLoad(
20217 DstVT, DL, Chain, StackSlot,
20218 MachinePointerInfo::getFixedStack(MF, SSFI));
20219 Chain = Result.getValue(1);
20220 }
20221
20222 return { Result, Chain };
20223}
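// Rough shape of the result (added note): FILD loads the integer from the
// stack slot and converts it on the x87 stack; when the destination lives in
// an SSE register the value takes an extra FST/load round-trip through a
// second stack slot, since there is no direct x87 -> XMM register move.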
20224
20225/// Horizontal vector math instructions may be slower than normal math with
20226/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20227/// implementation, and likely shuffle complexity of the alternate sequence.
20228static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20229 const X86Subtarget &Subtarget) {
20230 bool IsOptimizingSize = DAG.shouldOptForSize();
20231 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20232 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20233}
20234
20235/// 64-bit unsigned integer to double expansion.
20236static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
20237 SelectionDAG &DAG,
20238 const X86Subtarget &Subtarget) {
20239 // We can't use this algorithm for strict fp: it produces -0.0 instead of +0.0
20240 // when converting 0 while rounding toward negative infinity. The caller will
20241 // fall back to Expand when i64 is legal, or will use FILD in 32-bit mode.
20242 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20243 // This algorithm is not obvious. Here it is what we're trying to output:
20244 /*
20245 movq %rax, %xmm0
20246 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20247 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20248 #ifdef __SSE3__
20249 haddpd %xmm0, %xmm0
20250 #else
20251 pshufd $0x4e, %xmm0, %xmm1
20252 addpd %xmm1, %xmm0
20253 #endif
20254 */
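// Added explanation of the magic constants (not part of the original comment):
// after the punpckldq, the two doubles in %xmm0 are
//   d0 = bits[0x43300000 : lo32(x)] = 2^52 + lo32(x)
//   d1 = bits[0x45300000 : hi32(x)] = 2^84 + hi32(x) * 2^32
// Subtracting c1 = { 2^52, 2^84 } is exact and leaves { lo32(x), hi32(x)*2^32 },
// so the horizontal add yields lo32(x) + hi32(x)*2^32, i.e. (double)x with a
// single final rounding.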
20255
20256 LLVMContext *Context = DAG.getContext();
20257
20258 // Build some magic constants.
20259 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20260 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20261 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20262 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20263
20264 SmallVector<Constant *, 2> CV1;
20265 CV1.push_back(
20266 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20267 APInt(64, 0x4330000000000000ULL))));
20268 CV1.push_back(
20269 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20270 APInt(64, 0x4530000000000000ULL))));
20271 Constant *C1 = ConstantVector::get(CV1);
20272 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20273
20274 // Load the 64-bit value into an XMM register.
20275 SDValue XR1 =
20276 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20277 SDValue CLod0 = DAG.getLoad(
20278 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20279 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20280 SDValue Unpck1 =
20281 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20282
20283 SDValue CLod1 = DAG.getLoad(
20284 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20285 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20286 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20287 // TODO: Are there any fast-math-flags to propagate here?
20288 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20289 SDValue Result;
20290
20291 if (Subtarget.hasSSE3() &&
20292 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20293 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20294 } else {
20295 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20296 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20297 }
20298 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20299 DAG.getVectorIdxConstant(0, dl));
20300 return Result;
20301}
20302
20303/// 32-bit unsigned integer to float expansion.
20304static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
20305 SelectionDAG &DAG,
20306 const X86Subtarget &Subtarget) {
20307 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20308 // FP constant to bias correct the final result.
20309 SDValue Bias = DAG.getConstantFP(
20310 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
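// Added explanation: 0x4330000000000000 is 2^52 as a double. OR'ing the 32-bit
// input into the low mantissa bits below yields the double value 2^52 + x
// exactly, so subtracting the bias leaves x with no rounding error.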
20311
20312 // Load the 32-bit value into an XMM register.
20313 SDValue Load =
20314 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20315
20316 // Zero out the upper parts of the register.
20317 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20318
20319 // Or the load with the bias.
20320 SDValue Or = DAG.getNode(
20321 ISD::OR, dl, MVT::v2i64,
20322 DAG.getBitcast(MVT::v2i64, Load),
20323 DAG.getBitcast(MVT::v2i64,
20324 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20325 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20326 DAG.getBitcast(MVT::v2f64, Or),
20327 DAG.getVectorIdxConstant(0, dl));
20328
20329 if (Op.getNode()->isStrictFPOpcode()) {
20330 // Subtract the bias.
20331 // TODO: Are there any fast-math-flags to propagate here?
20332 SDValue Chain = Op.getOperand(0);
20333 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20334 {Chain, Or, Bias});
20335
20336 if (Op.getValueType() == Sub.getValueType())
20337 return Sub;
20338
20339 // Handle final rounding.
20340 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20341 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20342
20343 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20344 }
20345
20346 // Subtract the bias.
20347 // TODO: Are there any fast-math-flags to propagate here?
20348 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20349
20350 // Handle final rounding.
20351 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20352}
20353
20354static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
20355 SelectionDAG &DAG,
20356 const X86Subtarget &Subtarget) {
20357 if (Op.getSimpleValueType() != MVT::v2f64)
20358 return SDValue();
20359
20360 bool IsStrict = Op->isStrictFPOpcode();
20361
20362 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20363 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20364
20365 if (Subtarget.hasAVX512()) {
20366 if (!Subtarget.hasVLX()) {
20367 // Let generic type legalization widen this.
20368 if (!IsStrict)
20369 return SDValue();
20370 // Otherwise pad the integer input with 0s and widen the operation.
20371 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20372 DAG.getConstant(0, DL, MVT::v2i32));
20373 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20374 {Op.getOperand(0), N0});
20375 SDValue Chain = Res.getValue(1);
20376 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20377 DAG.getVectorIdxConstant(0, DL));
20378 return DAG.getMergeValues({Res, Chain}, DL);
20379 }
20380
20381 // Legalize to v4i32 type.
20382 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20383 DAG.getUNDEF(MVT::v2i32));
20384 if (IsStrict)
20385 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20386 {Op.getOperand(0), N0});
20387 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20388 }
20389
20390 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20391 // This gives us the floating point equivalent of 2^52 + the i32 integer
20392 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20393 // point leaving just our i32 integers in double format.
20394 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20395 SDValue VBias = DAG.getConstantFP(
20396 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20397 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20398 DAG.getBitcast(MVT::v2i64, VBias));
20399 Or = DAG.getBitcast(MVT::v2f64, Or);
20400
20401 if (IsStrict)
20402 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20403 {Op.getOperand(0), Or, VBias});
20404 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20405}
20406
20407static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
20408 SelectionDAG &DAG,
20409 const X86Subtarget &Subtarget) {
20410 bool IsStrict = Op->isStrictFPOpcode();
20411 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20412 MVT VecIntVT = V.getSimpleValueType();
20413 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20414 "Unsupported custom type");
20415
20416 if (Subtarget.hasAVX512()) {
20417 // With AVX512 but not VLX, we need to widen to get a 512-bit result type.
20418 assert(!Subtarget.hasVLX() && "Unexpected features");
20419 MVT VT = Op->getSimpleValueType(0);
20420
20421 // v8i32->v8f64 is legal with AVX512 so just return it.
20422 if (VT == MVT::v8f64)
20423 return Op;
20424
20425 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
20426 VT == MVT::v8f16) &&
20427 "Unexpected VT!");
20428 MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
20429 MVT WideIntVT = MVT::v16i32;
20430 if (VT == MVT::v4f64) {
20431 WideVT = MVT::v8f64;
20432 WideIntVT = MVT::v8i32;
20433 }
20434
20435 // Need to concat with zero vector for strict fp to avoid spurious
20436 // exceptions.
20437 SDValue Tmp =
20438 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20439 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20440 DAG.getVectorIdxConstant(0, DL));
20441 SDValue Res, Chain;
20442 if (IsStrict) {
20443 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20444 {Op->getOperand(0), V});
20445 Chain = Res.getValue(1);
20446 } else {
20447 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20448 }
20449
20450 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20451 DAG.getVectorIdxConstant(0, DL));
20452
20453 if (IsStrict)
20454 return DAG.getMergeValues({Res, Chain}, DL);
20455 return Res;
20456 }
20457
20458 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20459 Op->getSimpleValueType(0) == MVT::v4f64) {
20460 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20461 Constant *Bias = ConstantFP::get(
20462 *DAG.getContext(),
20463 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20464 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20465 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20466 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20467 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20468 SDValue VBias = DAG.getMemIntrinsicNode(
20469 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20470 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20471 MachineMemOperand::MOLoad);
20472
20473 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20474 DAG.getBitcast(MVT::v4i64, VBias));
20475 Or = DAG.getBitcast(MVT::v4f64, Or);
20476
20477 if (IsStrict)
20478 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20479 {Op.getOperand(0), Or, VBias});
20480 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20481 }
20482
20483 // The algorithm is the following:
20484 // #ifdef __SSE4_1__
20485 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20486 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20487 // (uint4) 0x53000000, 0xaa);
20488 // #else
20489 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20490 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20491 // #endif
20492 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20493 // return (float4) lo + fhi;
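 // Added explanation of the constants (not in the original comment):
 //   lo = bits[0x4b000000 | (v & 0xffff)] = 2^23 + (v & 0xffff) as a float
 //   hi = bits[0x53000000 | (v >> 16)] = 2^39 + (v >> 16) * 2^16 as a float
 // and 0x1.0p39f + 0x1.0p23f (bit pattern 0x53000080) cancels both biases:
 //   lo + (hi - (2^39 + 2^23)) = (v & 0xffff) + (v >> 16) * 2^16 = v,
 // with only the final addition introducing a rounding step.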
20494
20495 bool Is128 = VecIntVT == MVT::v4i32;
20496 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20497 // If we convert to something else than the supported type, e.g., to v4f64,
20498 // abort early.
20499 if (VecFloatVT != Op->getSimpleValueType(0))
20500 return SDValue();
20501
20502 // In the #ifdef/#else code, we have in common:
20503 // - The vector of constants:
20504 // -- 0x4b000000
20505 // -- 0x53000000
20506 // - A shift:
20507 // -- v >> 16
20508
20509 // Create the splat vector for 0x4b000000.
20510 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20511 // Create the splat vector for 0x53000000.
20512 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20513
20514 // Create the right shift.
20515 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20516 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20517
20518 SDValue Low, High;
20519 if (Subtarget.hasSSE41()) {
20520 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20521 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20522 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20523 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20524 // Low will be bitcasted right away, so do not bother bitcasting back to its
20525 // original type.
20526 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20527 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20528 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20529 // (uint4) 0x53000000, 0xaa);
20530 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20531 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20532 // High will be bitcasted right away, so do not bother bitcasting back to
20533 // its original type.
20534 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20535 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20536 } else {
20537 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20538 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20539 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20540 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20541
20542 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20543 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20544 }
20545
20546 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20547 SDValue VecCstFSub = DAG.getConstantFP(
20548 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20549
20550 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20551 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20552 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20553 // enabled. See PR24512.
20554 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20555 // TODO: Are there any fast-math-flags to propagate here?
20556 // (float4) lo;
20557 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20558 // return (float4) lo + fhi;
20559 if (IsStrict) {
20560 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20561 {Op.getOperand(0), HighBitcast, VecCstFSub});
20562 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20563 {FHigh.getValue(1), LowBitcast, FHigh});
20564 }
20565
20566 SDValue FHigh =
20567 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20568 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20569}
20570
20571static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20572 const X86Subtarget &Subtarget) {
20573 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20574 SDValue N0 = Op.getOperand(OpNo);
20575 MVT SrcVT = N0.getSimpleValueType();
20576
20577 switch (SrcVT.SimpleTy) {
20578 default:
20579 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20580 case MVT::v2i32:
20581 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20582 case MVT::v4i32:
20583 case MVT::v8i32:
20584 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20585 case MVT::v2i64:
20586 case MVT::v4i64:
20587 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20588 }
20589}
20590
20591SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20592 SelectionDAG &DAG) const {
20593 bool IsStrict = Op->isStrictFPOpcode();
20594 unsigned OpNo = IsStrict ? 1 : 0;
20595 SDValue Src = Op.getOperand(OpNo);
20596 SDLoc dl(Op);
20597 auto PtrVT = getPointerTy(DAG.getDataLayout());
20598 MVT SrcVT = Src.getSimpleValueType();
20599 MVT DstVT = Op->getSimpleValueType(0);
20600 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20601
20602 // Bail out when we don't have native conversion instructions.
20603 if (DstVT == MVT::f128)
20604 return SDValue();
20605
20606 if (isSoftF16(DstVT, Subtarget))
20607 return promoteXINT_TO_FP(Op, dl, DAG);
20608 else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
20609 return Op;
20610
20611 if (DstVT.isVector())
20612 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20613
20614 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20615 return LowerWin64_INT128_TO_FP(Op, DAG);
20616
20617 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20618 return Extract;
20619
20620 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20621 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20622 // Conversions from unsigned i32 to f32/f64 are legal,
20623 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20624 return Op;
20625 }
20626
20627 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20628 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20629 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20630 if (IsStrict)
20631 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20632 {Chain, Src});
20633 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20634 }
20635
20636 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20637 return V;
20638 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20639 return V;
20640
20641 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20642 // infinity. It produces -0.0, so disable under strictfp.
20643 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20644 !IsStrict)
20645 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20646 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20647 // negative infinity, so disable it under strictfp and use FILD instead.
20648 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20649 !IsStrict)
20650 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20651 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20652 (DstVT == MVT::f32 || DstVT == MVT::f64))
20653 return SDValue();
20654
20655 // Make a 64-bit buffer, and use it to build an FILD.
20656 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20657 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20658 Align SlotAlign(8);
20659 MachinePointerInfo MPI =
20660 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20661 if (SrcVT == MVT::i32) {
20662 SDValue OffsetSlot =
20663 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
20664 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20665 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20666 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20667 std::pair<SDValue, SDValue> Tmp =
20668 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20669 if (IsStrict)
20670 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20671
20672 return Tmp.first;
20673 }
20674
20675 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20676 SDValue ValueToStore = Src;
20677 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20678 // Bitcasting to f64 here allows us to do a single 64-bit store from
20679 // an SSE register, avoiding the store forwarding penalty that would come
20680 // with two 32-bit stores.
20681 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20682 }
20683 SDValue Store =
20684 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20685 // For i64 source, we need to add the appropriate power of 2 if the input
20686 // was negative. We must be careful to do the computation in x87 extended
20687 // precision, not in SSE.
20688 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20689 SDValue Ops[] = {Store, StackSlot};
20690 SDValue Fild =
20691 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20692 SlotAlign, MachineMemOperand::MOLoad);
20693 Chain = Fild.getValue(1);
20694
20695 // Check whether the sign bit is set.
20696 SDValue SignSet = DAG.getSetCC(
20697 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20698 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20699
20700 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20701 APInt FF(64, 0x5F80000000000000ULL);
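// Added explanation: the high 32 bits 0x5F800000 are the IEEE-754 single for
// 2^64. FILD interprets the stored u64 as signed, so an input with its sign
// bit set is loaded as (x - 2^64); adding this fudge factor (selected only
// when SignSet is true) restores the unsigned value before the final rounding.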
20702 SDValue FudgePtr =
20703 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20704 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20705
20706 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20707 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20708 SDValue Four = DAG.getIntPtrConstant(4, dl);
20709 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20710 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20711
20712 // Load the value out, extending it from f32 to f80.
20713 SDValue Fudge = DAG.getExtLoad(
20714 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20715 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20716 CPAlignment);
20717 Chain = Fudge.getValue(1);
20718 // Extend everything to 80 bits to force it to be done on x87.
20719 // TODO: Are there any fast-math-flags to propagate here?
20720 if (IsStrict) {
20721 unsigned Opc = ISD::STRICT_FADD;
20722 // Windows needs the precision control changed to 80bits around this add.
20723 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20724 Opc = X86ISD::STRICT_FP80_ADD;
20725
20726 SDValue Add =
20727 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
20728 // STRICT_FP_ROUND can't handle equal types.
20729 if (DstVT == MVT::f80)
20730 return Add;
20731 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20732 {Add.getValue(1), Add,
20733 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
20734 }
20735 unsigned Opc = ISD::FADD;
20736 // Windows needs the precision control changed to 80bits around this add.
20737 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20738 Opc = X86ISD::FP80_ADD;
20739
20740 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
20741 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20742 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
20743}
20744
20745// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20746// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20747// just return an SDValue().
20748// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20749// to i16, i32 or i64, and we lower it to a legal sequence and return the
20750// result.
20751SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20752 bool IsSigned,
20753 SDValue &Chain) const {
20754 bool IsStrict = Op->isStrictFPOpcode();
20755 SDLoc DL(Op);
20756
20757 EVT DstTy = Op.getValueType();
20758 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20759 EVT TheVT = Value.getValueType();
20760 auto PtrVT = getPointerTy(DAG.getDataLayout());
20761
20762 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20763 // f16 must be promoted before using the lowering in this routine.
20764 // fp128 does not use this lowering.
20765 return SDValue();
20766 }
20767
20768 // If using FIST to compute an unsigned i64, we'll need some fixup
20769 // to handle values above the maximum signed i64. A FIST is always
20770 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20771 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20772
20773 // FIXME: This does not generate an invalid exception if the input does not
20774 // fit in i32. PR44019
20775 if (!IsSigned && DstTy != MVT::i64) {
20776 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20777 // The low 32 bits of the fist result will have the correct uint32 result.
20778 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20779 DstTy = MVT::i64;
20780 }
20781
20782 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20783 DstTy.getSimpleVT() >= MVT::i16 &&
20784 "Unknown FP_TO_INT to lower!");
20785
20786 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20787 // stack slot.
20788 MachineFunction &MF = DAG.getMachineFunction();
20789 unsigned MemSize = DstTy.getStoreSize();
20790 int SSFI =
20791 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20792 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20793
20794 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20795
20796 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20797
20798 if (UnsignedFixup) {
20799 //
20800 // Conversion to unsigned i64 is implemented with a select,
20801 // depending on whether the source value fits in the range
20802 // of a signed i64. Let Thresh be the FP equivalent of
20803 // 0x8000000000000000ULL.
20804 //
20805 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
20806 // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
20807 // FistSrc = (Value - FltOfs);
20808 // Fist-to-mem64 FistSrc
20809 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20810 // to XOR'ing the high 32 bits with Adjust.
20811 //
20812 // Being a power of 2, Thresh is exactly representable in all FP formats.
20813 // For X87 we'd like to use the smallest FP type for this constant, but
20814 // for DAG type consistency we have to match the FP operand type.
20815
20816 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20817 APFloat::opStatus Status = APFloat::opOK;
20818 bool LosesInfo = false;
20819 if (TheVT == MVT::f64)
20820 // The rounding mode is irrelevant as the conversion should be exact.
20821 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20822 &LosesInfo);
20823 else if (TheVT == MVT::f80)
20824 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20825 APFloat::rmNearestTiesToEven, &LosesInfo);
20826
20827 assert(Status == APFloat::opOK && !LosesInfo &&
20828 "FP conversion should have been exact");
20829
20830 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20831
20832 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20833 *DAG.getContext(), TheVT);
20834 SDValue Cmp;
20835 if (IsStrict) {
20836 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20837 /*IsSignaling*/ true);
20838 Chain = Cmp.getValue(1);
20839 } else {
20840 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20841 }
20842
20843 // Our preferred lowering of
20844 //
20845 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20846 //
20847 // is
20848 //
20849 // (Value >= Thresh) << 63
20850 //
20851 // but since we can get here after LegalOperations, DAGCombine might do the
20852 // wrong thing if we create a select. So, directly create the preferred
20853 // version.
20854 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20855 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20856 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20857
20858 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20859 DAG.getConstantFP(0.0, DL, TheVT));
20860
20861 if (IsStrict) {
20862 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20863 { Chain, Value, FltOfs });
20864 Chain = Value.getValue(1);
20865 } else
20866 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20867 }
20868
20869 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20870
20871 // FIXME This causes a redundant load/store if the SSE-class value is already
20872 // in memory, such as if it is on the callstack.
20873 if (isScalarFPTypeInSSEReg(TheVT)) {
20874 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20875 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20876 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20877 SDValue Ops[] = { Chain, StackSlot };
20878
20879 unsigned FLDSize = TheVT.getStoreSize();
20880 assert(FLDSize <= MemSize && "Stack slot not big enough");
20881 MachineMemOperand *MMO = MF.getMachineMemOperand(
20882 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20883 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20884 Chain = Value.getValue(1);
20885 }
20886
20887 // Build the FP_TO_INT*_IN_MEM
20888 MachineMemOperand *MMO = MF.getMachineMemOperand(
20889 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20890 SDValue Ops[] = { Chain, Value, StackSlot };
20891 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20892 DAG.getVTList(MVT::Other),
20893 Ops, DstTy, MMO);
20894
20895 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20896 Chain = Res.getValue(1);
20897
20898 // If we need an unsigned fixup, XOR the result with adjust.
20899 if (UnsignedFixup)
20900 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20901
20902 return Res;
20903}
20904
20905static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20906 const X86Subtarget &Subtarget) {
20907 MVT VT = Op.getSimpleValueType();
20908 SDValue In = Op.getOperand(0);
20909 MVT InVT = In.getSimpleValueType();
20910 unsigned Opc = Op.getOpcode();
20911
20912 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20914 "Unexpected extension opcode");
20916 "Expected same number of elements");
20917 assert((VT.getVectorElementType() == MVT::i16 ||
20918 VT.getVectorElementType() == MVT::i32 ||
20919 VT.getVectorElementType() == MVT::i64) &&
20920 "Unexpected element type");
20921 assert((InVT.getVectorElementType() == MVT::i8 ||
20922 InVT.getVectorElementType() == MVT::i16 ||
20923 InVT.getVectorElementType() == MVT::i32) &&
20924 "Unexpected element type");
20925
20926 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20927
20928 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20929 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20930 return splitVectorIntUnary(Op, DAG, dl);
20931 }
20932
20933 if (Subtarget.hasInt256())
20934 return Op;
20935
20936 // Optimize vectors in AVX mode:
20937 //
20938 // v8i16 -> v8i32
20939 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20940 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20941 // Concat upper and lower parts.
20942 //
20943 // v4i32 -> v4i64
20944 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20945 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20946 // Concat upper and lower parts.
20947 //
20948 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20949 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20950
20951 // Short-circuit if we can determine that each 128-bit half is the same value.
20952 // Otherwise, this is difficult to match and optimize.
20953 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20954 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20955 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20956
20957 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20958 SDValue Undef = DAG.getUNDEF(InVT);
20959 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20960 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20961 OpHi = DAG.getBitcast(HalfVT, OpHi);
20962
20963 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20964}
20965
20966// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20967static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20968 const SDLoc &dl, SelectionDAG &DAG) {
20969 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20970 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20971 DAG.getVectorIdxConstant(0, dl));
20972 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20973 DAG.getVectorIdxConstant(8, dl));
20974 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20975 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20976 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20977 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20978}
20979
20980static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
20981 const X86Subtarget &Subtarget,
20982 SelectionDAG &DAG) {
20983 MVT VT = Op->getSimpleValueType(0);
20984 SDValue In = Op->getOperand(0);
20985 MVT InVT = In.getSimpleValueType();
20986 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20987 unsigned NumElts = VT.getVectorNumElements();
20988
20989 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
20990 // avoids a constant pool load.
20991 if (VT.getVectorElementType() != MVT::i8) {
20992 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20993 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20994 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20995 }
20996
20997 // Extend VT if BWI is not supported.
20998 MVT ExtVT = VT;
20999 if (!Subtarget.hasBWI()) {
21000 // If v16i32 is to be avoided, we'll need to split and concatenate.
21001 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
21002 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
21003
21004 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
21005 }
21006
21007 // Widen to 512-bits if VLX is not supported.
21008 MVT WideVT = ExtVT;
21009 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
21010 NumElts *= 512 / ExtVT.getSizeInBits();
21011 InVT = MVT::getVectorVT(MVT::i1, NumElts);
21012 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
21013 DAG.getVectorIdxConstant(0, DL));
21014 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
21015 }
21016
21017 SDValue One = DAG.getConstant(1, DL, WideVT);
21018 SDValue Zero = DAG.getConstant(0, DL, WideVT);
21019
21020 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21021
21022 // Truncate if we had to extend above.
21023 if (VT != ExtVT) {
21024 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21025 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21026 }
21027
21028 // Extract back to 128/256-bit if we widened.
21029 if (WideVT != VT)
21030 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21031 DAG.getVectorIdxConstant(0, DL));
21032
21033 return SelectedVal;
21034}
21035
21036static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21037 SelectionDAG &DAG) {
21038 SDValue In = Op.getOperand(0);
21039 MVT SVT = In.getSimpleValueType();
21040 SDLoc DL(Op);
21041
21042 if (SVT.getVectorElementType() == MVT::i1)
21043 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
21044
21045 assert(Subtarget.hasAVX() && "Expected AVX support");
21046 return LowerAVXExtend(Op, DL, DAG, Subtarget);
21047}
21048
21049/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21050/// It makes use of the fact that vectors with enough leading sign/zero bits
21051/// prevent the PACKSS/PACKUS from saturating the results.
21052/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
21053/// within each 128-bit lane.
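/// For example (added illustration): a v8i32 -> v8i8 truncation is lowered as
/// one PACK of the two 128-bit halves (v8i32 -> v8i16) followed by a second,
/// recursive PACK stage from v8i16 down to v8i8 in the low half of an XMM.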
21054static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
21055 const SDLoc &DL, SelectionDAG &DAG,
21056 const X86Subtarget &Subtarget) {
21057 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
21058 "Unexpected PACK opcode");
21059 assert(DstVT.isVector() && "VT not a vector?");
21060
21061 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
21062 if (!Subtarget.hasSSE2())
21063 return SDValue();
21064
21065 EVT SrcVT = In.getValueType();
21066
21067 // No truncation required, we might get here due to recursive calls.
21068 if (SrcVT == DstVT)
21069 return In;
21070
21071 unsigned NumElems = SrcVT.getVectorNumElements();
21072 if (NumElems < 2 || !isPowerOf2_32(NumElems))
21073 return SDValue();
21074
21075 unsigned DstSizeInBits = DstVT.getSizeInBits();
21076 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
21077 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
21078 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
21079
21080 LLVMContext &Ctx = *DAG.getContext();
21081 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
21082 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21083
21084 // Pack to the largest type possible:
21085 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
21086 EVT InVT = MVT::i16, OutVT = MVT::i8;
21087 if (SrcVT.getScalarSizeInBits() > 16 &&
21088 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
21089 InVT = MVT::i32;
21090 OutVT = MVT::i16;
21091 }
21092
21093 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
21094 // On pre-AVX512, pack the src in both halves to help value tracking.
21095 if (SrcSizeInBits <= 128) {
21096 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21097 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21098 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
21099 SDValue LHS = DAG.getBitcast(InVT, In);
21100 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
21101 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
21102 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
21103 Res = DAG.getBitcast(PackedVT, Res);
21104 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21105 }
21106
21107 // Split lower/upper subvectors.
21108 SDValue Lo, Hi;
21109 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21110
21111 // If Hi is undef, then don't bother packing it and widen the result instead.
21112 if (Hi.isUndef()) {
21113 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
21114 if (SDValue Res =
21115 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
21116 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
21117 }
21118
21119 unsigned SubSizeInBits = SrcSizeInBits / 2;
21120 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21121 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21122
21123 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21124 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21125 Lo = DAG.getBitcast(InVT, Lo);
21126 Hi = DAG.getBitcast(InVT, Hi);
21127 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21128 return DAG.getBitcast(DstVT, Res);
21129 }
21130
21131 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21132 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21133 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21134 Lo = DAG.getBitcast(InVT, Lo);
21135 Hi = DAG.getBitcast(InVT, Hi);
21136 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21137
21138 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21139 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21140 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21141 SmallVector<int, 64> Mask;
21142 int Scale = 64 / OutVT.getScalarSizeInBits();
21143 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21144 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21145
21146 if (DstVT.is256BitVector())
21147 return DAG.getBitcast(DstVT, Res);
21148
21149 // If 512bit -> 128bit truncate another stage.
21150 Res = DAG.getBitcast(PackedVT, Res);
21151 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21152 }
21153
21154 // Recursively pack lower/upper subvectors, concat result and pack again.
21155 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21156
21157 if (PackedVT.is128BitVector()) {
21158 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
21159 // type legalization.
21160 SDValue Res =
21161 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
21162 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21163 }
21164
21165 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21166 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
21167 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
21168 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21169 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21170}
21171
21172/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
21173/// e.g. trunc <8 x i32> X to <8 x i16> -->
21174/// MaskX = X & 0xffff (clear high bits to prevent saturation)
21175/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
21176static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
21177 const X86Subtarget &Subtarget,
21178 SelectionDAG &DAG) {
21179 In = DAG.getZeroExtendInReg(In, DL, DstVT);
21180 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
21181}
21182
21183/// Truncate using inreg sign extension and X86ISD::PACKSS.
21184static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
21185 const X86Subtarget &Subtarget,
21186 SelectionDAG &DAG) {
21187 EVT SrcVT = In.getValueType();
21188 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
21189 DAG.getValueType(DstVT));
21190 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
21191}
21192
21193/// Helper to determine if \p In truncated to \p DstVT has the necessary
21194/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
21195/// possibly by converting a SRL node to SRA for sign extension.
21196static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
21197 SDValue In, const SDLoc &DL,
21198 SelectionDAG &DAG,
21199 const X86Subtarget &Subtarget,
21200 const SDNodeFlags Flags = SDNodeFlags()) {
21201 // Requires SSE2.
21202 if (!Subtarget.hasSSE2())
21203 return SDValue();
21204
21205 EVT SrcVT = In.getValueType();
21206 EVT DstSVT = DstVT.getVectorElementType();
21207 EVT SrcSVT = SrcVT.getVectorElementType();
21208 unsigned NumDstEltBits = DstSVT.getSizeInBits();
21209 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
21210
21211 // Check we have a truncation suited for PACKSS/PACKUS.
21212 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21213 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21214 return SDValue();
21215
21216 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
21217 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
21218
21219 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
21220 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
21221 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
21222 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
21223 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
21224 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
21225 return SDValue();
21226
21227 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
21228 // split this for packing.
21229 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
21230 !isFreeToSplitVector(In, DAG) &&
21231 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
21232 return SDValue();
21233
21234 // Don't truncate AVX512 targets as multiple PACK nodes stages.
21235 if (Subtarget.hasAVX512() && NumStages > 1)
21236 return SDValue();
21237
21238 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
21239 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21240
21241 // Truncate with PACKUS if we are truncating a vector with leading zero
21242 // bits that extend all the way to the packed/truncated value.
21243 // e.g. Masks, zext_in_reg, etc.
21244 // Pre-SSE41 we can only use PACKUSWB.
21245 KnownBits Known = DAG.computeKnownBits(In);
21246 if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
21247 (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
21248 PackOpcode = X86ISD::PACKUS;
21249 return In;
21250 }
21251
21252 // Truncate with PACKSS if we are truncating a vector with sign-bits
21253 // that extend all the way to the packed/truncated value.
21254 // e.g. Comparison result, sext_in_reg, etc.
21255 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
21256
21257 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
21258 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
21259 // see through BITCASTs later on and combines/simplifications can't then use
21260 // it.
21261 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
21262 !Subtarget.hasAVX512())
21263 return SDValue();
21264
21265 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
21266 if ((Flags.hasNoSignedWrap() && DstSVT != MVT::i32) ||
21267 MinSignBits < NumSignBits) {
21268 PackOpcode = X86ISD::PACKSS;
21269 return In;
21270 }
21271
21272 // If we have a srl that only generates signbits that we will discard in
21273 // the truncation then we can use PACKSS by converting the srl to a sra.
21274 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
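  // For example (illustrative): a v16i16 -> v16i8 truncate of (srl X, 8) can
  // be rewritten as (sra X, 8); the bits above bit 7 become sign bits, so
  // PACKSSWB produces the same low bytes without saturating.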
21275 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
21276 if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) {
21277 if (*ShAmt == MinSignBits) {
21278 PackOpcode = X86ISD::PACKSS;
21279 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
21280 }
21281 }
21282
21283 return SDValue();
21284}
21285
21286/// This function lowers a vector truncation of 'extended sign-bits' or
21287/// 'extended zero-bits' values.
21288/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
21289static SDValue LowerTruncateVecPackWithSignBits(
21290 MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
21291 SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
21292 MVT SrcVT = In.getSimpleValueType();
21293 MVT DstSVT = DstVT.getVectorElementType();
21294 MVT SrcSVT = SrcVT.getVectorElementType();
21295 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21296 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21297 return SDValue();
21298
21299 // If the upper half of the source is undef, then attempt to split and
21300 // only truncate the lower half.
21301 if (DstVT.getSizeInBits() >= 128) {
21302 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21303 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21304 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
21305 Subtarget, DAG))
21306 return widenSubVector(Res, false, Subtarget, DAG, DL,
21307 DstVT.getSizeInBits());
21308 }
21309 }
21310
21311 unsigned PackOpcode;
21312 if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
21313 Subtarget, Flags))
21314 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
21315
21316 return SDValue();
21317}
21318
21319/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
21320/// X86ISD::PACKUS/X86ISD::PACKSS operations.
21321static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
21322 const X86Subtarget &Subtarget,
21323 SelectionDAG &DAG) {
21324 MVT SrcVT = In.getSimpleValueType();
21325 MVT DstSVT = DstVT.getVectorElementType();
21326 MVT SrcSVT = SrcVT.getVectorElementType();
21327 unsigned NumElems = DstVT.getVectorNumElements();
21328 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21329 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
21330 NumElems >= 8))
21331 return SDValue();
21332
21333 // SSSE3's pshufb results in fewer instructions in the cases below.
21334 if (Subtarget.hasSSSE3() && NumElems == 8) {
21335 if (SrcSVT == MVT::i16)
21336 return SDValue();
21337 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
21338 return SDValue();
21339 }
21340
21341 // If the upper half of the source is undef, then attempt to split and
21342 // only truncate the lower half.
21343 if (DstVT.getSizeInBits() >= 128) {
21344 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21345 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21346 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
21347 return widenSubVector(Res, false, Subtarget, DAG, DL,
21348 DstVT.getSizeInBits());
21349 }
21350 }
21351
21352 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
21353 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
21354 // truncate 2 x v4i32 to v8i16.
21355 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
21356 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
21357
21358 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
21359 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
21360
21361 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
21362 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
21363 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
21364 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
21365 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
21366 }
21367
21368 return SDValue();
21369}
21370
21371static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
21372 SelectionDAG &DAG,
21373 const X86Subtarget &Subtarget) {
21374 MVT VT = Op.getSimpleValueType();
21375 SDValue In = Op.getOperand(0);
21376 MVT InVT = In.getSimpleValueType();
21377 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21378
21379 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
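  // For example (illustrative): truncating v16i8 to v16i1 keeps only bit 0 of
  // each byte, so shifting left by 7 moves it into the sign bit where
  // VPMOVB2M (or a signed compare against zero) can read it.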
21380 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21381 if (InVT.getScalarSizeInBits() <= 16) {
21382 if (Subtarget.hasBWI()) {
21383 // legal, will go to VPMOVB2M, VPMOVW2M
21384 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21385 // We need to shift to get the lsb into sign position.
21386 // Shift packed bytes not supported natively, bitcast to word
21387 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21388 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21389 DAG.getBitcast(ExtVT, In),
21390 DAG.getConstant(ShiftInx, DL, ExtVT));
21391 In = DAG.getBitcast(InVT, In);
21392 }
21393 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21394 In, ISD::SETGT);
21395 }
21396 // Use TESTD/Q, extended vector to packed dword/qword.
21397 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21398 "Unexpected vector type.");
21399 unsigned NumElts = InVT.getVectorNumElements();
21400 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21401 // We need to change to a wider element type that we have support for.
21402 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21403 // For 16 element vectors we extend to v16i32 unless we are explicitly
21404 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21405 // we need to split into two 8 element vectors which we can extend to v8i32,
21406 // truncate and concat the results. There's an additional complication if
21407 // the original type is v16i8. In that case we can't split the v16i8
21408 // directly, so we need to shuffle high elements to low and use
21409 // sign_extend_vector_inreg.
21410 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21411 SDValue Lo, Hi;
21412 if (InVT == MVT::v16i8) {
21413 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21414 Hi = DAG.getVectorShuffle(
21415 InVT, DL, In, In,
21416 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21417 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21418 } else {
21419 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21420 Lo = extract128BitVector(In, 0, DAG, DL);
21421 Hi = extract128BitVector(In, 8, DAG, DL);
21422 }
21423 // We're split now, just emit two truncates and a concat. The two
21424 // truncates will trigger legalization to come back to this function.
21425 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21426 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21427 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21428 }
21429 // We either have 8 elements or we're allowed to use 512-bit vectors.
21430 // If we have VLX, we want to use the narrowest vector that can get the
21431 // job done so we use vXi32.
21432 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21433 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21434 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21435 InVT = ExtVT;
21436 ShiftInx = InVT.getScalarSizeInBits() - 1;
21437 }
21438
21439 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21440 // We need to shift to get the lsb into sign position.
21441 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21442 DAG.getConstant(ShiftInx, DL, InVT));
21443 }
21444 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21445 if (Subtarget.hasDQI())
21446 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21447 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21448}
21449
21450SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21451 SDLoc DL(Op);
21452 MVT VT = Op.getSimpleValueType();
21453 SDValue In = Op.getOperand(0);
21454 MVT InVT = In.getSimpleValueType();
21456 "Invalid TRUNCATE operation");
21457
21458 // If we're called by the type legalizer, handle a few cases.
21459 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21460 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
21461 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21462 VT.is128BitVector() && Subtarget.hasAVX512()) {
21463 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21464 "Unexpected subtarget!");
21465 // The default behavior is to truncate one step, concatenate, and then
21466 // truncate the remainder. We'd rather produce two 64-bit results and
21467 // concatenate those.
21468 SDValue Lo, Hi;
21469 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21470
21471 EVT LoVT, HiVT;
21472 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21473
21474 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21475 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21476 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21477 }
21478
21479 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21480 if (!Subtarget.hasAVX512() ||
21481 (InVT.is512BitVector() && VT.is256BitVector()))
21482 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21483 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21484 return SignPack;
21485
21486 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21487 if (!Subtarget.hasAVX512())
21488 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
21489
21490 // Otherwise let default legalization handle it.
21491 return SDValue();
21492 }
21493
21494 if (VT.getVectorElementType() == MVT::i1)
21495 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
21496
21497 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
21498 // concat from subvectors to use VPTRUNC etc.
21499 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In, DAG))
21500 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21501 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21502 return SignPack;
21503
21504 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21505 if (Subtarget.hasAVX512()) {
21506 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21507 assert(VT == MVT::v32i8 && "Unexpected VT!");
21508 return splitVectorIntUnary(Op, DAG, DL);
21509 }
21510
21511 // Word to byte only under BWI. Otherwise we have to promote to v16i32
21512 // and then truncate that. But we should only do that if we haven't been
21513 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21514 // handled by isel patterns.
21515 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21516 Subtarget.canExtendTo512DQ())
21517 return Op;
21518 }
21519
21520 // Handle truncation of V256 to V128 using shuffles.
21521 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21522
21523 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21524 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21525 if (Subtarget.hasInt256()) {
21526 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21527 In = DAG.getBitcast(MVT::v8i32, In);
21528 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21529 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21530 DAG.getVectorIdxConstant(0, DL));
21531 }
21532
21533 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21534 DAG.getVectorIdxConstant(0, DL));
21535 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21536 DAG.getVectorIdxConstant(2, DL));
21537 static const int ShufMask[] = {0, 2, 4, 6};
21538 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
21539 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
21540 }
21541
21542 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21543 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21544 if (Subtarget.hasInt256()) {
21545 // The PSHUFB mask:
21546 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21547 -1, -1, -1, -1, -1, -1, -1, -1,
21548 16, 17, 20, 21, 24, 25, 28, 29,
21549 -1, -1, -1, -1, -1, -1, -1, -1 };
21550 In = DAG.getBitcast(MVT::v32i8, In);
21551 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21552 In = DAG.getBitcast(MVT::v4i64, In);
21553
21554 static const int ShufMask2[] = {0, 2, -1, -1};
21555 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21556 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21557 DAG.getVectorIdxConstant(0, DL));
21558 return DAG.getBitcast(MVT::v8i16, In);
21559 }
21560
21561 return Subtarget.hasSSE41()
21562 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
21563 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
21564 }
21565
21566 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
21567 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
21568
21569 llvm_unreachable("All 256->128 cases should have been handled above!");
21570}
21571
21572// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21573// behaves on out of range inputs to generate optimized conversions.
21574static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21575 SelectionDAG &DAG,
21576 const X86Subtarget &Subtarget) {
21577 MVT SrcVT = Src.getSimpleValueType();
21578 unsigned DstBits = VT.getScalarSizeInBits();
21579 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21580
21581 // Calculate the converted result for values in the range 0 to
21582 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21583 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21584 SDValue Big =
21585 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21586 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21587 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21588
21589 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21590 // and only if the value was out of range. So we can use that
21591 // as our indicator that we'd rather use "Big" instead of "Small".
21592 //
21593 // Use "Small" if "IsOverflown" has all bits cleared
21594 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
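  // Worked example (illustrative) for a lane holding 3.0e9f: Small =
  // cvttps2dq(3.0e9f) overflows to 0x80000000, making IsOverflown all ones;
  // Big = cvttps2dq(3.0e9f - 2^31) = 0x32D05E00, and ORing them yields
  // 0xB2D05E00 == 3000000000, the expected unsigned result.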
21595
21596 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21597 // use the slightly slower blendv select instead.
21598 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21599 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21600 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21601 }
21602
21603 SDValue IsOverflown =
21604 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21605 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21606 return DAG.getNode(ISD::OR, dl, VT, Small,
21607 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21608}
21609
21610SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21611 bool IsStrict = Op->isStrictFPOpcode();
21612 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21613 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21614 bool HasVLX = Subtarget.hasVLX();
21615 MVT VT = Op->getSimpleValueType(0);
21616 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21617 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21618 MVT SrcVT = Src.getSimpleValueType();
21619 SDLoc dl(Op);
21620
21621 SDValue Res;
21622 if (isSoftF16(SrcVT, Subtarget)) {
21623 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21624 if (IsStrict)
21625 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21626 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21627 {NVT, MVT::Other}, {Chain, Src})});
21628 return DAG.getNode(Op.getOpcode(), dl, VT,
21629 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21630 } else if (isTypeLegal(SrcVT) &&
21631 isLegalConversion(VT, SrcVT, IsSigned, Subtarget)) {
21632 return Op;
21633 }
21634
21635 if (VT.isVector()) {
21636 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21637 MVT ResVT = MVT::v4i32;
21638 MVT TruncVT = MVT::v4i1;
21639 unsigned Opc;
21640 if (IsStrict)
21641 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21642 else
21643 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21644
21645 if (!IsSigned && !HasVLX) {
21646 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21647 // Widen to 512-bits.
21648 ResVT = MVT::v8i32;
21649 TruncVT = MVT::v8i1;
21650 Opc = Op.getOpcode();
21651 // Need to concat with zero vector for strict fp to avoid spurious
21652 // exceptions.
21653 // TODO: Should we just do this for non-strict as well?
21654 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21655 : DAG.getUNDEF(MVT::v8f64);
21656 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21657 DAG.getVectorIdxConstant(0, dl));
21658 }
21659 if (IsStrict) {
21660 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21661 Chain = Res.getValue(1);
21662 } else {
21663 Res = DAG.getNode(Opc, dl, ResVT, Src);
21664 }
21665
21666 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21667 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21668 DAG.getVectorIdxConstant(0, dl));
21669 if (IsStrict)
21670 return DAG.getMergeValues({Res, Chain}, dl);
21671 return Res;
21672 }
21673
21674 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21675 if ((HasVLX && (VT == MVT::v8i16 || VT == MVT::v16i16)) ||
21676 VT == MVT::v32i16)
21677 return Op;
21678
21679 MVT ResVT = VT;
21680 MVT EleVT = VT.getVectorElementType();
21681 if (EleVT != MVT::i64)
21682 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21683
21684 if (SrcVT == MVT::v2f16 || SrcVT == MVT::v4f16) {
21685 SDValue Tmp =
21686 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21687 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21688 Ops[0] = Src;
21689 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21690 }
21691
21692 if (!HasVLX) {
21693 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21694 // Widen to 512-bits.
21695 unsigned IntSize = EleVT.getSizeInBits();
21696 unsigned Num = IntSize > 16 ? 512 / IntSize : 32;
21697 ResVT = MVT::getVectorVT(EleVT, Num);
21698 Src = widenSubVector(MVT::getVectorVT(MVT::f16, Num), Src, IsStrict,
21699 Subtarget, DAG, dl);
21700 }
21701
21702 if (IsStrict) {
21703 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21704 : X86ISD::STRICT_CVTTP2UI,
21705 dl, {ResVT, MVT::Other}, {Chain, Src});
21706 Chain = Res.getValue(1);
21707 } else {
21708 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21709 ResVT, Src);
21710 }
21711
21712 // TODO: Need to add exception check code for strict FP.
21713 if (EleVT.getSizeInBits() < 16) {
21714 if (HasVLX)
21715 ResVT = MVT::getVectorVT(EleVT, 8);
21716 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21717 }
21718
21719 if (ResVT != VT)
21720 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21721 DAG.getVectorIdxConstant(0, dl));
21722
21723 if (IsStrict)
21724 return DAG.getMergeValues({Res, Chain}, dl);
21725 return Res;
21726 }
21727
21728 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21729 if (VT.getVectorElementType() == MVT::i16) {
21730 assert((SrcVT.getVectorElementType() == MVT::f32 ||
21731 SrcVT.getVectorElementType() == MVT::f64) &&
21732 "Expected f32/f64 vector!");
21733 MVT NVT = VT.changeVectorElementType(MVT::i32);
21734 if (IsStrict) {
21735 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21736 : ISD::STRICT_FP_TO_UINT,
21737 dl, {NVT, MVT::Other}, {Chain, Src});
21738 Chain = Res.getValue(1);
21739 } else {
21740 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21741 NVT, Src);
21742 }
21743
21744 // TODO: Need to add exception check code for strict FP.
21745 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21746
21747 if (IsStrict)
21748 return DAG.getMergeValues({Res, Chain}, dl);
21749 return Res;
21750 }
21751
21752 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21753 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21754 assert(!IsSigned && "Expected unsigned conversion!");
21755 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21756 return Op;
21757 }
21758
21759 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21760 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21761 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21762 Subtarget.useAVX512Regs()) {
21763 assert(!IsSigned && "Expected unsigned conversion!");
21764 assert(!Subtarget.hasVLX() && "Unexpected features!");
21765 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21766 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21767 // Need to concat with zero vector for strict fp to avoid spurious
21768 // exceptions.
21769 // TODO: Should we just do this for non-strict as well?
21770 SDValue Tmp =
21771 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21772 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21773 DAG.getVectorIdxConstant(0, dl));
21774
21775 if (IsStrict) {
21776 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21777 {Chain, Src});
21778 Chain = Res.getValue(1);
21779 } else {
21780 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21781 }
21782
21783 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21784 DAG.getVectorIdxConstant(0, dl));
21785
21786 if (IsStrict)
21787 return DAG.getMergeValues({Res, Chain}, dl);
21788 return Res;
21789 }
21790
21791 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21792 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21793 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21794 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21795 assert(!Subtarget.hasVLX() && "Unexpected features!");
21796 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21797 // Need to concat with zero vector for strict fp to avoid spurious
21798 // exceptions.
21799 // TODO: Should we just do this for non-strict as well?
21800 SDValue Tmp =
21801 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21802 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21803 DAG.getVectorIdxConstant(0, dl));
21804
21805 if (IsStrict) {
21806 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21807 {Chain, Src});
21808 Chain = Res.getValue(1);
21809 } else {
21810 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21811 }
21812
21813 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21814 DAG.getVectorIdxConstant(0, dl));
21815
21816 if (IsStrict)
21817 return DAG.getMergeValues({Res, Chain}, dl);
21818 return Res;
21819 }
21820
21821 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21822 if (!Subtarget.hasVLX()) {
21823 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21824 // legalizer and then widened again by vector op legalization.
21825 if (!IsStrict)
21826 return SDValue();
21827
21828 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21829 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21830 {Src, Zero, Zero, Zero});
21831 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21832 {Chain, Tmp});
21833 SDValue Chain = Tmp.getValue(1);
21834 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21835 DAG.getVectorIdxConstant(0, dl));
21836 return DAG.getMergeValues({Tmp, Chain}, dl);
21837 }
21838
21839 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21840 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21841 DAG.getUNDEF(MVT::v2f32));
21842 if (IsStrict) {
21843 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21844 : X86ISD::STRICT_CVTTP2UI;
21845 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21846 }
21847 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21848 return DAG.getNode(Opc, dl, VT, Tmp);
21849 }
21850
21851 // Generate optimized instructions for pre AVX512 unsigned conversions from
21852 // vXf32 to vXi32.
21853 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21854 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21855 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21856 assert(!IsSigned && "Expected unsigned conversion!");
21857 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21858 }
21859
21860 return SDValue();
21861 }
21862
21863 assert(!VT.isVector());
21864
21865 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21866
21867 if (!IsSigned && UseSSEReg) {
21868 // Conversions from f32/f64 with AVX512 should be legal.
21869 if (Subtarget.hasAVX512())
21870 return Op;
21871
21872 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21873 // behaves on out of range inputs to generate optimized conversions.
21874 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21875 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21876 unsigned DstBits = VT.getScalarSizeInBits();
21877 APInt UIntLimit = APInt::getSignMask(DstBits);
21878 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21879 DAG.getConstant(UIntLimit, dl, VT));
21880 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21881
21882 // Calculate the converted result for values in the range:
21883 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21884 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21885 SDValue Small =
21886 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21887 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21888 SDValue Big = DAG.getNode(
21889 X86ISD::CVTTS2SI, dl, VT,
21890 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21891 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21892
21893 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21894 // and only if the value was out of range. So we can use that
21895 // as our indicator that we'd rather use "Big" instead of "Small".
21896 //
21897 // Use "Small" if "IsOverflown" has all bits cleared
21898 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
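      // Worked example (illustrative) for an i64 conversion of 2^63 + 2048.0
      // (exactly representable in f64): Small overflows to
      // 0x8000000000000000, IsOverflown becomes all ones, Big = 2048, and the
      // OR reconstructs 0x8000000000000800, i.e. 2^63 + 2048 as unsigned.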
21899 SDValue IsOverflown = DAG.getNode(
21900 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21901 return DAG.getNode(ISD::OR, dl, VT, Small,
21902 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21903 }
21904
21905 // Use default expansion for i64.
21906 if (VT == MVT::i64)
21907 return SDValue();
21908
21909 assert(VT == MVT::i32 && "Unexpected VT!");
21910
21911 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21912 // FIXME: This does not generate an invalid exception if the input does not
21913 // fit in i32. PR44019
21914 if (Subtarget.is64Bit()) {
21915 if (IsStrict) {
21916 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21917 {Chain, Src});
21918 Chain = Res.getValue(1);
21919 } else
21920 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21921
21922 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21923 if (IsStrict)
21924 return DAG.getMergeValues({Res, Chain}, dl);
21925 return Res;
21926 }
21927
21928 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21929 // use fisttp which will be handled later.
21930 if (!Subtarget.hasSSE3())
21931 return SDValue();
21932 }
21933
21934 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21935 // FIXME: This does not generate an invalid exception if the input does not
21936 // fit in i16. PR44019
21937 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21938 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21939 if (IsStrict) {
21940 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21941 {Chain, Src});
21942 Chain = Res.getValue(1);
21943 } else
21944 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21945
21946 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21947 if (IsStrict)
21948 return DAG.getMergeValues({Res, Chain}, dl);
21949 return Res;
21950 }
21951
21952 // If this is a FP_TO_SINT using SSEReg we're done.
21953 if (UseSSEReg && IsSigned)
21954 return Op;
21955
21956 // fp128 needs to use a libcall.
21957 if (SrcVT == MVT::f128) {
21958 RTLIB::Libcall LC;
21959 if (IsSigned)
21960 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21961 else
21962 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21963
21964 MakeLibCallOptions CallOptions;
21965 std::pair<SDValue, SDValue> Tmp =
21966 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21967
21968 if (IsStrict)
21969 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21970
21971 return Tmp.first;
21972 }
21973
21974 // Fall back to X87.
21975 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21976 if (IsStrict)
21977 return DAG.getMergeValues({V, Chain}, dl);
21978 return V;
21979 }
21980
21981 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21982}
21983
21984SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21985 SelectionDAG &DAG) const {
21986 SDValue Src = Op.getOperand(0);
21987 EVT DstVT = Op.getSimpleValueType();
21988 MVT SrcVT = Src.getSimpleValueType();
21989
21990 if (SrcVT.isVector())
21991 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21992
21993 if (SrcVT == MVT::f16)
21994 return SDValue();
21995
21996 // If the source is in an SSE register, the node is Legal.
21997 if (isScalarFPTypeInSSEReg(SrcVT))
21998 return Op;
21999
22000 return LRINT_LLRINTHelper(Op.getNode(), DAG);
22001}
22002
22003SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
22004 SelectionDAG &DAG) const {
22005 EVT DstVT = N->getValueType(0);
22006 SDValue Src = N->getOperand(0);
22007 EVT SrcVT = Src.getValueType();
22008
22009 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
22010 // f16 must be promoted before using the lowering in this routine.
22011 // fp128 does not use this lowering.
22012 return SDValue();
22013 }
22014
22015 SDLoc DL(N);
22016 SDValue Chain = DAG.getEntryNode();
22017
22018 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
22019
22020 // If we're converting from SSE, the stack slot needs to hold both types.
22021 // Otherwise it only needs to hold the DstVT.
22022 EVT OtherVT = UseSSE ? SrcVT : DstVT;
22023 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
22024 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
22025 MachinePointerInfo MPI =
22026 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
22027
22028 if (UseSSE) {
22029 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
22030 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
22031 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22032 SDValue Ops[] = { Chain, StackPtr };
22033
22034 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
22035 /*Align*/ std::nullopt,
22036 MachineMemOperand::MOLoad);
22037 Chain = Src.getValue(1);
22038 }
22039
22040 SDValue StoreOps[] = { Chain, Src, StackPtr };
22041 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
22042 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
22043 MachineMemOperand::MOStore);
22044
22045 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
22046}
22047
22048SDValue
22049X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
22050 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
22051 // but making use of X86 specifics to produce better instruction sequences.
22052 SDNode *Node = Op.getNode();
22053 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
22054 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
22055 SDLoc dl(SDValue(Node, 0));
22056 SDValue Src = Node->getOperand(0);
22057
22058 // There are three types involved here: SrcVT is the source floating point
22059 // type, DstVT is the type of the result, and TmpVT is the result of the
22060 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
22061 // DstVT).
22062 EVT SrcVT = Src.getValueType();
22063 EVT DstVT = Node->getValueType(0);
22064 EVT TmpVT = DstVT;
22065
22066 // This code is only for floats and doubles. Fall back to generic code for
22067 // anything else.
22068 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
22069 return SDValue();
22070
22071 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
22072 unsigned SatWidth = SatVT.getScalarSizeInBits();
22073 unsigned DstWidth = DstVT.getScalarSizeInBits();
22074 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
22075 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
22076 "Expected saturation width smaller than result width");
22077
22078 // Promote result of FP_TO_*INT to at least 32 bits.
22079 if (TmpWidth < 32) {
22080 TmpVT = MVT::i32;
22081 TmpWidth = 32;
22082 }
22083
22084 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
22085 // us to use a native signed conversion instead.
22086 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
22087 TmpVT = MVT::i64;
22088 TmpWidth = 64;
22089 }
22090
22091 // If the saturation width is smaller than the size of the temporary result,
22092 // we can always use signed conversion, which is native.
22093 if (SatWidth < TmpWidth)
22094 FpToIntOpcode = ISD::FP_TO_SINT;
22095
22096 // Determine minimum and maximum integer values and their corresponding
22097 // floating-point values.
22098 APInt MinInt, MaxInt;
22099 if (IsSigned) {
22100 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
22101 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
22102 } else {
22103 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
22104 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
22105 }
22106
22107 const fltSemantics &Sem = SrcVT.getFltSemantics();
22108 APFloat MinFloat(Sem);
22109 APFloat MaxFloat(Sem);
22110
22111 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
22112 MinInt, IsSigned, APFloat::rmTowardZero);
22113 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
22114 MaxInt, IsSigned, APFloat::rmTowardZero);
22115 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
22116 && !(MaxStatus & APFloat::opStatus::opInexact);
22117
22118 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
22119 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
22120
22121 // If the integer bounds are exactly representable as floats, emit a
22122 // min+max+fptoi sequence. Otherwise use comparisons and selects.
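  // For example (illustrative): for f32 -> i32 signed saturation, MinInt
  // (-2^31) is exact but MaxInt (2^31 - 1) rounds toward zero to
  // 2147483520.0f, so the compare/select path below is taken; for f64 -> i32
  // both bounds are exact and the min+max+fptoi path applies.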
22123 if (AreExactFloatBounds) {
22124 if (DstVT != TmpVT) {
22125 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
22126 SDValue MinClamped = DAG.getNode(
22127 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
22128 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
22129 SDValue BothClamped = DAG.getNode(
22130 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
22131 // Convert clamped value to integer.
22132 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
22133
22134 // NaN will become INDVAL, with the top bit set and the rest zero.
22135 // Truncation will discard the top bit, resulting in zero.
22136 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22137 }
22138
22139 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
22140 SDValue MinClamped = DAG.getNode(
22141 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
22142 // Clamp by MaxFloat from above. NaN cannot occur.
22143 SDValue BothClamped = DAG.getNode(
22144 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
22145 // Convert clamped value to integer.
22146 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
22147
22148 if (!IsSigned) {
22149 // In the unsigned case we're done, because we mapped NaN to MinFloat,
22150 // which is zero.
22151 return FpToInt;
22152 }
22153
22154 // Otherwise, select zero if Src is NaN.
22155 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22156 return DAG.getSelectCC(
22157 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
22158 }
22159
22160 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
22161 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
22162
22163 // Result of direct conversion, which may be selected away.
22164 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
22165
22166 if (DstVT != TmpVT) {
22167 // NaN will become INDVAL, with the top bit set and the rest zero.
22168 // Truncation will discard the top bit, resulting in zero.
22169 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22170 }
22171
22172 SDValue Select = FpToInt;
22173 // For signed conversions where we saturate to the same size as the
22174 // result type of the fptoi instructions, INDVAL coincides with integer
22175 // minimum, so we don't need to explicitly check it.
22176 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
22177 // If Src ULT MinFloat, select MinInt. In particular, this also selects
22178 // MinInt if Src is NaN.
22179 Select = DAG.getSelectCC(
22180 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
22181 }
22182
22183 // If Src OGT MaxFloat, select MaxInt.
22184 Select = DAG.getSelectCC(
22185 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
22186
22187 // In the unsigned case we are done, because we mapped NaN to MinInt, which
22188 // is already zero. The promoted case was already handled above.
22189 if (!IsSigned || DstVT != TmpVT) {
22190 return Select;
22191 }
22192
22193 // Otherwise, select 0 if Src is NaN.
22194 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22195 return DAG.getSelectCC(
22196 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
22197}
22198
22199SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
22200 bool IsStrict = Op->isStrictFPOpcode();
22201
22202 SDLoc DL(Op);
22203 MVT VT = Op.getSimpleValueType();
22204 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22205 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22206 MVT SVT = In.getSimpleValueType();
22207
22208 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
22209 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
22210 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
22211 !Subtarget.getTargetTriple().isOSDarwin()))
22212 return SDValue();
22213
22214 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
22215 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
22216 return Op;
22217
22218 if (SVT == MVT::f16) {
22219 if (Subtarget.hasFP16())
22220 return Op;
22221
22222 if (VT != MVT::f32) {
22223 if (IsStrict)
22224 return DAG.getNode(
22225 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
22226 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
22227 {MVT::f32, MVT::Other}, {Chain, In})});
22228
22229 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
22230 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
22231 }
22232
22233 if (!Subtarget.hasF16C()) {
22234 if (!Subtarget.getTargetTriple().isOSDarwin())
22235 return SDValue();
22236
22237 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
22238
22239 // Need a libcall, but ABI for f16 is soft-float on MacOS.
22240 TargetLowering::CallLoweringInfo CLI(DAG);
22241 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22242
22243 In = DAG.getBitcast(MVT::i16, In);
22244 TargetLowering::ArgListTy Args;
22245 TargetLowering::ArgListEntry Entry(
22246 In, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()));
22247 Entry.IsSExt = false;
22248 Entry.IsZExt = true;
22249 Args.push_back(Entry);
22250
22251 SDValue Callee = DAG.getExternalSymbol(
22252 getLibcallName(RTLIB::FPEXT_F16_F32),
22253 getPointerTy(DAG.getDataLayout()));
22254 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22255 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
22256 std::move(Args));
22257
22258 SDValue Res;
22259 std::tie(Res,Chain) = LowerCallTo(CLI);
22260 if (IsStrict)
22261 Res = DAG.getMergeValues({Res, Chain}, DL);
22262
22263 return Res;
22264 }
22265
22266 In = DAG.getBitcast(MVT::i16, In);
22267 SDValue Res;
22268 if (IsStrict) {
22269 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
22270 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
22271 DAG.getVectorIdxConstant(0, DL));
22272 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
22273 {Chain, In});
22274 Chain = Res.getValue(1);
22275 } else {
22276 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
22277 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
22278 DAG.getUNDEF(MVT::v4i32), In,
22279 DAG.getVectorIdxConstant(0, DL));
22280 In = DAG.getBitcast(MVT::v8i16, In);
22281 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
22282 DAG.getTargetConstant(4, DL, MVT::i32));
22283 }
22284 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
22285 DAG.getVectorIdxConstant(0, DL));
22286 if (IsStrict)
22287 return DAG.getMergeValues({Res, Chain}, DL);
22288 return Res;
22289 }
22290
22291 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
22292 return Op;
22293
22294 if (SVT.getVectorElementType() == MVT::f16) {
22295 if (Subtarget.hasFP16() && isTypeLegal(SVT))
22296 return Op;
22297 assert(Subtarget.hasF16C() && "Unexpected features!");
22298 if (SVT == MVT::v2f16)
22299 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
22300 DAG.getUNDEF(MVT::v2f16));
22301 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
22302 DAG.getUNDEF(MVT::v4f16));
22303 if (IsStrict)
22304 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22305 {Op->getOperand(0), Res});
22306 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22307 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
22308 return Op;
22309 }
22310
22311 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
22312
22313 SDValue Res =
22314 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
22315 if (IsStrict)
22316 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22317 {Op->getOperand(0), Res});
22318 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22319}
22320
22321SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
22322 bool IsStrict = Op->isStrictFPOpcode();
22323
22324 SDLoc DL(Op);
22325 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22326 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22327 MVT VT = Op.getSimpleValueType();
22328 MVT SVT = In.getSimpleValueType();
22329
22330 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
22331 return SDValue();
22332
22333 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
22334 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
22335 if (!Subtarget.getTargetTriple().isOSDarwin())
22336 return SDValue();
22337
22338 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
22339 TargetLowering::CallLoweringInfo CLI(DAG);
22340 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22341
22342 TargetLowering::ArgListTy Args;
22343 TargetLowering::ArgListEntry Entry(
22344 In, EVT(SVT).getTypeForEVT(*DAG.getContext()));
22345 Entry.IsSExt = false;
22346 Entry.IsZExt = true;
22347 Args.push_back(Entry);
22348
22349 SDValue Callee = DAG.getExternalSymbol(
22350 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
22351 : RTLIB::FPROUND_F32_F16),
22352 getPointerTy(DAG.getDataLayout()));
22353 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22354 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
22355 std::move(Args));
22356
22357 SDValue Res;
22358 std::tie(Res, Chain) = LowerCallTo(CLI);
22359
22360 Res = DAG.getBitcast(MVT::f16, Res);
22361
22362 if (IsStrict)
22363 Res = DAG.getMergeValues({Res, Chain}, DL);
22364
22365 return Res;
22366 }
22367
22368 if (VT.getScalarType() == MVT::bf16) {
22369 if (SVT.getScalarType() == MVT::f32 &&
22370 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22371 Subtarget.hasAVXNECONVERT()))
22372 return Op;
22373 return SDValue();
22374 }
22375
22376 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
22377 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
22378 return SDValue();
22379
22380 if (VT.isVector())
22381 return Op;
22382
22383 SDValue Res;
22384 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
22385 MVT::i32);
22386 if (IsStrict) {
22387 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
22388 DAG.getConstantFP(0, DL, MVT::v4f32), In,
22389 DAG.getVectorIdxConstant(0, DL));
22390 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
22391 {Chain, Res, Rnd});
22392 Chain = Res.getValue(1);
22393 } else {
22394 // FIXME: Should we use zeros for upper elements for non-strict?
22395 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
22396 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
22397 }
22398
22399 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22400 DAG.getVectorIdxConstant(0, DL));
22401 Res = DAG.getBitcast(MVT::f16, Res);
22402
22403 if (IsStrict)
22404 return DAG.getMergeValues({Res, Chain}, DL);
22405
22406 return Res;
22407 }
22408
22409 return Op;
22410}
22411
22412static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22413 bool IsStrict = Op->isStrictFPOpcode();
22414 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22415 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22416 "Unexpected VT!");
22417
22418 SDLoc dl(Op);
22419 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22420 DAG.getConstant(0, dl, MVT::v8i16), Src,
22421 DAG.getVectorIdxConstant(0, dl));
22422
22423 SDValue Chain;
22424 if (IsStrict) {
22425 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22426 {Op.getOperand(0), Res});
22427 Chain = Res.getValue(1);
22428 } else {
22429 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22430 }
22431
22432 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22433 DAG.getVectorIdxConstant(0, dl));
22434
22435 if (IsStrict)
22436 return DAG.getMergeValues({Res, Chain}, dl);
22437
22438 return Res;
22439}
22440
22441static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22442 bool IsStrict = Op->isStrictFPOpcode();
22443 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22444 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22445 "Unexpected VT!");
22446
22447 SDLoc dl(Op);
22448 SDValue Res, Chain;
22449 if (IsStrict) {
22450 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22451 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22452 DAG.getVectorIdxConstant(0, dl));
22453 Res = DAG.getNode(
22454 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22455 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22456 Chain = Res.getValue(1);
22457 } else {
22458 // FIXME: Should we use zeros for upper elements for non-strict?
22459 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22460 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22461 DAG.getTargetConstant(4, dl, MVT::i32));
22462 }
22463
22464 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22465 DAG.getVectorIdxConstant(0, dl));
22466
22467 if (IsStrict)
22468 return DAG.getMergeValues({Res, Chain}, dl);
22469
22470 return Res;
22471}
22472
22473SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
22474 SelectionDAG &DAG) const {
22475 SDLoc DL(Op);
22476
22477 MVT SVT = Op.getOperand(0).getSimpleValueType();
22478 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22479 Subtarget.hasAVXNECONVERT())) {
22480 SDValue Res;
22481 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
22482 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
22483 Res = DAG.getBitcast(MVT::v8i16, Res);
22484 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22485 DAG.getVectorIdxConstant(0, DL));
22486 }
22487
22488 MakeLibCallOptions CallOptions;
22489 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
22490 SDValue Res =
22491 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
22492 return DAG.getBitcast(MVT::i16, Res);
22493}
22494
22495/// Depending on uarch and/or optimizing for size, we might prefer to use a
22496/// vector operation in place of the typical scalar operation.
22497static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
22498 SelectionDAG &DAG,
22499 const X86Subtarget &Subtarget) {
22500 // If both operands have other uses, this is probably not profitable.
22501 SDValue LHS = Op.getOperand(0);
22502 SDValue RHS = Op.getOperand(1);
22503 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22504 return Op;
22505
22506 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22507 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22508 if (IsFP && !Subtarget.hasSSE3())
22509 return Op;
22510 if (!IsFP && !Subtarget.hasSSSE3())
22511 return Op;
22512
22513 // Extract from a common vector.
22514 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22515 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22516 LHS.getOperand(0) != RHS.getOperand(0) ||
22517 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22518 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22519 !shouldUseHorizontalOp(true, DAG, Subtarget))
22520 return Op;
22521
22522 // Allow commuted 'hadd' ops.
22523 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22524 unsigned HOpcode;
22525 switch (Op.getOpcode()) {
22526 // clang-format off
22527 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22528 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22529 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22530 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22531 default:
22532 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22533 // clang-format on
22534 }
22535 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22536 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22537 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22538 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22539 std::swap(LExtIndex, RExtIndex);
22540
22541 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22542 return Op;
22543
22544 SDValue X = LHS.getOperand(0);
22545 EVT VecVT = X.getValueType();
22546 unsigned BitWidth = VecVT.getSizeInBits();
22547 unsigned NumLanes = BitWidth / 128;
22548 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22549 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22550 "Not expecting illegal vector widths here");
22551
22552 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22553 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22554 if (BitWidth == 256 || BitWidth == 512) {
22555 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22556 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22557 LExtIndex %= NumEltsPerLane;
22558 }
22559
22560 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22561 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22562 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22563 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22564 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22565 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22566 DAG.getVectorIdxConstant(LExtIndex / 2, DL));
22567}
22568
22569/// Depending on uarch and/or optimizing for size, we might prefer to use a
22570/// vector operation in place of the typical scalar operation.
22571SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22572 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22573 "Only expecting float/double");
22574 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
22575}
22576
22577/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22578/// This mode isn't supported in hardware on X86. But as long as we aren't
22579/// compiling with trapping math, we can emulate this with
22580/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
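/// For example (illustrative): for X = 0.49999997f (the largest float below
/// 0.5), adding a plain 0.5f would round up to 1.0f and truncate to 1.0
/// instead of 0.0; adding nextafter(0.5, 0.0) gives 0.99999994f, which
/// truncates to the expected 0.0, while X = 0.5f still rounds up to 1.0.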
22581static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22582 SDValue N0 = Op.getOperand(0);
22583 SDLoc dl(Op);
22584 MVT VT = Op.getSimpleValueType();
22585
22586 // N0 += copysign(nextafter(0.5, 0.0), N0)
22587 const fltSemantics &Sem = VT.getFltSemantics();
22588 bool Ignored;
22589 APFloat Point5Pred = APFloat(0.5f);
22590 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22591 Point5Pred.next(/*nextDown*/true);
22592
22593 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22594 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22595 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22596
22597 // Truncate the result to remove fraction.
22598 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22599}
22600
22601/// The only differences between FABS and FNEG are the mask and the logic op.
22602/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22603static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22604 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22605 "Wrong opcode for lowering FABS or FNEG.");
22606
22607 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22608
22609 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22610 // into an FNABS. We'll lower the FABS after that if it is still in use.
22611 if (IsFABS)
22612 for (SDNode *User : Op->users())
22613 if (User->getOpcode() == ISD::FNEG)
22614 return Op;
22615
22616 SDLoc dl(Op);
22617 MVT VT = Op.getSimpleValueType();
22618
22619 bool IsF128 = (VT == MVT::f128);
22620 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22622 "Unexpected type in LowerFABSorFNEG");
22623
22624 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
22625 // decide if we should generate a 16-byte constant mask when we only need 4 or
22626 // 8 bytes for the scalar case.
22627
22628 // There are no scalar bitwise logical SSE/AVX instructions, so we
22629 // generate a 16-byte vector constant and logic op even for the scalar case.
22630 // Using a 16-byte mask allows folding the load of the mask with
22631 // the logic op, so it can save (~4 bytes) on code size.
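  // For example (illustrative): scalar f32 FABS becomes ANDPS with a v4f32
  // splat of 0x7fffffff, FNEG becomes XORPS with a splat of 0x80000000, and
  // FNEG(FABS(x)) folds to ORPS with the sign mask, matching the
  // FAND/FXOR/FOR selection below.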
22632 bool IsFakeVector = !VT.isVector() && !IsF128;
22633 MVT LogicVT = VT;
22634 if (IsFakeVector)
22635 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22636 : (VT == MVT::f32) ? MVT::v4f32
22637 : MVT::v8f16;
22638
22639 unsigned EltBits = VT.getScalarSizeInBits();
22640 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22641 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22642 APInt::getSignMask(EltBits);
22643 const fltSemantics &Sem = VT.getFltSemantics();
22644 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22645
22646 SDValue Op0 = Op.getOperand(0);
22647 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22648 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22649 IsFNABS ? X86ISD::FOR :
22650 X86ISD::FXOR;
22651 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22652
22653 if (VT.isVector() || IsF128)
22654 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22655
22656 // For the scalar case extend to a 128-bit vector, perform the logic op,
22657 // and extract the scalar result back out.
22658 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22659 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22660 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22661 DAG.getVectorIdxConstant(0, dl));
22662}
22663
22664static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22665 SDValue Mag = Op.getOperand(0);
22666 SDValue Sign = Op.getOperand(1);
22667 SDLoc dl(Op);
22668
22669 // If the sign operand is smaller, extend it first.
22670 MVT VT = Op.getSimpleValueType();
22671 if (Sign.getSimpleValueType().bitsLT(VT))
22672 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22673
22674 // And if it is bigger, shrink it first.
22675 if (Sign.getSimpleValueType().bitsGT(VT))
22676 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
22677 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22678
22679 // At this point the operands and the result should have the same
22680 // type, and that won't be f80 since that is not custom lowered.
22681 bool IsF128 = (VT == MVT::f128);
22682 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22684 "Unexpected type in LowerFCOPYSIGN");
22685
22686 const fltSemantics &Sem = VT.getFltSemantics();
22687
22688 // Perform all scalar logic operations as 16-byte vectors because there are no
22689 // scalar FP logic instructions in SSE.
22690 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22691 // unnecessary splats, but we might miss load folding opportunities. Should
22692 // this decision be based on OptimizeForSize?
22693 bool IsFakeVector = !VT.isVector() && !IsF128;
22694 MVT LogicVT = VT;
22695 if (IsFakeVector)
22696 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22697 : (VT == MVT::f32) ? MVT::v4f32
22698 : MVT::v8f16;
22699
22700 // The mask constants are automatically splatted for vector types.
22701 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22702 SDValue SignMask = DAG.getConstantFP(
22703 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22704 SDValue MagMask = DAG.getConstantFP(
22705 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
22706
22707 // First, clear all bits but the sign bit from the second operand (sign).
22708 if (IsFakeVector)
22709 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22710 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22711
22712 // Next, clear the sign bit from the first operand (magnitude).
22713 // TODO: If we had general constant folding for FP logic ops, this check
22714 // wouldn't be necessary.
22715 SDValue MagBits;
22716 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22717 APFloat APF = Op0CN->getValueAPF();
22718 APF.clearSign();
22719 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22720 } else {
22721 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22722 if (IsFakeVector)
22723 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22724 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22725 }
22726
22727 // OR the magnitude value with the sign bit.
22728 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22729 return !IsFakeVector ? Or
22730 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22731 DAG.getVectorIdxConstant(0, dl));
22732}
22733
22734 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22735 SDValue N0 = Op.getOperand(0);
22736 SDLoc dl(Op);
22737 MVT VT = Op.getSimpleValueType();
22738
22739 MVT OpVT = N0.getSimpleValueType();
22740 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22741 "Unexpected type for FGETSIGN");
22742
22743 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
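// MOVMSK packs the sign bit of every vector element into the low bits of a GPR,
// so bit 0 of the result is the sign bit of the original scalar in element 0.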
22744 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22745 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22746 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22747 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22748 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22749 return Res;
22750}
22751
22752/// Helper for attempting to create a X86ISD::BT node.
22753static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22754 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22755 // instruction. Since the shift amount is in-range-or-undefined, we know
22756 // that doing a bittest on the i32 value is ok. We extend to i32 because
22757 // the encoding for the i16 version is larger than the i32 version.
22758 // Also promote i16 to i32 for performance / code size reasons.
22759 if (Src.getValueType().getScalarSizeInBits() < 32)
22760 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22761
22762 // No legal type found, give up.
22763 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22764 return SDValue();
22765
22766 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22767 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22768 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22769 // known to be zero.
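// e.g. if BitNo is known to be in [0, 31], (bt i64 %src, %bitno) and
// (bt i32 (trunc %src), %bitno) test the same bit, so prefer the 32-bit form.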
22770 if (Src.getValueType() == MVT::i64 &&
22771 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22772 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22773
22774 // If the operand types disagree, extend the shift amount to match. Since
22775 // BT ignores high bits (like shifts) we can use anyextend.
22776 if (Src.getValueType() != BitNo.getValueType()) {
22777 // Peek through a mask/modulo operation.
22778 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
22779 // we probably need a better IsDesirableToPromoteOp to handle this as well.
22780 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22781 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
22782 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22783 BitNo.getOperand(0)),
22784 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22785 BitNo.getOperand(1)));
22786 else
22787 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22788 }
22789
22790 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22791}
22792
22793/// Helper for creating a X86ISD::SETCC node.
22794 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22795 SelectionDAG &DAG) {
22796 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22797 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22798}
22799
22800/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22801/// recognizable memcmp expansion.
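/// e.g. (or (xor A, B), (xor C, D)) matches, but a lone (xor A, B) at the root
/// does not: the root must be an OR of XOR leaves.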
22802static bool isOrXorXorTree(SDValue X, bool Root = true) {
22803 if (X.getOpcode() == ISD::OR)
22804 return isOrXorXorTree(X.getOperand(0), false) &&
22805 isOrXorXorTree(X.getOperand(1), false);
22806 if (Root)
22807 return false;
22808 return X.getOpcode() == ISD::XOR;
22809}
22810
22811/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22812/// expansion.
22813template <typename F>
22814 static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22815 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22816 SDValue Op0 = X.getOperand(0);
22817 SDValue Op1 = X.getOperand(1);
22818 if (X.getOpcode() == ISD::OR) {
22819 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22820 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22821 if (VecVT != CmpVT)
22822 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22823 if (HasPT)
22824 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22825 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22826 }
22827 if (X.getOpcode() == ISD::XOR) {
22828 SDValue A = SToV(Op0);
22829 SDValue B = SToV(Op1);
22830 if (VecVT != CmpVT)
22831 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22832 if (HasPT)
22833 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22834 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22835 }
22836 llvm_unreachable("Impossible");
22837}
22838
22839/// Try to map a 128-bit or larger integer comparison to vector instructions
22840/// before type legalization splits it up into chunks.
22841 static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22842 ISD::CondCode CC,
22843 const SDLoc &DL,
22844 SelectionDAG &DAG,
22845 const X86Subtarget &Subtarget) {
22846 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22847
22848 // We're looking for an oversized integer equality comparison.
22849 EVT OpVT = X.getValueType();
22850 unsigned OpSize = OpVT.getSizeInBits();
22851 if (!OpVT.isScalarInteger() || OpSize < 128)
22852 return SDValue();
22853
22854 // Ignore a comparison with zero because that gets special treatment in
22855 // EmitTest(). But make an exception for the special case of a pair of
22856 // logically-combined vector-sized operands compared to zero. This pattern may
22857 // be generated by the memcmp expansion pass with oversized integer compares
22858 // (see PR33325).
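// e.g. an equality-only memcmp(a, b, 32) expansion can produce two 128-bit
// loads combined as (or (xor LoA, LoB), (xor HiA, HiB)) compared against zero.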
22859 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22860 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22861 return SDValue();
22862
22863 // Don't perform this combine if constructing the vector will be expensive.
22864 auto IsVectorBitCastCheap = [](SDValue X) {
22865 X = peekThroughBitcasts(X);
22866 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22867 X.getOpcode() == ISD::LOAD;
22868 };
22869 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22870 !IsOrXorXorTreeCCZero)
22871 return SDValue();
22872
22873 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22874 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22875 // Otherwise use PCMPEQ (plus AND) and mask testing.
22876 bool NoImplicitFloatOps =
22877 DAG.getMachineFunction().getFunction().hasFnAttribute(
22878 Attribute::NoImplicitFloat);
22879 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22880 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22881 (OpSize == 256 && Subtarget.hasAVX()) ||
22882 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22883 bool HasPT = Subtarget.hasSSE41();
22884
22885 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22886 // vector registers are essentially free. (Technically, widening registers
22887 // prevents load folding, but the tradeoff is worth it.)
22888 bool PreferKOT = Subtarget.preferMaskRegisters();
22889 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22890
22891 EVT VecVT = MVT::v16i8;
22892 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22893 if (OpSize == 256) {
22894 VecVT = MVT::v32i8;
22895 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22896 }
22897 EVT CastVT = VecVT;
22898 bool NeedsAVX512FCast = false;
22899 if (OpSize == 512 || NeedZExt) {
22900 if (Subtarget.hasBWI()) {
22901 VecVT = MVT::v64i8;
22902 CmpVT = MVT::v64i1;
22903 if (OpSize == 512)
22904 CastVT = VecVT;
22905 } else {
22906 VecVT = MVT::v16i32;
22907 CmpVT = MVT::v16i1;
22908 CastVT = OpSize == 512 ? VecVT
22909 : OpSize == 256 ? MVT::v8i32
22910 : MVT::v4i32;
22911 NeedsAVX512FCast = true;
22912 }
22913 }
22914
22915 auto ScalarToVector = [&](SDValue X) -> SDValue {
22916 bool TmpZext = false;
22917 EVT TmpCastVT = CastVT;
22918 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22919 SDValue OrigX = X.getOperand(0);
22920 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22921 if (OrigSize < OpSize) {
22922 if (OrigSize == 128) {
22923 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22924 X = OrigX;
22925 TmpZext = true;
22926 } else if (OrigSize == 256) {
22927 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22928 X = OrigX;
22929 TmpZext = true;
22930 }
22931 }
22932 }
22933 X = DAG.getBitcast(TmpCastVT, X);
22934 if (!NeedZExt && !TmpZext)
22935 return X;
22936 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22937 DAG.getConstant(0, DL, VecVT), X,
22938 DAG.getVectorIdxConstant(0, DL));
22939 };
22940
22941 SDValue Cmp;
22942 if (IsOrXorXorTreeCCZero) {
22943 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22944 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22945 // Use 2 vector equality compares and 'and' the results before doing a
22946 // MOVMSK.
22947 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22948 } else {
22949 SDValue VecX = ScalarToVector(X);
22950 SDValue VecY = ScalarToVector(Y);
22951 if (VecVT != CmpVT) {
22952 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22953 } else if (HasPT) {
22954 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22955 } else {
22956 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22957 }
22958 }
22959 // AVX512 should emit a setcc that will lower to kortest.
22960 if (VecVT != CmpVT) {
22961 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22962 : CmpVT == MVT::v32i1 ? MVT::i32
22963 : MVT::i16;
22964 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22965 DAG.getConstant(0, DL, KRegVT), CC);
22966 }
22967 if (HasPT) {
22968 SDValue BCCmp =
22969 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22970 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22971 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22972 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22973 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22974 }
22975 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22976 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22977 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22978 assert(Cmp.getValueType() == MVT::v16i8 &&
22979 "Non 128-bit vector on pre-SSE41 target");
22980 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22981 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22982 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22983 }
22984
22985 return SDValue();
22986}
22987
22988/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22989/// style scalarized (associative) reduction patterns. Partial reductions
22990/// are supported when the pointer SrcMask is non-null.
22991/// TODO - move this to SelectionDAG?
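/// e.g. with BinOp == ISD::OR, or(extract(X,0), or(extract(X,1), or(extract(X,2),
/// extract(X,3)))) matches with X reported as the single source vector.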
22992 static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22993 SmallVectorImpl<SDValue> &SrcOps,
22994 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22995 SmallVector<SDValue, 8> Opnds;
22996 DenseMap<SDValue, APInt> SrcOpMap;
22997 EVT VT = MVT::Other;
22998
22999 // Recognize a special case where a vector is casted into wide integer to
23000 // test all 0s.
23001 assert(Op.getOpcode() == unsigned(BinOp) &&
23002 "Unexpected bit reduction opcode");
23003 Opnds.push_back(Op.getOperand(0));
23004 Opnds.push_back(Op.getOperand(1));
23005
23006 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
23007 SDValue I = Opnds[Slot];
23008 // BFS traverse all BinOp operands.
23009 if (I->getOpcode() == unsigned(BinOp)) {
23010 Opnds.push_back(I->getOperand(0));
23011 Opnds.push_back(I->getOperand(1));
23012 // Re-evaluate the number of nodes to be traversed.
23013 e += 2; // 2 more nodes (LHS and RHS) are pushed.
23014 continue;
23015 }
23016
23017 // Quit if a non-EXTRACT_VECTOR_ELT
23018 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23019 return false;
23020
23021 // Quit if without a constant index.
23022 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
23023 if (!Idx)
23024 return false;
23025
23026 SDValue Src = I->getOperand(0);
23027 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
23028 if (M == SrcOpMap.end()) {
23029 VT = Src.getValueType();
23030 // Quit if not the same type.
23031 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
23032 return false;
23033 unsigned NumElts = VT.getVectorNumElements();
23034 APInt EltCount = APInt::getZero(NumElts);
23035 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
23036 SrcOps.push_back(Src);
23037 }
23038
23039 // Quit if element already used.
23040 unsigned CIdx = Idx->getZExtValue();
23041 if (M->second[CIdx])
23042 return false;
23043 M->second.setBit(CIdx);
23044 }
23045
23046 if (SrcMask) {
23047 // Collect the source partial masks.
23048 for (SDValue &SrcOp : SrcOps)
23049 SrcMask->push_back(SrcOpMap[SrcOp]);
23050 } else {
23051 // Quit if not all elements are used.
23052 for (const auto &I : SrcOpMap)
23053 if (!I.second.isAllOnes())
23054 return false;
23055 }
23056
23057 return true;
23058}
23059
23060// Helper function for comparing all bits of two vectors.
23061 static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
23062 ISD::CondCode CC, const APInt &OriginalMask,
23063 const X86Subtarget &Subtarget,
23064 SelectionDAG &DAG, X86::CondCode &X86CC) {
23065 EVT VT = LHS.getValueType();
23066 unsigned ScalarSize = VT.getScalarSizeInBits();
23067 if (OriginalMask.getBitWidth() != ScalarSize) {
23068 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
23069 return SDValue();
23070 }
23071
23072 // Quit if not convertible to legal scalar or 128/256-bit vector.
23073 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23074 return SDValue();
23075
23076 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
23077 if (VT.isFloatingPoint())
23078 return SDValue();
23079
23080 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23081 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
23082
23083 APInt Mask = OriginalMask;
23084
23085 auto MaskBits = [&](SDValue Src) {
23086 if (Mask.isAllOnes())
23087 return Src;
23088 EVT SrcVT = Src.getValueType();
23089 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
23090 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
23091 };
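// MaskBits() applies the tracked element mask before any comparison, so bits
// masked off by the original reduction cannot influence the equality result.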
23092
23093 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
23094 if (VT.getSizeInBits() < 128) {
23095 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
23096 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
23097 if (IntVT != MVT::i64)
23098 return SDValue();
23099 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
23100 MVT::i32, MVT::i32);
23101 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
23102 MVT::i32, MVT::i32);
23103 SDValue Lo =
23104 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
23105 SDValue Hi =
23106 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
23107 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23108 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
23109 DAG.getConstant(0, DL, MVT::i32));
23110 }
23111 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23112 DAG.getBitcast(IntVT, MaskBits(LHS)),
23113 DAG.getBitcast(IntVT, MaskBits(RHS)));
23114 }
23115
23116 // Without PTEST, a masked v2i64 or-reduction is not faster than
23117 // scalarization.
23118 bool UseKORTEST = Subtarget.useAVX512Regs();
23119 bool UsePTEST = Subtarget.hasSSE41();
23120 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
23121 return SDValue();
23122
23123 // Split down to 128/256/512-bit vector.
23124 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
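// e.g. on an AVX-only target a 512-bit input is OR/AND-split in half below
// until it fits a single 256-bit test.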
23125
23126 // If the input vector has vector elements wider than the target test size,
23127 // then cast to <X x i64> so it will safely split.
23128 if (ScalarSize > TestSize) {
23129 if (!Mask.isAllOnes())
23130 return SDValue();
23131 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
23132 LHS = DAG.getBitcast(VT, LHS);
23133 RHS = DAG.getBitcast(VT, RHS);
23134 Mask = APInt::getAllOnes(64);
23135 }
23136
23137 if (VT.getSizeInBits() > TestSize) {
23138 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
23139 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
23140 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
23141 while (VT.getSizeInBits() > TestSize) {
23142 auto Split = DAG.SplitVector(LHS, DL);
23143 VT = Split.first.getValueType();
23144 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23145 }
23146 RHS = DAG.getAllOnesConstant(DL, VT);
23147 } else if (!UsePTEST && !KnownRHS.isZero()) {
23148 // MOVMSK Special Case:
23149 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
23150 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
23151 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
23152 LHS = DAG.getBitcast(VT, MaskBits(LHS));
23153 RHS = DAG.getBitcast(VT, MaskBits(RHS));
23154 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
23155 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
23156 V = DAG.getSExtOrTrunc(V, DL, VT);
23157 while (VT.getSizeInBits() > TestSize) {
23158 auto Split = DAG.SplitVector(V, DL);
23159 VT = Split.first.getValueType();
23160 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23161 }
23162 V = DAG.getNOT(DL, V, VT);
23163 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23164 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23165 DAG.getConstant(0, DL, MVT::i32));
23166 } else {
23167 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
23168 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
23169 while (VT.getSizeInBits() > TestSize) {
23170 auto Split = DAG.SplitVector(V, DL);
23171 VT = Split.first.getValueType();
23172 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
23173 }
23174 LHS = V;
23175 RHS = DAG.getConstant(0, DL, VT);
23176 }
23177 }
23178
23179 if (UseKORTEST && VT.is512BitVector()) {
23180 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
23181 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
23182 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23183 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23184 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
23185 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
23186 }
23187
23188 if (UsePTEST) {
23189 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
23190 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23191 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23192 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
23193 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
23194 }
23195
23196 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
23197 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
23198 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
23199 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
23200 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
23201 V = DAG.getNOT(DL, V, MaskVT);
23202 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23203 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23204 DAG.getConstant(0, DL, MVT::i32));
23205}
23206
23207// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fallback
23208// to CMP(MOVMSK(PCMPEQB(X,Y))).
23209 static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS,
23210 ISD::CondCode CC, const SDLoc &DL,
23211 const X86Subtarget &Subtarget,
23212 SelectionDAG &DAG,
23213 X86::CondCode &X86CC) {
23214 SDValue Op = OrigLHS;
23215
23216 bool CmpNull;
23217 APInt Mask;
23218 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23219 CmpNull = isNullConstant(OrigRHS);
23220 if (!CmpNull && !isAllOnesConstant(OrigRHS))
23221 return SDValue();
23222
23223 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
23224 return SDValue();
23225
23226 // Check whether we're masking/truncating an OR-reduction result, in which
23227 // case track the masked bits.
23228 // TODO: Add CmpAllOnes support.
23229 Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
23230 if (CmpNull) {
23231 switch (Op.getOpcode()) {
23232 case ISD::TRUNCATE: {
23233 SDValue Src = Op.getOperand(0);
23234 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
23235 Op.getScalarValueSizeInBits());
23236 Op = Src;
23237 break;
23238 }
23239 case ISD::AND: {
23240 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
23241 Mask = Cst->getAPIntValue();
23242 Op = Op.getOperand(0);
23243 }
23244 break;
23245 }
23246 }
23247 }
23248 } else if (CC == ISD::SETGT && isAllOnesConstant(OrigRHS)) {
23249 CC = ISD::SETEQ;
23250 CmpNull = true;
23251 Mask = APInt::getSignMask(Op.getScalarValueSizeInBits());
23252 } else {
23253 return SDValue();
23254 }
23255
23256 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
23257
23258 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
23259 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
23260 SmallVector<SDValue, 8> VecIns;
23261 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
23262 EVT VT = VecIns[0].getValueType();
23263 assert(llvm::all_of(VecIns,
23264 [VT](SDValue V) { return VT == V.getValueType(); }) &&
23265 "Reduction source vector mismatch");
23266
23267 // Quit if not splittable to scalar/128/256/512-bit vector.
23268 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23269 return SDValue();
23270
23271 // If more than one full vector is evaluated, AND/OR them first before
23272 // PTEST.
23273 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
23274 Slot += 2, e += 1) {
23275 // Each iteration will AND/OR 2 nodes and append the result until there is
23276 // only 1 node left, i.e. the final value of all vectors.
23277 SDValue LHS = VecIns[Slot];
23278 SDValue RHS = VecIns[Slot + 1];
23279 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
23280 }
23281
23282 return LowerVectorAllEqual(DL, VecIns.back(),
23283 CmpNull ? DAG.getConstant(0, DL, VT)
23284 : DAG.getAllOnesConstant(DL, VT),
23285 CC, Mask, Subtarget, DAG, X86CC);
23286 }
23287
23288 // Match icmp(reduce_or(X),0) anyof reduction patterns.
23289 // Match icmp(reduce_and(X),-1) allof reduction patterns.
23290 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23291 ISD::NodeType BinOp;
23292 if (SDValue Match =
23293 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
23294 EVT MatchVT = Match.getValueType();
23295 return LowerVectorAllEqual(DL, Match,
23296 CmpNull ? DAG.getConstant(0, DL, MatchVT)
23297 : DAG.getAllOnesConstant(DL, MatchVT),
23298 CC, Mask, Subtarget, DAG, X86CC);
23299 }
23300 }
23301
23302 if (Mask.isAllOnes()) {
23303 assert(!Op.getValueType().isVector() &&
23304 "Illegal vector type for reduction pattern");
23305 SDValue Src = peekThroughBitcasts(Op);
23306 if (Src.getValueType().isFixedLengthVector() &&
23307 Src.getValueType().getScalarType() == MVT::i1) {
23308 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
23309 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
23310 if (Src.getOpcode() == ISD::SETCC) {
23311 SDValue LHS = Src.getOperand(0);
23312 SDValue RHS = Src.getOperand(1);
23313 EVT LHSVT = LHS.getValueType();
23314 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
23315 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
23316 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
23317 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
23318 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
23319 X86CC);
23320 }
23321 }
23322 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
23323 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
23324 // Peek through truncation, mask the LSB and compare against zero/LSB.
23325 if (Src.getOpcode() == ISD::TRUNCATE) {
23326 SDValue Inner = Src.getOperand(0);
23327 EVT InnerVT = Inner.getValueType();
23328 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
23329 unsigned BW = InnerVT.getScalarSizeInBits();
23330 APInt SrcMask = APInt(BW, 1);
23331 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
23332 return LowerVectorAllEqual(DL, Inner,
23333 DAG.getConstant(Cmp, DL, InnerVT), CC,
23334 SrcMask, Subtarget, DAG, X86CC);
23335 }
23336 }
23337 }
23338 }
23339
23340 return SDValue();
23341}
23342
23343/// return true if \c Op has a use that doesn't just read flags.
23344 static bool hasNonFlagsUse(SDValue Op) {
23345 for (SDUse &Use : Op->uses()) {
23346 SDNode *User = Use.getUser();
23347 unsigned UOpNo = Use.getOperandNo();
23348 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
23349 // Look past truncate.
23350 UOpNo = User->use_begin()->getOperandNo();
23351 User = User->use_begin()->getUser();
23352 }
23353
23354 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
23355 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
23356 return true;
23357 }
23358 return false;
23359}
23360
23361// Transform to an x86-specific ALU node with flags if there is a chance of
23362// using an RMW op or only the flags are used. Otherwise, leave
23363// the node alone and emit a 'cmp' or 'test' instruction.
23364 static bool isProfitableToUseFlagOp(SDValue Op) {
23365 for (SDNode *U : Op->users())
23366 if (U->getOpcode() != ISD::CopyToReg &&
23367 U->getOpcode() != ISD::SETCC &&
23368 U->getOpcode() != ISD::STORE)
23369 return false;
23370
23371 return true;
23372}
23373
23374/// Emit nodes that will be selected as "test Op0,Op0", or something
23375/// equivalent.
23376 static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
23377 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
23378 // CF and OF aren't always set the way we want. Determine which
23379 // of these we need.
23380 bool NeedCF = false;
23381 bool NeedOF = false;
23382 switch (X86CC) {
23383 default: break;
23384 case X86::COND_A: case X86::COND_AE:
23385 case X86::COND_B: case X86::COND_BE:
23386 NeedCF = true;
23387 break;
23388 case X86::COND_G: case X86::COND_GE:
23389 case X86::COND_L: case X86::COND_LE:
23390 case X86::COND_O: case X86::COND_NO: {
23391 // Check if we really need to set the
23392 // Overflow flag. If NoSignedWrap is present
23393 // that is not actually needed.
23394 switch (Op->getOpcode()) {
23395 case ISD::ADD:
23396 case ISD::SUB:
23397 case ISD::MUL:
23398 case ISD::SHL:
23399 if (Op.getNode()->getFlags().hasNoSignedWrap())
23400 break;
23401 [[fallthrough]];
23402 default:
23403 NeedOF = true;
23404 break;
23405 }
23406 break;
23407 }
23408 }
23409 // See if we can use the EFLAGS value from the operand instead of
23410 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23411 // we prove that the arithmetic won't overflow, we can't use OF or CF.
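// e.g. for COND_B the caller expects TEST-like flags (CF == 0); reusing the CF
// produced by an ADD would expose the add's real carry, so emit a compare instead.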
23412 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23413 // Emit a CMP with 0, which is the TEST pattern.
23414 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23415 DAG.getConstant(0, dl, Op.getValueType()));
23416 }
23417 unsigned Opcode = 0;
23418 unsigned NumOperands = 0;
23419
23420 SDValue ArithOp = Op;
23421
23422 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
23423 // which may be the result of a CAST. We use the variable 'Op', which is the
23424 // non-casted variable when we check for possible users.
23425 switch (ArithOp.getOpcode()) {
23426 case ISD::AND:
23427 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23428 // because a TEST instruction will be better.
23429 if (!hasNonFlagsUse(Op))
23430 break;
23431
23432 [[fallthrough]];
23433 case ISD::ADD:
23434 case ISD::SUB:
23435 case ISD::OR:
23436 case ISD::XOR:
23437 if (!isProfitableToUseFlagOp(Op))
23438 break;
23439
23440 // Otherwise use a regular EFLAGS-setting instruction.
23441 switch (ArithOp.getOpcode()) {
23442 // clang-format off
23443 default: llvm_unreachable("unexpected operator!");
23444 case ISD::ADD: Opcode = X86ISD::ADD; break;
23445 case ISD::SUB: Opcode = X86ISD::SUB; break;
23446 case ISD::XOR: Opcode = X86ISD::XOR; break;
23447 case ISD::AND: Opcode = X86ISD::AND; break;
23448 case ISD::OR: Opcode = X86ISD::OR; break;
23449 // clang-format on
23450 }
23451
23452 NumOperands = 2;
23453 break;
23454 case X86ISD::ADD:
23455 case X86ISD::SUB:
23456 case X86ISD::OR:
23457 case X86ISD::XOR:
23458 case X86ISD::AND:
23459 return SDValue(Op.getNode(), 1);
23460 case ISD::SSUBO:
23461 case ISD::USUBO: {
23462 // /USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag.
23463 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23464 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23465 Op->getOperand(1)).getValue(1);
23466 }
23467 default:
23468 break;
23469 }
23470
23471 if (Opcode == 0) {
23472 // Emit a CMP with 0, which is the TEST pattern.
23473 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23474 DAG.getConstant(0, dl, Op.getValueType()));
23475 }
23476 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23477 SmallVector<SDValue, 4> Ops(Op->ops().take_front(NumOperands));
23478
23479 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23480 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23481 return SDValue(New.getNode(), 1);
23482}
23483
23484/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23485/// equivalent.
23486 static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23487 const SDLoc &dl, SelectionDAG &DAG,
23488 const X86Subtarget &Subtarget) {
23489 if (isNullConstant(Op1))
23490 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23491
23492 EVT CmpVT = Op0.getValueType();
23493
23494 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23495 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23496
23497 // Only promote the compare up to I32 if it is a 16 bit operation
23498 // with an immediate. 16 bit immediates are to be avoided unless the target
23499 // isn't slowed down by length changing prefixes, we're optimizing for
23500 // codesize or the comparison is with a folded load.
23501 if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
23502 !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
23503 !DAG.getMachineFunction().getFunction().hasMinSize()) {
23504 auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
23505 auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
23506 // Don't do this if the immediate can fit in 8-bits.
23507 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23508 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23509 unsigned ExtendOp =
23510 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23511 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23512 // For equality comparisons try to use SIGN_EXTEND if the input was
23513 // truncate from something with enough sign bits.
23514 if (Op0.getOpcode() == ISD::TRUNCATE) {
23515 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23516 ExtendOp = ISD::SIGN_EXTEND;
23517 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23518 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23519 ExtendOp = ISD::SIGN_EXTEND;
23520 }
23521 }
23522
23523 CmpVT = MVT::i32;
23524 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23525 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23526 }
23527 }
23528
23529 // Try to shrink i64 compares if the input has enough zero bits.
23530 if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
23531 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23532 DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
23533 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23534 CmpVT = MVT::i32;
23535 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23536 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23537 }
23538
23539 // Try to shrink all i64 compares if the inputs are representable as signed
23540 // i32.
23541 if (CmpVT == MVT::i64 &&
23542 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23543 DAG.ComputeNumSignBits(Op1) > 32 && DAG.ComputeNumSignBits(Op0) > 32) {
23544 CmpVT = MVT::i32;
23545 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23546 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23547 }
23548
23549 // 0-x == y --> x+y == 0
23550 // 0-x != y --> x+y != 0
23551 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23552 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23553 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23554 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23555 return Add.getValue(1);
23556 }
23557
23558 // x == 0-y --> x+y == 0
23559 // x != 0-y --> x+y != 0
23560 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23561 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23562 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23563 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23564 return Add.getValue(1);
23565 }
23566
23567 // If we already have an XOR of the ops, use that to check for equality.
23568 // Else use SUB instead of CMP to enable CSE between SUB and CMP.
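// e.g. if x ^ y already exists, an equality test reuses that XOR's flags;
// otherwise emit SUB so a later x - y can CSE with the compare.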
23569 unsigned X86Opc = X86ISD::SUB;
23570 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
23571 (DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op0, Op1}) ||
23572 DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op1, Op0})))
23573 X86Opc = X86ISD::XOR;
23574
23575 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23576 SDValue CmpOp = DAG.getNode(X86Opc, dl, VTs, Op0, Op1);
23577 return CmpOp.getValue(1);
23578}
23579
23584
23585bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
23586 SDNode *N, SDValue, SDValue IntPow2) const {
23587 if (N->getOpcode() == ISD::FDIV)
23588 return true;
23589
23590 EVT FPVT = N->getValueType(0);
23591 EVT IntVT = IntPow2.getValueType();
23592
23593 // This indicates a non-free bitcast.
23594 // TODO: This is probably overly conservative as we will need to scale the
23595 // integer vector anyways for the int->fp cast.
23596 if (FPVT.isVector() &&
23597 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
23598 return false;
23599
23600 return true;
23601}
23602
23603/// Check if replacement of SQRT with RSQRT should be disabled.
23604bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23605 EVT VT = Op.getValueType();
23606
23607 // We don't need to replace SQRT with RSQRT for half type.
23608 if (VT.getScalarType() == MVT::f16)
23609 return true;
23610
23611 // We never want to use both SQRT and RSQRT instructions for the same input.
23612 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23613 return false;
23614
23615 if (VT.isVector())
23616 return Subtarget.hasFastVectorFSQRT();
23617 return Subtarget.hasFastScalarFSQRT();
23618}
23619
23620/// The minimum architected relative accuracy is 2^-12. We need one
23621/// Newton-Raphson step to have a good float result (24 bits of precision).
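/// One rsqrt Newton-Raphson step refines X0 to X1 = X0 * (1.5 - 0.5 * A * X0 * X0).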
23622SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23623 SelectionDAG &DAG, int Enabled,
23624 int &RefinementSteps,
23625 bool &UseOneConstNR,
23626 bool Reciprocal) const {
23627 SDLoc DL(Op);
23628 EVT VT = Op.getValueType();
23629
23630 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23631 // It is likely not profitable to do this for f64 because a double-precision
23632 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23633 // instructions: convert to single, rsqrtss, convert back to double, refine
23634 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23635 // along with FMA, this could be a throughput win.
23636 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23637 // after legalize types.
23638 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23639 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23640 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23641 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23642 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23643 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23644 RefinementSteps = 1;
23645
23646 UseOneConstNR = false;
23647 // There is no 512-bit FRSQRT, but there is RSQRT14.
23648 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23649 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
23650 if (RefinementSteps == 0 && !Reciprocal)
23651 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23652 return Estimate;
23653 }
23654
23655 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23656 Subtarget.hasFP16()) {
23657 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23658 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23659 RefinementSteps = 0;
23660
23661 if (VT == MVT::f16) {
23662 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23663 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23664 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23665 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23666 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23667 }
23668
23669 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23670 }
23671 return SDValue();
23672}
23673
23674/// The minimum architected relative accuracy is 2^-12. We need one
23675/// Newton-Raphson step to have a good float result (24 bits of precision).
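/// One reciprocal Newton-Raphson step refines X0 to X1 = X0 * (2.0 - A * X0).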
23676SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23677 int Enabled,
23678 int &RefinementSteps) const {
23679 SDLoc DL(Op);
23680 EVT VT = Op.getValueType();
23681
23682 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23683 // It is likely not profitable to do this for f64 because a double-precision
23684 // reciprocal estimate with refinement on x86 prior to FMA requires
23685 // 15 instructions: convert to single, rcpss, convert back to double, refine
23686 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23687 // along with FMA, this could be a throughput win.
23688
23689 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23690 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23691 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23692 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23693 // Enable estimate codegen with 1 refinement step for vector division.
23694 // Scalar division estimates are disabled because they break too much
23695 // real-world code. These defaults are intended to match GCC behavior.
23696 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23697 return SDValue();
23698
23699 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23700 RefinementSteps = 1;
23701
23702 // There is no 512-bit FRCP, but there is RCP14.
23703 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23704 return DAG.getNode(Opcode, DL, VT, Op);
23705 }
23706
23707 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23708 Subtarget.hasFP16()) {
23709 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23710 RefinementSteps = 0;
23711
23712 if (VT == MVT::f16) {
23713 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23714 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23715 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23716 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23717 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23718 }
23719
23720 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23721 }
23722 return SDValue();
23723}
23724
23725/// If we have at least two divisions that use the same divisor, convert to
23726/// multiplication by a reciprocal. This may need to be adjusted for a given
23727/// CPU if a division's cost is not at least twice the cost of a multiplication.
23728/// This is because we still need one division to calculate the reciprocal and
23729/// then we need two multiplies by that reciprocal as replacements for the
23730/// original divisions.
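/// e.g. x/d and y/d become r = 1.0/d; x*r; y*r once at least two divisions
/// share the divisor d.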
23731 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23732 return 2;
23733}
23734
23735SDValue
23736X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23737 SelectionDAG &DAG,
23738 SmallVectorImpl<SDNode *> &Created) const {
23739 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23740 if (isIntDivCheap(N->getValueType(0), Attr))
23741 return SDValue(N,0); // Lower SDIV as SDIV
23742
23743 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23744 "Unexpected divisor!");
23745
23746 // Only perform this transform if CMOV is supported otherwise the select
23747 // below will become a branch.
23748 if (!Subtarget.canUseCMOV())
23749 return SDValue();
23750
23751 // fold (sdiv X, pow2)
23752 EVT VT = N->getValueType(0);
23753 // FIXME: Support i8.
23754 if (VT != MVT::i16 && VT != MVT::i32 &&
23755 !(Subtarget.is64Bit() && VT == MVT::i64))
23756 return SDValue();
23757
23758 // If the divisor is 2 or -2, the default expansion is better.
23759 if (Divisor == 2 ||
23760 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
23761 return SDValue();
23762
23763 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
23764}
23765
23766/// Result of 'and' is compared against zero. Change to a BT node if possible.
23767/// Returns the BT node and the condition code needed to use it.
23768 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
23769 SelectionDAG &DAG, X86::CondCode &X86CC) {
23770 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23771 SDValue Op0 = And.getOperand(0);
23772 SDValue Op1 = And.getOperand(1);
23773 if (Op0.getOpcode() == ISD::TRUNCATE)
23774 Op0 = Op0.getOperand(0);
23775 if (Op1.getOpcode() == ISD::TRUNCATE)
23776 Op1 = Op1.getOperand(0);
23777
23778 SDValue Src, BitNo;
23779 if (Op1.getOpcode() == ISD::SHL)
23780 std::swap(Op0, Op1);
23781 if (Op0.getOpcode() == ISD::SHL) {
23782 if (isOneConstant(Op0.getOperand(0))) {
23783 // If we looked past a truncate, check that it's only truncating away
23784 // known zeros.
23785 unsigned BitWidth = Op0.getValueSizeInBits();
23786 unsigned AndBitWidth = And.getValueSizeInBits();
23787 if (BitWidth > AndBitWidth) {
23788 KnownBits Known = DAG.computeKnownBits(Op0);
23789 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23790 return SDValue();
23791 }
23792 Src = Op1;
23793 BitNo = Op0.getOperand(1);
23794 }
23795 } else if (Op1.getOpcode() == ISD::Constant) {
23796 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23797 uint64_t AndRHSVal = AndRHS->getZExtValue();
23798 SDValue AndLHS = Op0;
23799
23800 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23801 Src = AndLHS.getOperand(0);
23802 BitNo = AndLHS.getOperand(1);
23803 } else {
23804 // Use BT if the immediate can't be encoded in a TEST instruction or we
23805 // are optimizing for size and the immediate won't fit in a byte.
23806 bool OptForSize = DAG.shouldOptForSize();
23807 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23808 isPowerOf2_64(AndRHSVal)) {
23809 Src = AndLHS;
23810 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23811 Src.getValueType());
23812 }
23813 }
23814 }
23815
23816 // No patterns found, give up.
23817 if (!Src.getNode())
23818 return SDValue();
23819
23820 // Remove any bit flip.
23821 if (isBitwiseNot(Src)) {
23822 Src = Src.getOperand(0);
23823 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23824 }
23825
23826 // Attempt to create the X86ISD::BT node.
23827 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23828 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23829 return BT;
23830 }
23831
23832 return SDValue();
23833}
23834
23835// Check if pre-AVX condcode can be performed by a single FCMP op.
23836static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23837 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23838}
23839
23840/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23841/// CMPs.
23842static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23843 SDValue &Op1, bool &IsAlwaysSignaling) {
23844 unsigned SSECC;
23845 bool Swap = false;
23846
23847 // SSE Condition code mapping:
23848 // 0 - EQ
23849 // 1 - LT
23850 // 2 - LE
23851 // 3 - UNORD
23852 // 4 - NEQ
23853 // 5 - NLT
23854 // 6 - NLE
23855 // 7 - ORD
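// Encodings 8 (EQ_UQ) and 12 (NEQ_OQ) used below need the extended AVX
// immediate range; pre-AVX lowering avoids them via cheapX86FSETCC_SSE().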
23856 switch (SetCCOpcode) {
23857 // clang-format off
23858 default: llvm_unreachable("Unexpected SETCC condition");
23859 case ISD::SETOEQ:
23860 case ISD::SETEQ: SSECC = 0; break;
23861 case ISD::SETOGT:
23862 case ISD::SETGT: Swap = true; [[fallthrough]];
23863 case ISD::SETLT:
23864 case ISD::SETOLT: SSECC = 1; break;
23865 case ISD::SETOGE:
23866 case ISD::SETGE: Swap = true; [[fallthrough]];
23867 case ISD::SETLE:
23868 case ISD::SETOLE: SSECC = 2; break;
23869 case ISD::SETUO: SSECC = 3; break;
23870 case ISD::SETUNE:
23871 case ISD::SETNE: SSECC = 4; break;
23872 case ISD::SETULE: Swap = true; [[fallthrough]];
23873 case ISD::SETUGE: SSECC = 5; break;
23874 case ISD::SETULT: Swap = true; [[fallthrough]];
23875 case ISD::SETUGT: SSECC = 6; break;
23876 case ISD::SETO: SSECC = 7; break;
23877 case ISD::SETUEQ: SSECC = 8; break;
23878 case ISD::SETONE: SSECC = 12; break;
23879 // clang-format on
23880 }
23881 if (Swap)
23882 std::swap(Op0, Op1);
23883
23884 switch (SetCCOpcode) {
23885 default:
23886 IsAlwaysSignaling = true;
23887 break;
23888 case ISD::SETEQ:
23889 case ISD::SETOEQ:
23890 case ISD::SETUEQ:
23891 case ISD::SETNE:
23892 case ISD::SETONE:
23893 case ISD::SETUNE:
23894 case ISD::SETO:
23895 case ISD::SETUO:
23896 IsAlwaysSignaling = false;
23897 break;
23898 }
23899
23900 return SSECC;
23901}
23902
23903/// Break a VSETCC 256/512-bit vector into two new 128/256 ones and then
23904/// concatenate the result back.
23905 static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond,
23906 SelectionDAG &DAG, const SDLoc &dl) {
23907 assert(VT.isInteger() && LHS.getValueType() == RHS.getValueType() &&
23908 "Unsupported VTs!");
23909 SDValue CC = DAG.getCondCode(Cond);
23910
23911 // Extract the LHS Lo/Hi vectors
23912 SDValue LHS1, LHS2;
23913 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23914
23915 // Extract the RHS Lo/Hi vectors
23916 SDValue RHS1, RHS2;
23917 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23918
23919 // Issue the operation on the smaller types and concatenate the result back
23920 EVT LoVT, HiVT;
23921 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23922 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23923 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23924 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23925}
23926
23927 static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl,
23928 SelectionDAG &DAG) {
23929 SDValue Op0 = Op.getOperand(0);
23930 SDValue Op1 = Op.getOperand(1);
23931 SDValue CC = Op.getOperand(2);
23932 MVT VT = Op.getSimpleValueType();
23933 assert(VT.getVectorElementType() == MVT::i1 &&
23934 "Cannot set masked compare for this operation");
23935
23936 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23937
23938 // Prefer SETGT over SETLT.
23939 if (SetCCOpcode == ISD::SETLT) {
23940 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23941 std::swap(Op0, Op1);
23942 }
23943
23944 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23945}
23946
23947/// Given a buildvector constant, return a new vector constant with each element
23948/// incremented or decremented. If incrementing or decrementing would result in
23949/// unsigned overflow or underflow or this is not a simple vector constant,
23950/// return an empty value.
23951 static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23952 bool NSW) {
23953 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23954 if (!BV || !V.getValueType().isSimple())
23955 return SDValue();
23956
23957 MVT VT = V.getSimpleValueType();
23958 MVT EltVT = VT.getVectorElementType();
23959 unsigned NumElts = VT.getVectorNumElements();
23960 SmallVector<SDValue, 8> NewVecC;
23961 SDLoc DL(V);
23962 for (unsigned i = 0; i < NumElts; ++i) {
23963 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23964 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23965 return SDValue();
23966
23967 // Avoid overflow/underflow.
23968 const APInt &EltC = Elt->getAPIntValue();
23969 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23970 return SDValue();
23971 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23972 (!IsInc && EltC.isMinSignedValue())))
23973 return SDValue();
23974
23975 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23976 }
23977
23978 return DAG.getBuildVector(VT, DL, NewVecC);
23979}
23980
23981/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23982/// Op0 u<= Op1:
23983/// t = psubus Op0, Op1
23984/// pcmpeq t, <0..0>
23985 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23986 ISD::CondCode Cond, const SDLoc &dl,
23987 const X86Subtarget &Subtarget,
23988 SelectionDAG &DAG) {
23989 if (!Subtarget.hasSSE2())
23990 return SDValue();
23991
23992 MVT VET = VT.getVectorElementType();
23993 if (VET != MVT::i8 && VET != MVT::i16)
23994 return SDValue();
23995
23996 switch (Cond) {
23997 default:
23998 return SDValue();
23999 case ISD::SETULT: {
24000 // If the comparison is against a constant we can turn this into a
24001 // setule. With psubus, setule does not require a swap. This is
24002 // beneficial because the constant in the register is no longer
24003 // clobbered as the destination, so it can be hoisted out of a loop.
24004 // Only do this pre-AVX since vpcmp* is no longer destructive.
24005 if (Subtarget.hasAVX())
24006 return SDValue();
24007 SDValue ULEOp1 =
24008 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
24009 if (!ULEOp1)
24010 return SDValue();
24011 Op1 = ULEOp1;
24012 break;
24013 }
24014 case ISD::SETUGT: {
24015 // If the comparison is against a constant, we can turn this into a setuge.
24016 // This is beneficial because materializing a constant 0 for the PCMPEQ is
24017 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
24018 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
24019 SDValue UGEOp1 =
24020 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
24021 if (!UGEOp1)
24022 return SDValue();
24023 Op1 = Op0;
24024 Op0 = UGEOp1;
24025 break;
24026 }
24027 // Psubus is better than flip-sign because it requires no inversion.
24028 case ISD::SETUGE:
24029 std::swap(Op0, Op1);
24030 break;
24031 case ISD::SETULE:
24032 break;
24033 }
24034
24035 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
24036 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
24037 DAG.getConstant(0, dl, VT));
24038}
24039
24040static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
24041 SelectionDAG &DAG) {
24042 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24043 Op.getOpcode() == ISD::STRICT_FSETCCS;
24044 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24045 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24046 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
24047 MVT VT = Op->getSimpleValueType(0);
24048 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
24049 MVT OpVT = Op0.getSimpleValueType();
24050 SDLoc dl(Op);
24051
24052 if (OpVT.isFloatingPoint()) {
24053 MVT EltVT = OpVT.getVectorElementType();
24054 assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
24055 EltVT == MVT::f64);
24056
24057 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24058 if (isSoftF16(EltVT, Subtarget)) {
24059 if (Subtarget.hasAVX512() && !Subtarget.hasVLX())
24060 return SDValue();
24061
24062 // Break 256-bit FP vector compare into smaller ones.
24063 if (OpVT.is256BitVector() && !Subtarget.useAVX512Regs())
24064 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24065
24066 // Break 512-bit FP vector compare into smaller ones.
24067 if (OpVT.is512BitVector())
24068 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24069
24070 MVT NVT = OpVT.changeVectorElementType(MVT::f32);
24071 if (IsStrict) {
24072 Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24073 {Chain, Op0});
24074 Op1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24075 {Chain, Op1});
24076 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
24077 {Chain, Op0, Op1, CC});
24078 }
24079 MVT DVT = VT.getVectorElementType() == MVT::i16
24080 ? VT.changeVectorElementType(MVT::i32)
24081 : VT;
24082 SDValue Cmp = DAG.getNode(Op.getOpcode(), dl, DVT,
24083 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op0),
24084 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op1), CC);
24085 return DVT == VT ? Cmp : DAG.getNode(ISD::TRUNCATE, dl, VT, Cmp);
24086 }
24087
24088 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24089
24090 // If we have a strict compare with a vXi1 result and the input is 128/256
24091 // bits we can't use a masked compare unless we have VLX. If we use a wider
24092 // compare like we do for non-strict, we might trigger spurious exceptions
24093 // from the upper elements. Instead emit a AVX compare and convert to mask.
24094 unsigned Opc;
24095 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
24096 (!IsStrict || Subtarget.hasVLX() ||
24097 Op0.getSimpleValueType().is512BitVector())) {
24098#ifndef NDEBUG
24099 unsigned Num = VT.getVectorNumElements();
24100 assert(Num <= 16 ||
24101 (Num == 32 && (EltVT == MVT::f16 || EltVT == MVT::bf16)));
24102#endif
24103 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
24104 } else {
24105 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
24106 // The SSE/AVX packed FP comparison nodes are defined with a
24107 // floating-point vector result that matches the operand type. This allows
24108 // them to work with an SSE1 target (integer vector types are not legal).
24109 VT = Op0.getSimpleValueType();
24110 }
24111
24112 SDValue Cmp;
24113 bool IsAlwaysSignaling;
24114 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
24115 if (!Subtarget.hasAVX()) {
24116 // TODO: We could use following steps to handle a quiet compare with
24117 // signaling encodings.
24118 // 1. Get ordered masks from a quiet ISD::SETO
24119 // 2. Use the masks to mask potential unordered elements in operand A, B
24120 // 3. Get the compare results of masked A, B
24121 // 4. Calculating final result using the mask and result from 3
24122 // But currently, we just fall back to scalar operations.
24123 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
24124 return SDValue();
24125
24126 // Insert an extra signaling instruction to raise exception.
24127 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
24128 SDValue SignalCmp = DAG.getNode(
24129 Opc, dl, {VT, MVT::Other},
24130 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
24131 // FIXME: It seems we need to update the flags of all new strict nodes.
24132 // Otherwise, mayRaiseFPException in MI will return false due to
24133 // NoFPExcept = false by default. However, I didn't find it in other
24134 // patches.
24135 SignalCmp->setFlags(Op->getFlags());
24136 Chain = SignalCmp.getValue(1);
24137 }
24138
24139 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
24140 // emit two comparisons and a logic op to tie them together.
24141 if (!cheapX86FSETCC_SSE(Cond)) {
24142 // LLVM predicate is SETUEQ or SETONE.
24143 unsigned CC0, CC1;
24144 unsigned CombineOpc;
24145 if (Cond == ISD::SETUEQ) {
24146 CC0 = 3; // UNORD
24147 CC1 = 0; // EQ
24148 CombineOpc = X86ISD::FOR;
24149 } else {
24150 assert(Cond == ISD::SETONE);
24151 CC0 = 7; // ORD
24152 CC1 = 4; // NEQ
24153 CombineOpc = X86ISD::FAND;
24154 }
24155
24156 SDValue Cmp0, Cmp1;
24157 if (IsStrict) {
24158 Cmp0 = DAG.getNode(
24159 Opc, dl, {VT, MVT::Other},
24160 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
24161 Cmp1 = DAG.getNode(
24162 Opc, dl, {VT, MVT::Other},
24163 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
24164 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
24165 Cmp1.getValue(1));
24166 } else {
24167 Cmp0 = DAG.getNode(
24168 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
24169 Cmp1 = DAG.getNode(
24170 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
24171 }
24172 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
24173 } else {
24174 if (IsStrict) {
24175 Cmp = DAG.getNode(
24176 Opc, dl, {VT, MVT::Other},
24177 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24178 Chain = Cmp.getValue(1);
24179 } else
24180 Cmp = DAG.getNode(
24181 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24182 }
24183 } else {
24184 // Handle all other FP comparisons here.
24185 if (IsStrict) {
24186 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
24187 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
24188 Cmp = DAG.getNode(
24189 Opc, dl, {VT, MVT::Other},
24190 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24191 Chain = Cmp.getValue(1);
24192 } else
24193 Cmp = DAG.getNode(
24194 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24195 }
24196
24197 if (VT.getFixedSizeInBits() >
24198 Op.getSimpleValueType().getFixedSizeInBits()) {
24199 // We emitted a compare with an XMM/YMM result. Finish converting to a
24200 // mask register using a vptestm.
24201 MVT CastVT = VT.changeVectorElementTypeToInteger();
24202 Cmp = DAG.getBitcast(CastVT, Cmp);
24203 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24204 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24205 } else {
24206 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24207 // the result type of SETCC. The bitcast is expected to be optimized
24208 // away during combining/isel.
24209 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24210 }
24211
24212 if (IsStrict)
24213 return DAG.getMergeValues({Cmp, Chain}, dl);
24214
24215 return Cmp;
24216 }
24217
24218 assert(!IsStrict && "Strict SETCC only handles FP operands.");
24219
24220 [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
24221 assert(VTOp0 == Op1.getSimpleValueType() &&
24222 "Expected operands with same type!");
24224 "Invalid number of packed elements for source and destination!");
24225
24226 // The non-AVX512 code below works under the assumption that source and
24227 // destination types are the same.
24228 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24229 "Value types for source and destination must be the same!");
24230
24231 // The result is boolean, but operands are int/float
24232 if (VT.getVectorElementType() == MVT::i1) {
24233 // In the AVX-512 architecture setcc returns a mask with i1 elements,
24234 // but there is no compare instruction for i8 and i16 elements in KNL.
24235 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24236 "Unexpected operand type");
24237 return LowerIntVSETCC_AVX512(Op, dl, DAG);
24238 }
24239
24240 // Lower using XOP integer comparisons.
24241 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24242 // Translate compare code to XOP PCOM compare mode.
24243 unsigned CmpMode = 0;
24244 switch (Cond) {
24245 // clang-format off
24246 default: llvm_unreachable("Unexpected SETCC condition");
24247 case ISD::SETULT:
24248 case ISD::SETLT: CmpMode = 0x00; break;
24249 case ISD::SETULE:
24250 case ISD::SETLE: CmpMode = 0x01; break;
24251 case ISD::SETUGT:
24252 case ISD::SETGT: CmpMode = 0x02; break;
24253 case ISD::SETUGE:
24254 case ISD::SETGE: CmpMode = 0x03; break;
24255 case ISD::SETEQ: CmpMode = 0x04; break;
24256 case ISD::SETNE: CmpMode = 0x05; break;
24257 // clang-format on
24258 }
24259
24260 // Are we comparing unsigned or signed integers?
24261 unsigned Opc =
24262 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
24263
24264 return DAG.getNode(Opc, dl, VT, Op0, Op1,
24265 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
24266 }
24267
24268 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
24269 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
24270 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
24271 SDValue BC0 = peekThroughBitcasts(Op0);
24272 if (BC0.getOpcode() == ISD::AND &&
24273 isConstantPowerOf2(BC0.getOperand(1), VT.getScalarSizeInBits(),
24274 /*AllowUndefs=*/false)) {
24275 Cond = ISD::SETEQ;
24276 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
24277 }
24278 }
24279
24280 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
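// Illustrative example (added for exposition, not in the original source):
// for v4i32 with C = 16 (bit 4), (X & 16) == 16 becomes sra(shl(X, 27), 31):
// bit 4 is shifted up into the sign bit and then arithmetic-shifted back
// down, splatting each lane to all-ones or all-zeros.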
24281 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
24282 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
24283 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
24284 if (C1 && C1->getAPIntValue().isPowerOf2()) {
24285 unsigned BitWidth = VT.getScalarSizeInBits();
24286 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
24287
24288 SDValue Result = Op0.getOperand(0);
24289 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
24290 DAG.getConstant(ShiftAmt, dl, VT));
24291 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
24292 DAG.getConstant(BitWidth - 1, dl, VT));
24293 return Result;
24294 }
24295 }
24296
24297 // Break 256-bit integer vector compare into smaller ones.
24298 if (VT.is256BitVector() && !Subtarget.hasInt256())
24299 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24300
24301 // Break 512-bit integer vector compare into smaller ones.
24302 // TODO: Try harder to use VPCMPx + VPMOV2x?
24303 if (VT.is512BitVector())
24304 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24305
24306 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24307 // not-of-PCMPEQ:
24308 // X != INT_MIN --> X >s INT_MIN
24309 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24310 // +X != 0 --> +X >s 0
24311 APInt ConstValue;
24312 if (Cond == ISD::SETNE &&
24313 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24314 if (ConstValue.isMinSignedValue())
24315 Cond = ISD::SETGT;
24316 else if (ConstValue.isMaxSignedValue())
24317 Cond = ISD::SETLT;
24318 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24319 Cond = ISD::SETGT;
24320 }
24321
24322 // If both operands are known non-negative, then an unsigned compare is the
24323 // same as a signed compare and there's no need to flip signbits.
24324 // TODO: We could check for more general simplifications here since we're
24325 // computing known bits.
24326 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24327 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24328
24329 // Special case: Use min/max operations for unsigned compares.
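// Illustrative example (added for exposition, not in the original source):
// on SSE4.1, (setule X, Y) can be emitted as pcmpeqd(X, pminud(X, Y)) since
// X <=u Y holds exactly when X == umin(X, Y), avoiding the sign-bit flip
// otherwise needed to reuse the signed pcmpgtd.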
24330 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24331 if (ISD::isUnsignedIntSetCC(Cond) &&
24332 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24333 TLI.isOperationLegal(ISD::UMIN, VT)) {
24334 // If we have a constant operand, increment/decrement it and change the
24335 // condition to avoid an invert.
24336 if (Cond == ISD::SETUGT) {
24337 // X > C --> X >= (C+1) --> X == umax(X, C+1)
24338 if (SDValue UGTOp1 =
24339 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
24340 Op1 = UGTOp1;
24341 Cond = ISD::SETUGE;
24342 }
24343 }
24344 if (Cond == ISD::SETULT) {
24345 // X < C --> X <= (C-1) --> X == umin(X, C-1)
24346 if (SDValue ULTOp1 =
24347 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
24348 Op1 = ULTOp1;
24349 Cond = ISD::SETULE;
24350 }
24351 }
24352 bool Invert = false;
24353 unsigned Opc;
24354 switch (Cond) {
24355 // clang-format off
24356 default: llvm_unreachable("Unexpected condition code");
24357 case ISD::SETUGT: Invert = true; [[fallthrough]];
24358 case ISD::SETULE: Opc = ISD::UMIN; break;
24359 case ISD::SETULT: Invert = true; [[fallthrough]];
24360 case ISD::SETUGE: Opc = ISD::UMAX; break;
24361 // clang-format on
24362 }
24363
24364 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24365 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
24366
24367 // If the logical-not of the result is required, perform that now.
24368 if (Invert)
24369 Result = DAG.getNOT(dl, Result, VT);
24370
24371 return Result;
24372 }
24373
24374 // Try to use SUBUS and PCMPEQ.
24375 if (FlipSigns)
24376 if (SDValue V =
24377 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
24378 return V;
24379
24380 // We are handling one of the integer comparisons here. Since SSE only has
24381 // GT and EQ comparisons for integers, swapping operands and multiple
24382 // operations may be required for some comparisons.
24383 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
24384 : X86ISD::PCMPGT;
24385 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
24386 Cond == ISD::SETGE || Cond == ISD::SETUGE;
24387 bool Invert = Cond == ISD::SETNE ||
24388 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
24389
24390 if (Swap)
24391 std::swap(Op0, Op1);
24392
24393 // Check that the operation in question is available (most are plain SSE2,
24394 // but PCMPGTQ and PCMPEQQ have different requirements).
24395 if (VT == MVT::v2i64) {
24396 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
24397 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
24398
24399 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
24400 // the odd elements over the even elements.
24401 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
24402 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
24403 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24404
24405 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24406 static const int MaskHi[] = { 1, 1, 3, 3 };
24407 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24408
24409 return DAG.getBitcast(VT, Result);
24410 }
24411
24412 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
24413 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24414 Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
24415
24416 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24417 static const int MaskHi[] = { 1, 1, 3, 3 };
24418 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24419
24420 return DAG.getBitcast(VT, Result);
24421 }
24422
24423 // If the i64 elements are sign-extended enough to be representable as i32
24424 // then we can compare the lower i32 bits and splat.
24425 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
24426 DAG.ComputeNumSignBits(Op1) > 32) {
24427 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24428 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24429
24430 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24431 static const int MaskLo[] = {0, 0, 2, 2};
24432 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24433
24434 return DAG.getBitcast(VT, Result);
24435 }
24436
24437 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24438 // bits of the inputs before performing those operations. The lower
24439 // compare is always unsigned.
24440 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
24441 : 0x0000000080000000ULL,
24442 dl, MVT::v2i64);
24443
24444 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24445 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24446
24447 // Cast everything to the right type.
24448 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24449 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24450
24451 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
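// Illustrative note (added for exposition, not in the original source):
// viewing each v2i64 lane through the v4i32 bitcast as {lo, hi}, the MaskHi
// shuffles below splat the high-word result into both 32-bit halves of its
// 64-bit lane, so the AND/OR reconstructs a full 64-bit greater-than.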
24452 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24453 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24454
24455 // Create masks for only the low parts/high parts of the 64 bit integers.
24456 static const int MaskHi[] = { 1, 1, 3, 3 };
24457 static const int MaskLo[] = { 0, 0, 2, 2 };
24458 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24459 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24460 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24461
24462 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24463 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24464
24465 if (Invert)
24466 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24467
24468 return DAG.getBitcast(VT, Result);
24469 }
24470
24471 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24472 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
24473 // pcmpeqd + pshufd + pand.
24474 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24475
24476 // First cast everything to the right type.
24477 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24478 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24479
24480 // Do the compare.
24481 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24482
24483 // Make sure the lower and upper halves are both all-ones.
24484 static const int Mask[] = { 1, 0, 3, 2 };
24485 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24486 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24487
24488 if (Invert)
24489 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24490
24491 return DAG.getBitcast(VT, Result);
24492 }
24493 }
24494
24495 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24496 // bits of the inputs before performing those operations.
24497 if (FlipSigns) {
24498 MVT EltVT = VT.getVectorElementType();
24499 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
24500 VT);
24501 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24502 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24503 }
24504
24505 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24506
24507 // If the logical-not of the result is required, perform that now.
24508 if (Invert)
24509 Result = DAG.getNOT(dl, Result, VT);
24510
24511 return Result;
24512}
24513
24514// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
24515 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
24516 const SDLoc &dl, SelectionDAG &DAG,
24517 const X86Subtarget &Subtarget,
24518 SDValue &X86CC) {
24519 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24520
24521 // Must be a bitcast from vXi1.
24522 if (Op0.getOpcode() != ISD::BITCAST)
24523 return SDValue();
24524
24525 Op0 = Op0.getOperand(0);
24526 MVT VT = Op0.getSimpleValueType();
24527 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24528 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24529 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24530 return SDValue();
24531
24532 X86::CondCode X86Cond;
24533 if (isNullConstant(Op1)) {
24534 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24535 } else if (isAllOnesConstant(Op1)) {
24536 // C flag is set for all ones.
24537 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24538 } else
24539 return SDValue();
24540
24541 // If the input is an AND, we can combine its operands into the KTEST.
24542 bool KTestable = false;
24543 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24544 KTestable = true;
24545 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24546 KTestable = true;
24547 if (!isNullConstant(Op1))
24548 KTestable = false;
24549 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24550 SDValue LHS = Op0.getOperand(0);
24551 SDValue RHS = Op0.getOperand(1);
24552 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24553 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24554 }
24555
24556 // If the input is an OR, we can combine its operands into the KORTEST.
24557 SDValue LHS = Op0;
24558 SDValue RHS = Op0;
24559 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24560 LHS = Op0.getOperand(0);
24561 RHS = Op0.getOperand(1);
24562 }
24563
24564 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24565 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24566}
24567
24568/// Emit flags for the given setcc condition and operands. Also returns the
24569/// corresponding X86 condition code constant in X86CC.
24570SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24571 ISD::CondCode CC, const SDLoc &dl,
24572 SelectionDAG &DAG,
24573 SDValue &X86CC) const {
24574 // Equality Combines.
24575 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
24576 X86::CondCode X86CondCode;
24577
24578 // Optimize to BT if possible.
24579 // Lower (X & (1 << N)) == 0 to BT(X, N).
24580 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24581 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
24582 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
24583 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24584 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24585 return BT;
24586 }
24587 }
24588
24589 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
24590 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
24591 X86CondCode)) {
24592 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24593 return CmpZ;
24594 }
24595
24596 // Try to lower using KORTEST or KTEST.
24597 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24598 return Test;
24599
24600 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
24601 // of these.
24602 if (isOneConstant(Op1) || isNullConstant(Op1)) {
24603 // If the input is a setcc, then reuse the input setcc or use a new one
24604 // with the inverted condition.
24605 if (Op0.getOpcode() == X86ISD::SETCC) {
24606 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24607
24608 X86CC = Op0.getOperand(0);
24609 if (Invert) {
24610 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24611 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
24612 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24613 }
24614
24615 return Op0.getOperand(1);
24616 }
24617 }
24618
24619 // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
24620 // overflow.
24621 if (isMinSignedConstant(Op1)) {
24622 EVT VT = Op0.getValueType();
24623 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
24624 SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
24625 X86::CondCode CondCode = CC == ISD::SETEQ ? X86::COND_O : X86::COND_NO;
24626 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24627 SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
24628 DAG.getConstant(0, dl, VT), Op0);
24629 return SDValue(Neg.getNode(), 1);
24630 }
24631 }
24632
24633 // Try to use the carry flag from the add in place of a separate CMP for:
24634 // (seteq (add X, -1), -1). Similar for setne.
24635 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24636 Op0.getOperand(1) == Op1) {
24637 if (isProfitableToUseFlagOp(Op0)) {
24638 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24639
24640 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24641 Op0.getOperand(1));
24642 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24643 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24644 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24645 return SDValue(New.getNode(), 1);
24646 }
24647 }
24648 }
24649
24650 X86::CondCode CondCode =
24651 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24652 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24653
24654 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24655 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24656 return EFLAGS;
24657}
24658
24659SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24660
24661 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24662 Op.getOpcode() == ISD::STRICT_FSETCCS;
24663 MVT VT = Op->getSimpleValueType(0);
24664
24665 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24666
24667 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24668 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24669 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24670 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24671 SDLoc dl(Op);
24672 ISD::CondCode CC =
24673 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24674
24675 if (isSoftF16(Op0.getValueType(), Subtarget))
24676 return SDValue();
24677
24678 // Handle f128 first, since one possible outcome is a normal integer
24679 // comparison which gets handled by emitFlagsForSetcc.
24680 if (Op0.getValueType() == MVT::f128) {
24681 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24682 Op.getOpcode() == ISD::STRICT_FSETCCS);
24683
24684 // If softenSetCCOperands returned a scalar, use it.
24685 if (!Op1.getNode()) {
24686 assert(Op0.getValueType() == Op.getValueType() &&
24687 "Unexpected setcc expansion!");
24688 if (IsStrict)
24689 return DAG.getMergeValues({Op0, Chain}, dl);
24690 return Op0;
24691 }
24692 }
24693
24694 if (Op0.getSimpleValueType().isInteger()) {
24695 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
24696 // reduces the number of EFLAGs bit reads (the GE conditions don't read ZF),
24697 // this may translate to less uops depending on uarch implementation. The
24698 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24699 // canonicalize to that CondCode.
24700 // NOTE: Only do this if incrementing the constant doesn't increase the bit
24701 // encoding size - so it must either already be a i8 or i32 immediate, or it
24702 // shrinks down to that. We don't do this for any i64's to avoid additional
24703 // constant materializations.
24704 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
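// Illustrative example (added for exposition, not in the original source):
// (setgt X, 9) becomes (setge X, 10); JGE/SETGE read only SF and OF, whereas
// JG/SETG also read ZF.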
24705 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24706 const APInt &Op1Val = Op1C->getAPIntValue();
24707 if (!Op1Val.isZero()) {
24708 // Ensure the constant+1 doesn't overflow.
24709 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24710 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24711 APInt Op1ValPlusOne = Op1Val + 1;
24712 if (Op1ValPlusOne.isSignedIntN(32) &&
24713 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24714 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24715 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
24716 : ISD::CondCode::SETUGE;
24717 }
24718 }
24719 }
24720 }
24721
24722 SDValue X86CC;
24723 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24724 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24725 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24726 }
24727
24728 if (Subtarget.hasAVX10_2()) {
24729 if (CC == ISD::SETOEQ || CC == ISD::SETUNE) {
24730 auto NewCC = (CC == ISD::SETOEQ) ? X86::COND_E : (X86::COND_NE);
24731 assert(Op0.getSimpleValueType() != MVT::bf16 && "Unsupported Type");
24732 if (Op0.getSimpleValueType() != MVT::f80) {
24733 SDValue Res = getSETCC(
24734 NewCC, DAG.getNode(X86ISD::UCOMX, dl, MVT::i32, Op0, Op1), dl, DAG);
24735 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24736 }
24737 }
24738 }
24739 // Handle floating point.
24740 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24741 if (CondCode == X86::COND_INVALID)
24742 return SDValue();
24743
24744 SDValue EFLAGS;
24745 if (IsStrict) {
24746 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24747 EFLAGS =
24748 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24749 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24750 Chain = EFLAGS.getValue(1);
24751 } else {
24752 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24753 }
24754
24755 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24756 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24757 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24758}
24759
24760SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24761 SDValue LHS = Op.getOperand(0);
24762 SDValue RHS = Op.getOperand(1);
24763 SDValue Carry = Op.getOperand(2);
24764 SDValue Cond = Op.getOperand(3);
24765 SDLoc DL(Op);
24766
24767 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24768 ISD::CondCode CC = cast<CondCodeSDNode>(Cond)->get();
24769
24770 // Recreate the carry if needed.
24771 EVT CarryVT = Carry.getValueType();
24772 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24773 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24774
24775 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24776 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24777 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24778}
24779
24780// This function returns three things: the arithmetic computation itself
24781// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24782// flag and the condition code define the case in which the arithmetic
24783// computation overflows.
24784static std::pair<SDValue, SDValue>
24785 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
24786 assert(Op.getResNo() == 0 && "Unexpected result number!");
24787 SDValue Value, Overflow;
24788 SDValue LHS = Op.getOperand(0);
24789 SDValue RHS = Op.getOperand(1);
24790 unsigned BaseOp = 0;
24791 SDLoc DL(Op);
24792 switch (Op.getOpcode()) {
24793 default: llvm_unreachable("Unknown ovf instruction!");
24794 case ISD::SADDO:
24795 BaseOp = X86ISD::ADD;
24796 Cond = X86::COND_O;
24797 break;
24798 case ISD::UADDO:
24799 BaseOp = X86ISD::ADD;
24800 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
24801 break;
24802 case ISD::SSUBO:
24803 BaseOp = X86ISD::SUB;
24804 Cond = X86::COND_O;
24805 break;
24806 case ISD::USUBO:
24807 BaseOp = X86ISD::SUB;
24808 Cond = X86::COND_B;
24809 break;
24810 case ISD::SMULO:
24811 BaseOp = X86ISD::SMUL;
24812 Cond = X86::COND_O;
24813 break;
24814 case ISD::UMULO:
24815 BaseOp = X86ISD::UMUL;
24816 Cond = X86::COND_O;
24817 break;
24818 }
24819
24820 if (BaseOp) {
24821 // Also sets EFLAGS.
24822 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24823 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24824 Overflow = Value.getValue(1);
24825 }
24826
24827 return std::make_pair(Value, Overflow);
24828}
24829
24830 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24831 // Lower the "add/sub/mul with overflow" instruction into a regular instruction plus
24832 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24833 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24834 // has only one use.
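// Illustrative example (added for exposition, not in the original source):
// (uaddo X, Y) becomes an X86ISD::ADD producing {sum, EFLAGS} plus a SETCC
// on the carry condition; the BRCOND lowering may then drop the SETCC and
// branch on EFLAGS directly.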
24835 SDLoc DL(Op);
24836 X86::CondCode Cond;
24837 SDValue Value, Overflow;
24838 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24839
24840 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24841 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24842 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24843}
24844
24845 /// Return true if the opcode is an X86 logical comparison.
24846 static bool isX86LogicalCmp(SDValue Op) {
24847 unsigned Opc = Op.getOpcode();
24848 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24849 Opc == X86ISD::FCMP)
24850 return true;
24851 if (Op.getResNo() == 1 &&
24852 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24853 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24854 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24855 return true;
24856
24857 return false;
24858}
24859
24860 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24861 if (V.getOpcode() != ISD::TRUNCATE)
24862 return false;
24863
24864 SDValue VOp0 = V.getOperand(0);
24865 unsigned InBits = VOp0.getValueSizeInBits();
24866 unsigned Bits = V.getValueSizeInBits();
24867 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24868}
24869
24870// Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
24871 static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS,
24872 unsigned X86CC, const SDLoc &DL,
24873 SelectionDAG &DAG,
24874 const X86Subtarget &Subtarget) {
24875 EVT CmpVT = CmpVal.getValueType();
24876 EVT VT = LHS.getValueType();
24877 if (!CmpVT.isScalarInteger() || !VT.isScalarInteger())
24878 return SDValue();
24879
24880 if (X86CC == X86::COND_E && CmpVal.getOpcode() == ISD::AND &&
24881 isOneConstant(CmpVal.getOperand(1))) {
24882 auto SplatLSB = [&](EVT SplatVT) {
24883 // We need a mask of all zeros or all ones with the same size as the
24884 // other operands.
24885 SDValue Neg = CmpVal;
24886 if (CmpVT.bitsGT(SplatVT))
24887 Neg = DAG.getNode(ISD::TRUNCATE, DL, SplatVT, CmpVal);
24888 else if (CmpVT.bitsLT(SplatVT))
24889 Neg = DAG.getNode(
24890 ISD::AND, DL, SplatVT,
24891 DAG.getNode(ISD::ANY_EXTEND, DL, SplatVT, CmpVal.getOperand(0)),
24892 DAG.getConstant(1, DL, SplatVT));
24893 return DAG.getNegative(Neg, DL, SplatVT); // -(and (x, 0x1))
24894 };
24895
24896 // SELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
24897 if (isNullConstant(LHS) && isAllOnesConstant(RHS))
24898 return SplatLSB(VT);
24899
24900 // SELECT (AND(X,1) == 0), C1, C2 -> XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2))
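// Illustrative note (added for exposition, not in the original source): with
// b = X & 1, the value C1 ^ (-b & (C1 ^ C2)) is C1 when b == 0 and C2 when
// b == 1, matching the select without a branch or CMOV.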
24901 if (!Subtarget.canUseCMOV() && isa<ConstantSDNode>(LHS) &&
24902 isa<ConstantSDNode>(RHS)) {
24903 SDValue Mask = SplatLSB(VT);
24904 SDValue Diff = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24905 SDValue Flip = DAG.getNode(ISD::AND, DL, VT, Mask, Diff);
24906 return DAG.getNode(ISD::XOR, DL, VT, LHS, Flip);
24907 }
24908
24909 SDValue Src1, Src2;
24910 auto isIdentityPatternZero = [&]() {
24911 switch (RHS.getOpcode()) {
24912 default:
24913 break;
24914 case ISD::OR:
24915 case ISD::XOR:
24916 case ISD::ADD:
24917 if (RHS.getOperand(0) == LHS || RHS.getOperand(1) == LHS) {
24918 Src1 = RHS.getOperand(RHS.getOperand(0) == LHS ? 1 : 0);
24919 Src2 = LHS;
24920 return true;
24921 }
24922 break;
24923 case ISD::SHL:
24924 case ISD::SRA:
24925 case ISD::SRL:
24926 case ISD::SUB:
24927 if (RHS.getOperand(0) == LHS) {
24928 Src1 = RHS.getOperand(1);
24929 Src2 = LHS;
24930 return true;
24931 }
24932 break;
24933 }
24934 return false;
24935 };
24936
24937 auto isIdentityPatternOnes = [&]() {
24938 switch (LHS.getOpcode()) {
24939 default:
24940 break;
24941 case ISD::AND:
24942 if (LHS.getOperand(0) == RHS || LHS.getOperand(1) == RHS) {
24943 Src1 = LHS.getOperand(LHS.getOperand(0) == RHS ? 1 : 0);
24944 Src2 = RHS;
24945 return true;
24946 }
24947 break;
24948 }
24949 return false;
24950 };
24951
24952 // Convert 'identity' patterns (iff X is 0 or 1):
24953 // SELECT (AND(X,1) == 0), Y, (OR Y, Z) -> (OR Y, (AND NEG(AND(X,1)), Z))
24954 // SELECT (AND(X,1) == 0), Y, (XOR Y, Z) -> (XOR Y, (AND NEG(AND(X,1)), Z))
24955 // SELECT (AND(X,1) == 0), Y, (ADD Y, Z) -> (ADD Y, (AND NEG(AND(X,1)), Z))
24956 // SELECT (AND(X,1) == 0), Y, (SUB Y, Z) -> (SUB Y, (AND NEG(AND(X,1)), Z))
24957 // SELECT (AND(X,1) == 0), Y, (SHL Y, Z) -> (SHL Y, (AND NEG(AND(X,1)), Z))
24958 // SELECT (AND(X,1) == 0), Y, (SRA Y, Z) -> (SRA Y, (AND NEG(AND(X,1)), Z))
24959 // SELECT (AND(X,1) == 0), Y, (SRL Y, Z) -> (SRL Y, (AND NEG(AND(X,1)), Z))
24960 if (!Subtarget.canUseCMOV() && isIdentityPatternZero()) {
24961 SDValue Mask = SplatLSB(Src1.getValueType());
24962 SDValue And = DAG.getNode(ISD::AND, DL, Src1.getValueType(), Mask,
24963 Src1); // Mask & z
24964 return DAG.getNode(RHS.getOpcode(), DL, VT, Src2, And); // y Op And
24965 }
24966 // SELECT (AND(X,1) == 0), (AND Y, Z), Y -> (AND Y, (OR NEG(AND(X, 1)), Z))
24967 if (!Subtarget.canUseCMOV() && isIdentityPatternOnes()) {
24968 SDValue Mask = SplatLSB(VT);
24969 SDValue Or = DAG.getNode(ISD::OR, DL, VT, Mask, Src1); // Mask | z
24970 return DAG.getNode(LHS.getOpcode(), DL, VT, Src2, Or); // y Op Or
24971 }
24972 }
24973
24974 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
24975 (isAllOnesConstant(LHS) || isAllOnesConstant(RHS))) {
24976 SDValue Y = isAllOnesConstant(RHS) ? LHS : RHS;
24977 SDVTList CmpVTs = DAG.getVTList(CmpVT, MVT::i32);
24978
24979 // 'X - 1' sets the carry flag if X == 0.
24980 // '0 - X' sets the carry flag if X != 0.
24981 // Convert the carry flag to a -1/0 mask with sbb:
24982 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24983 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24984 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24985 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
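// Illustrative sketch (added for exposition; roughly what is typically
// selected for the (X != 0) case, not emitted verbatim by this code):
//   neg ecx          ; CF = (X != 0)
//   sbb eax, eax     ; eax = CF ? -1 : 0
//   or  eax, edx     ; eax = mask | Y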
24986 SDValue Sub;
24987 if (isAllOnesConstant(LHS) == (X86CC == X86::COND_NE)) {
24988 SDValue Zero = DAG.getConstant(0, DL, CmpVT);
24989 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpVal);
24990 } else {
24991 SDValue One = DAG.getConstant(1, DL, CmpVT);
24992 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpVal, One);
24993 }
24994 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24995 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24996 Sub.getValue(1));
24997 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24998 }
24999
25000 return SDValue();
25001}
25002
25003SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
25004 bool AddTest = true;
25005 SDValue Cond = Op.getOperand(0);
25006 SDValue Op1 = Op.getOperand(1);
25007 SDValue Op2 = Op.getOperand(2);
25008 SDLoc DL(Op);
25009 MVT VT = Op1.getSimpleValueType();
25010 SDValue CC;
25011
25012 if (isSoftF16(VT, Subtarget)) {
25013 MVT NVT = VT.changeTypeToInteger();
25014 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
25015 DAG.getBitcast(NVT, Op1),
25016 DAG.getBitcast(NVT, Op2)));
25017 }
25018
25019 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
25020 // are available, or into VBLENDV if AVX is available.
25021 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
25022 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
25023 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
25024 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
25025 bool IsAlwaysSignaling;
25026 unsigned SSECC =
25027 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
25028 CondOp0, CondOp1, IsAlwaysSignaling);
25029
25030 if (Subtarget.hasAVX512()) {
25031 SDValue Cmp =
25032 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
25033 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25034 assert(!VT.isVector() && "Not a scalar type?");
25035 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25036 }
25037
25038 if (SSECC < 8 || Subtarget.hasAVX()) {
25039 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
25040 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25041
25042 // If we have SSE41/AVX, we can use a variable vector select (VBLENDV)
25043 // instead of 3 logic instructions for size savings and potentially speed.
25044 // Unfortunately, there is no scalar form of VBLENDV.
25045 //
25046 // If either operand is a +0.0 constant, don't try this. We can expect to
25047 // optimize away at least one of the logic instructions later in that
25048 // case, so that sequence would be faster than a variable blend.
25049 if (Subtarget.hasSSE41() && !isNullFPConstant(Op1) &&
25050 !isNullFPConstant(Op2)) {
25051 // Convert to vectors, do a VSELECT, and convert back to scalar.
25052 // All of the conversions should be optimized away.
25053 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
25054 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
25055 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
25056 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
25057
25058 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
25059 VCmp = DAG.getBitcast(VCmpVT, VCmp);
25060
25061 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
25062
25063 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
25064 DAG.getVectorIdxConstant(0, DL));
25065 }
25066 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
25067 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
25068 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
25069 }
25070 }
25071
25072 // AVX512 fallback is to lower selects of scalar floats to masked moves.
25073 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
25074 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
25075 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25076 }
25077
25078 if (Cond.getOpcode() == ISD::SETCC &&
25079 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
25080 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
25081 Cond = NewCond;
25082 // If the condition was updated, it's possible that the operands of the
25083 // select were also updated (for example, EmitTest has a RAUW). Refresh
25084 // the local references to the select operands in case they got stale.
25085 Op1 = Op.getOperand(1);
25086 Op2 = Op.getOperand(2);
25087 }
25088 }
25089
25090 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
25091 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
25092 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
25093 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
25094 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
25095 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
25096 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25097 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25098 if (Cond.getOpcode() == X86ISD::SETCC &&
25099 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
25100 isNullConstant(Cond.getOperand(1).getOperand(1))) {
25101 SDValue Cmp = Cond.getOperand(1);
25102 SDValue CmpOp0 = Cmp.getOperand(0);
25103 unsigned CondCode = Cond.getConstantOperandVal(0);
25104
25105 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
25106 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
25107 // handling to keep the CMP with 0. This should be removed by
25108 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
25109 // cttz_zero_undef.
25110 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
25111 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
25112 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
25113 };
25114 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
25115 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
25116 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
25117 // Keep Cmp.
25118 } else if (SDValue R = LowerSELECTWithCmpZero(CmpOp0, Op1, Op2, CondCode,
25119 DL, DAG, Subtarget)) {
25120 return R;
25121 } else if (VT.isScalarInteger() && isNullConstant(Op2) &&
25122 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
25123 ((CondCode == X86::COND_S) || // smin(x, 0)
25124 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
25125 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25126 //
25127 // If the comparison is testing for a positive value, we have to invert
25128 // the sign bit mask, so only do that transform if the target has a
25129 // bitwise 'and not' instruction (the invert is free).
25130 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25131 unsigned ShCt = VT.getSizeInBits() - 1;
25132 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
25133 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
25134 if (CondCode == X86::COND_G)
25135 Shift = DAG.getNOT(DL, Shift, VT);
25136 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
25137 }
25138 }
25139
25140 // Look past (and (setcc_carry (cmp ...)), 1).
25141 if (Cond.getOpcode() == ISD::AND &&
25142 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
25143 isOneConstant(Cond.getOperand(1)))
25144 Cond = Cond.getOperand(0);
25145
25146 // Attempt to fold "raw cond" cases by treating them as:
25147 // (select (and X, 1), Op1, Op2 --> (select (icmpeq (and X, 1), 0), Op2, Op1)
25148 if (Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))
25149 if (SDValue R = LowerSELECTWithCmpZero(Cond, Op2, Op1, X86::COND_E, DL, DAG,
25150 Subtarget))
25151 return R;
25152
25153 // If condition flag is set by a X86ISD::CMP, then use it as the condition
25154 // setting operand in place of the X86ISD::SETCC.
25155 unsigned CondOpcode = Cond.getOpcode();
25156 if (CondOpcode == X86ISD::SETCC ||
25157 CondOpcode == X86ISD::SETCC_CARRY) {
25158 CC = Cond.getOperand(0);
25159
25160 SDValue Cmp = Cond.getOperand(1);
25161 bool IllegalFPCMov = false;
25162 if (VT.isFloatingPoint() && !VT.isVector() &&
25163 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
25164 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
25165
25166 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
25167 Cmp.getOpcode() == X86ISD::BT) { // FIXME
25168 Cond = Cmp;
25169 AddTest = false;
25170 }
25171 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
25172 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
25173 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
25174 SDValue Value;
25175 X86::CondCode X86Cond;
25176 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25177
25178 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
25179 AddTest = false;
25180 }
25181
25182 if (AddTest) {
25183 // Look past the truncate if the high bits are known zero.
25184 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25185 Cond = Cond.getOperand(0);
25186
25187 // We know the result of AND is compared against zero. Try to match
25188 // it to BT.
25189 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
25190 X86::CondCode X86CondCode;
25191 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
25192 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
25193 Cond = BT;
25194 AddTest = false;
25195 }
25196 }
25197 }
25198
25199 if (AddTest) {
25200 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
25201 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
25202 }
25203
25204 // a < b ? -1 : 0 -> RES = ~setcc_carry
25205 // a < b ? 0 : -1 -> RES = setcc_carry
25206 // a >= b ? -1 : 0 -> RES = setcc_carry
25207 // a >= b ? 0 : -1 -> RES = ~setcc_carry
25208 if (Cond.getOpcode() == X86ISD::SUB) {
25209 unsigned CondCode = CC->getAsZExtVal();
25210
25211 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
25212 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25213 (isNullConstant(Op1) || isNullConstant(Op2))) {
25214 SDValue Res =
25215 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
25216 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
25217 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
25218 return DAG.getNOT(DL, Res, Res.getValueType());
25219 return Res;
25220 }
25221 }
25222
25223 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
25224 // widen the cmov and push the truncate through. This avoids introducing a new
25225 // branch during isel and doesn't add any extensions.
25226 if (Op.getValueType() == MVT::i8 &&
25227 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
25228 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
25229 if (T1.getValueType() == T2.getValueType() &&
25230 // Exclude CopyFromReg to avoid partial register stalls.
25231 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
25232 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
25233 CC, Cond);
25234 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25235 }
25236 }
25237
25238 // Or finally, promote i8 cmovs if we have CMOV,
25239 // or i16 cmovs if it won't prevent folding a load.
25240 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
25241 // legal, but EmitLoweredSelect() cannot deal with these extensions
25242 // being inserted between two CMOVs (the same applies to the i16 case).
25243 // https://bugs.llvm.org/show_bug.cgi?id=40974
25244 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
25245 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
25246 !X86::mayFoldLoad(Op2, Subtarget))) {
25247 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
25248 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
25249 SDValue Ops[] = { Op2, Op1, CC, Cond };
25250 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
25251 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25252 }
25253
25254 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
25255 // condition is true.
25256 SDValue Ops[] = { Op2, Op1, CC, Cond };
25257 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
25258}
25259
25260 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
25261 const X86Subtarget &Subtarget,
25262 SelectionDAG &DAG) {
25263 MVT VT = Op->getSimpleValueType(0);
25264 SDValue In = Op->getOperand(0);
25265 MVT InVT = In.getSimpleValueType();
25266 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
25267 MVT VTElt = VT.getVectorElementType();
25268 unsigned NumElts = VT.getVectorNumElements();
25269
25270 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
25271 MVT ExtVT = VT;
25272 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
25273 // If v16i32 is to be avoided, we'll need to split and concatenate.
25274 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
25275 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
25276
25277 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
25278 }
25279
25280 // Widen to 512-bits if VLX is not supported.
25281 MVT WideVT = ExtVT;
25282 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
25283 NumElts *= 512 / ExtVT.getSizeInBits();
25284 InVT = MVT::getVectorVT(MVT::i1, NumElts);
25285 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In,
25286 DAG.getVectorIdxConstant(0, dl));
25287 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
25288 }
25289
25290 SDValue V;
25291 MVT WideEltVT = WideVT.getVectorElementType();
25292 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
25293 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
25294 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
25295 } else {
25296 SDValue NegOne = DAG.getAllOnesConstant(dl, WideVT);
25297 SDValue Zero = DAG.getConstant(0, dl, WideVT);
25298 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
25299 }
25300
25301 // Truncate if we had to extend i16/i8 above.
25302 if (VT != ExtVT) {
25303 WideVT = MVT::getVectorVT(VTElt, NumElts);
25304 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
25305 }
25306
25307 // Extract back to 128/256-bit if we widened.
25308 if (WideVT != VT)
25309 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
25310 DAG.getVectorIdxConstant(0, dl));
25311
25312 return V;
25313}
25314
25315 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25316 SelectionDAG &DAG) {
25317 SDValue In = Op->getOperand(0);
25318 MVT InVT = In.getSimpleValueType();
25319 SDLoc DL(Op);
25320
25321 if (InVT.getVectorElementType() == MVT::i1)
25322 return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
25323
25324 assert(Subtarget.hasAVX() && "Expected AVX support");
25325 return LowerAVXExtend(Op, DL, DAG, Subtarget);
25326}
25327
25328// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25329// For sign extend this needs to handle all vector sizes and SSE4.1 and
25330// non-SSE4.1 targets. For zero extend this should only handle inputs of
25331// MVT::v64i8 when BWI is not supported, but AVX512 is.
25333 const X86Subtarget &Subtarget,
25334 SelectionDAG &DAG) {
25335 SDValue In = Op->getOperand(0);
25336 MVT VT = Op->getSimpleValueType(0);
25337 MVT InVT = In.getSimpleValueType();
25338
25339 MVT SVT = VT.getVectorElementType();
25340 MVT InSVT = InVT.getVectorElementType();
25342
25343 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25344 return SDValue();
25345 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25346 return SDValue();
25347 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25348 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25349 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25350 return SDValue();
25351
25352 SDLoc dl(Op);
25353 unsigned Opc = Op.getOpcode();
25354 unsigned NumElts = VT.getVectorNumElements();
25355
25356 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25357 // For 512-bit vectors, we need 128-bits or 256-bits.
25358 if (InVT.getSizeInBits() > 128) {
25359 // Input needs to be at least the same number of elements as output, and
25360 // at least 128-bits.
25361 int InSize = InSVT.getSizeInBits() * NumElts;
25362 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25363 InVT = In.getSimpleValueType();
25364 }
25365
25366 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
25367 // results, so those cases are legal and shouldn't occur here. AVX2/AVX512
25368 // pmovsx* instructions still need to be handled here for 256/512-bit results.
25369 if (Subtarget.hasInt256()) {
25370 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25371
25372 if (InVT.getVectorNumElements() != NumElts)
25373 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25374
25375 // FIXME: Apparently we create inreg operations that could be regular
25376 // extends.
25377 unsigned ExtOpc =
25378 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
25379 : ISD::ZERO_EXTEND;
25380 return DAG.getNode(ExtOpc, dl, VT, In);
25381 }
25382
25383 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25384 if (Subtarget.hasAVX()) {
25385 assert(VT.is256BitVector() && "256-bit vector expected");
25386 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25387 int HalfNumElts = HalfVT.getVectorNumElements();
25388
25389 unsigned NumSrcElts = InVT.getVectorNumElements();
25390 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25391 for (int i = 0; i != HalfNumElts; ++i)
25392 HiMask[i] = HalfNumElts + i;
25393
25394 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25395 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25396 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25397 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25398 }
25399
25400 // We should only get here for sign extend.
25401 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25402 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25403 unsigned InNumElts = InVT.getVectorNumElements();
25404
25405 // If the source elements are already all-signbits, we don't need to extend,
25406 // just splat the elements.
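// Illustrative example (added for exposition, not in the original source):
// for v16i8 -> v4i32 with all-signbit inputs, the mask built below is
// {0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3}, i.e. each source byte is replicated
// to fill its widened lane.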
25407 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
25408 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
25409 unsigned Scale = InNumElts / NumElts;
25410 SmallVector<int, 16> ShuffleMask;
25411 for (unsigned I = 0; I != NumElts; ++I)
25412 ShuffleMask.append(Scale, I);
25413 return DAG.getBitcast(VT,
25414 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
25415 }
25416
25417 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
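// Illustrative example (added for exposition, not in the original source):
// for v8i16 -> v4i32 the mask built below is {-1,0,-1,1,-1,2,-1,3}, placing
// each i16 in the high half of an i32 lane; the VSRAI by 16 then sign-fills
// the low half.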
25418 SDValue Curr = In;
25419 SDValue SignExt = Curr;
25420
25421 // As SRAI is only available on i16/i32 types, we expand only up to i32
25422 // and handle i64 separately.
25423 if (InVT != MVT::v4i32) {
25424 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25425
25426 unsigned DestWidth = DestVT.getScalarSizeInBits();
25427 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25428 unsigned DestElts = DestVT.getVectorNumElements();
25429
25430 // Build a shuffle mask that takes each input element and places it in the
25431 // MSBs of the new element size.
25432 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25433 for (unsigned i = 0; i != DestElts; ++i)
25434 Mask[i * Scale + (Scale - 1)] = i;
25435
25436 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25437 Curr = DAG.getBitcast(DestVT, Curr);
25438
25439 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25440 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25441 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25442 }
25443
25444 if (VT == MVT::v2i64) {
25445 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25446 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25447 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25448 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25449 SignExt = DAG.getBitcast(VT, SignExt);
25450 }
25451
25452 return SignExt;
25453}
25454
25455 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25456 SelectionDAG &DAG) {
25457 MVT VT = Op->getSimpleValueType(0);
25458 SDValue In = Op->getOperand(0);
25459 MVT InVT = In.getSimpleValueType();
25460 SDLoc dl(Op);
25461
25462 if (InVT.getVectorElementType() == MVT::i1)
25463 return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
25464
25465 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25467 "Expected same number of elements");
25468 assert((VT.getVectorElementType() == MVT::i16 ||
25469 VT.getVectorElementType() == MVT::i32 ||
25470 VT.getVectorElementType() == MVT::i64) &&
25471 "Unexpected element type");
25472 assert((InVT.getVectorElementType() == MVT::i8 ||
25473 InVT.getVectorElementType() == MVT::i16 ||
25474 InVT.getVectorElementType() == MVT::i32) &&
25475 "Unexpected element type");
25476
25477 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25478 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25479 return splitVectorIntUnary(Op, DAG, dl);
25480 }
25481
25482 if (Subtarget.hasInt256())
25483 return Op;
25484
25485 // Optimize vectors in AVX mode:
25486 // sign extend v8i16 to v8i32 and
25487 // v4i32 to v4i64.
25488 //
25489 // Divide the input vector into two parts;
25490 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
25491 // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
25492 // then concat the vectors back to the original VT.
25493 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25494 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25495
25496 unsigned NumElems = InVT.getVectorNumElements();
25497 SmallVector<int,8> ShufMask(NumElems, -1);
25498 for (unsigned i = 0; i != NumElems/2; ++i)
25499 ShufMask[i] = i + NumElems/2;
25500
25501 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25502 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25503
25504 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25505}
25506
25507/// Change a vector store into a pair of half-size vector stores.
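/// For illustration (added for exposition, not in the original source): a
/// 256-bit store to Ptr becomes a 128-bit store of the low half at Ptr and a
/// 128-bit store of the high half at Ptr+16, with the two chains joined by a
/// TokenFactor.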
25508 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
25509 SDValue StoredVal = Store->getValue();
25510 assert((StoredVal.getValueType().is256BitVector() ||
25511 StoredVal.getValueType().is512BitVector()) &&
25512 "Expecting 256/512-bit op");
25513
25514 // Splitting volatile memory ops is not allowed unless the operation was not
25515 // legal to begin with. Assume the input store is legal (this transform is
25516 // only used for targets with AVX). Note: It is possible that we have an
25517 // illegal type like v2i128, and so we could allow splitting a volatile store
25518 // in that case if that is important.
25519 if (!Store->isSimple())
25520 return SDValue();
25521
25522 SDLoc DL(Store);
25523 SDValue Value0, Value1;
25524 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25525 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25526 SDValue Ptr0 = Store->getBasePtr();
25527 SDValue Ptr1 =
25528 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
25529 SDValue Ch0 =
25530 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25531 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25532 SDValue Ch1 =
25533 DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25534 Store->getPointerInfo().getWithOffset(HalfOffset),
25535 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25536 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25537}
25538
25539/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
25540/// type.
25541 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
25542 SelectionDAG &DAG) {
25543 SDValue StoredVal = Store->getValue();
25544 assert(StoreVT.is128BitVector() &&
25545 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25546 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25547
25548 // Splitting volatile memory ops is not allowed unless the operation was not
25549 // legal to begin with. We are assuming the input op is legal (this transform
25550 // is only used for targets with AVX).
25551 if (!Store->isSimple())
25552 return SDValue();
25553
25554 MVT StoreSVT = StoreVT.getScalarType();
25555 unsigned NumElems = StoreVT.getVectorNumElements();
25556 unsigned ScalarSize = StoreSVT.getStoreSize();
25557
25558 SDLoc DL(Store);
25559 SmallVector<SDValue, 4> Stores;
25560 for (unsigned i = 0; i != NumElems; ++i) {
25561 unsigned Offset = i * ScalarSize;
25562 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25563 TypeSize::getFixed(Offset), DL);
25564 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25565 DAG.getVectorIdxConstant(i, DL));
25566 SDValue Ch =
25567 DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25568 Store->getPointerInfo().getWithOffset(Offset),
25569 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25570 Stores.push_back(Ch);
25571 }
25572 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25573}
25574
25575static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25576 SelectionDAG &DAG) {
25577 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25578 SDLoc dl(St);
25579 SDValue StoredVal = St->getValue();
25580
25581 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25582 if (StoredVal.getValueType().isVector() &&
25583 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25584 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25585 assert(NumElts <= 8 && "Unexpected VT");
25586 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25587 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25588 "Expected AVX512F without AVX512DQI");
25589
25590 // We must pad with zeros to ensure we store zeroes to any unused bits.
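// Illustrative example (added for exposition, not in the original source):
// a v4i1 value is widened to v16i1 (upper bits undef), bitcast to i16,
// truncated to i8, and zero-extended-in-reg from i4 so the four unused bits
// of the stored byte are guaranteed to be zero.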
25591 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25592 DAG.getUNDEF(MVT::v16i1), StoredVal,
25593 DAG.getVectorIdxConstant(0, dl));
25594 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25595 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25596 // Make sure we store zeros in the extra bits.
25597 if (NumElts < 8)
25598 StoredVal = DAG.getZeroExtendInReg(
25599 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25600
25601 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25602 St->getPointerInfo(), St->getBaseAlign(),
25603 St->getMemOperand()->getFlags());
25604 }
25605
25606 if (St->isTruncatingStore())
25607 return SDValue();
25608
25609 // If this is a 256/512-bit store of concatenated ops, we are better off
25610 // splitting that store into two half-size stores. This avoids spurious use of
25611 // concatenated ops and each half can execute independently. Some cores would
25612 // split the op into halves anyway, so the concat is purely an extra op.
25613 MVT StoreVT = StoredVal.getSimpleValueType();
25614 if (StoreVT.is256BitVector() || StoreVT.is512BitVector()) {
25615 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal, DAG))
25616 return splitVectorStore(St, DAG);
25617 return SDValue();
25618 }
25619
25620 if (StoreVT.is32BitVector())
25621 return SDValue();
25622
25623 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25624 assert(StoreVT.is64BitVector() && "Unexpected VT");
25625 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25627 "Unexpected type action!");
25628
25629 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25630 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25631 DAG.getUNDEF(StoreVT));
25632
25633 if (Subtarget.hasSSE2()) {
25634 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25635 // and store it.
25636 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25637 MVT CastVT = MVT::getVectorVT(StVT, 2);
25638 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25639 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25640 DAG.getVectorIdxConstant(0, dl));
25641
25642 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25643 St->getPointerInfo(), St->getBaseAlign(),
25644 St->getMemOperand()->getFlags());
25645 }
25646 assert(Subtarget.hasSSE1() && "Expected SSE");
25647 SDVTList Tys = DAG.getVTList(MVT::Other);
25648 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25649 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25650 St->getMemOperand());
25651}
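// For illustration: on an AVX512F-without-DQ target, a plain v8i1 store is
// rewritten by the code above into roughly this DAG sequence (a rough sketch,
// operands simplified):
//   t0 = insert_subvector undef:v16i1, x:v8i1, 0
//   t1 = bitcast t0 to i16
//   t2 = truncate t1 to i8
//   store t2, ptr                  ; a single i8 store of the 8 mask bits
// For v2i1/v4i1 an extra zero-extend-in-reg of the low 2/4 bits guarantees the
// unused bits of the stored byte are zero.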
25652
25653// Lower vector extended loads using a shuffle. If SSSE3 is not available we
25654// may emit an illegal shuffle but the expansion is still better than scalar
25655// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
25656// we'll emit a shuffle and an arithmetic shift.
25657// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25658// TODO: It is possible to support ZExt by zeroing the undef values during
25659// the shuffle phase or after the shuffle.
25660static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25661 SelectionDAG &DAG) {
25662 MVT RegVT = Op.getSimpleValueType();
25663 assert(RegVT.isVector() && "We only custom lower vector loads.");
25664 assert(RegVT.isInteger() &&
25665 "We only custom lower integer vector loads.");
25666
25667 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25668 SDLoc dl(Ld);
25669
25670 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25671 if (RegVT.getVectorElementType() == MVT::i1) {
25672 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25673 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25674 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25675 "Expected AVX512F without AVX512DQI");
25676
25677 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25678 Ld->getPointerInfo(), Ld->getBaseAlign(),
25679 Ld->getMemOperand()->getFlags());
25680
25681 // Replace chain users with the new chain.
25682 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25683
25684 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25685 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25686 DAG.getBitcast(MVT::v16i1, Val),
25687 DAG.getVectorIdxConstant(0, dl));
25688 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25689 }
25690
25691 return SDValue();
25692}
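// For illustration: the v2i1/v4i1/v8i1 path above turns
//   v = load <N x i1>, ptr
// into roughly:
//   t0 = load i8, ptr                      ; one byte holds all mask bits
//   t1 = any_extend t0 to i16
//   t2 = bitcast t1 to v16i1
//   v  = extract_subvector t2, 0           ; keep the low N elements
// with the new load's chain replacing the original one.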
25693
25694/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25695/// each of which has no other use apart from the AND / OR.
25696static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25697 Opc = Op.getOpcode();
25698 if (Opc != ISD::OR && Opc != ISD::AND)
25699 return false;
25700 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25701 Op.getOperand(0).hasOneUse() &&
25702 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25703 Op.getOperand(1).hasOneUse());
25704}
25705
25706SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25707 SDValue Chain = Op.getOperand(0);
25708 SDValue Cond = Op.getOperand(1);
25709 SDValue Dest = Op.getOperand(2);
25710 SDLoc dl(Op);
25711
25712 // Bail out when we don't have native compare instructions.
25713 if (Cond.getOpcode() == ISD::SETCC &&
25714 Cond.getOperand(0).getValueType() != MVT::f128 &&
25715 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
25716 SDValue LHS = Cond.getOperand(0);
25717 SDValue RHS = Cond.getOperand(1);
25718 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25719
25720 // Special case for
25721 // setcc([su]{add,sub,mul}o == 0)
25722 // setcc([su]{add,sub,mul}o != 1)
25723    if (ISD::isOverflowIntrOpRes(LHS) &&
25724        (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25725        (isNullConstant(RHS) || isOneConstant(RHS))) {
25726 SDValue Value, Overflow;
25727 X86::CondCode X86Cond;
25728 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25729
25730 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25731 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25732
25733 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25734 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25735 Overflow, Op->getFlags());
25736 }
25737
25738 if (LHS.getSimpleValueType().isInteger()) {
25739 SDValue CCVal;
25740 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25741 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25742 EFLAGS, Op->getFlags());
25743 }
25744
25745 if (CC == ISD::SETOEQ) {
25746 // For FCMP_OEQ, we can emit
25747 // two branches instead of an explicit AND instruction with a
25748 // separate test. However, we only do this if this block doesn't
25749 // have a fall-through edge, because this requires an explicit
25750 // jmp when the condition is false.
25751 if (Op.getNode()->hasOneUse()) {
25752 SDNode *User = *Op.getNode()->user_begin();
25753 // Look for an unconditional branch following this conditional branch.
25754 // We need this because we need to reverse the successors in order
25755 // to implement FCMP_OEQ.
25756 if (User->getOpcode() == ISD::BR) {
25757 SDValue FalseBB = User->getOperand(1);
25758 SDNode *NewBR =
25759 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25760 assert(NewBR == User);
25761 (void)NewBR;
25762 Dest = FalseBB;
25763
25764 SDValue Cmp =
25765 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25766 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25767 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25768 CCVal, Cmp, Op->getFlags());
25769 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25770 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25771 Cmp, Op->getFlags());
25772 }
25773 }
25774 } else if (CC == ISD::SETUNE) {
25775 // For FCMP_UNE, we can emit
25776 // two branches instead of an explicit OR instruction with a
25777 // separate test.
25778 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25779 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25780 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25781 Cmp, Op->getFlags());
25782 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25783 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25784 Cmp, Op->getFlags());
25785 } else {
25786 X86::CondCode X86Cond =
25787 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25788 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25789 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25790 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25791 Cmp, Op->getFlags());
25792 }
25793 }
25794
25795  if (ISD::isOverflowIntrOpRes(Cond)) {
25796    SDValue Value, Overflow;
25797 X86::CondCode X86Cond;
25798 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25799
25800 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25801 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25802 Overflow, Op->getFlags());
25803 }
25804
25805 // Look past the truncate if the high bits are known zero.
25806  if (isTruncWithZeroHighBitsInput(Cond, DAG))
25807    Cond = Cond.getOperand(0);
25808
25809 EVT CondVT = Cond.getValueType();
25810
25811 // Add an AND with 1 if we don't already have one.
25812 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25813 Cond =
25814 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25815
25816 SDValue LHS = Cond;
25817 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25818
25819 SDValue CCVal;
25820 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25821 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, EFLAGS,
25822 Op->getFlags());
25823}
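// Rough sketch of the FP branch expansions above (illustrative only):
//   FCMP_OEQ: OEQ is "ZF == 1 and PF == 0", so after swapping the successors
//     the code emits    jne FalseBB ; jp FalseBB ; fall through to TrueBB
//   FCMP_UNE: UNE is "ZF == 0 or PF == 1", so the code emits
//     jne Dest ; jp Dest
// avoiding an explicit AND/OR of two SETCC results plus a separate test.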
25824
25825// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25826// Calls to _alloca are needed to probe the stack when allocating more than 4k
25827// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25828// that the guard pages used by the OS virtual memory manager are allocated in
25829// correct sequence.
25830SDValue
25831X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25832 SelectionDAG &DAG) const {
25833 MachineFunction &MF = DAG.getMachineFunction();
25834 bool SplitStack = MF.shouldSplitStack();
25835 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25836 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25837 SplitStack || EmitStackProbeCall;
25838 SDLoc dl(Op);
25839
25840 // Get the inputs.
25841 SDNode *Node = Op.getNode();
25842 SDValue Chain = Op.getOperand(0);
25843 SDValue Size = Op.getOperand(1);
25844 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25845 EVT VT = Node->getValueType(0);
25846
25847 // Chain the dynamic stack allocation so that it doesn't modify the stack
25848 // pointer when other instructions are using the stack.
25849 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25850
25851 bool Is64Bit = Subtarget.is64Bit();
25852 MVT SPTy = Op.getValueType().getSimpleVT();
25853
25854  SDValue Result;
25855  if (!Lower) {
25856 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25857    Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25858    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25859 " not tell us which reg is the stack pointer!");
25860
25861 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25862 const Align StackAlign = TFI.getStackAlign();
25863 if (hasInlineStackProbe(MF)) {
25864 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, {SPTy, MVT::Other},
25865 {Chain, Size});
25866 Chain = Result.getValue(1);
25867 } else {
25868 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25869 Chain = SP.getValue(1);
25870 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25871 }
25872 if (Alignment && *Alignment > StackAlign)
25873 Result = DAG.getNode(
25874 ISD::AND, dl, VT, Result,
25875 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25876 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25877 } else if (SplitStack) {
25878 if (Is64Bit) {
25879      // The 64-bit implementation of segmented stacks needs to clobber both r10
25880      // and r11. This makes it impossible to use it along with nested parameters.
25881 const Function &F = MF.getFunction();
25882 for (const auto &A : F.args()) {
25883 if (A.hasNestAttr())
25884 report_fatal_error("Cannot use segmented stacks with functions that "
25885 "have nested arguments.");
25886 }
25887 }
25888
25889 Result =
25890 DAG.getNode(X86ISD::SEG_ALLOCA, dl, {SPTy, MVT::Other}, {Chain, Size});
25891 Chain = Result.getValue(1);
25892 } else {
25893 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25894 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25895 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25896
25897 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25898 Register SPReg = RegInfo->getStackRegister();
25899 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25900 Chain = SP.getValue(1);
25901
25902 if (Alignment) {
25903 SP = DAG.getNode(
25904 ISD::AND, dl, VT, SP.getValue(0),
25905 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25906 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25907 }
25908
25909 Result = SP;
25910 }
25911
25912 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
25913
25914 SDValue Ops[2] = {Result, Chain};
25915 return DAG.getMergeValues(Ops, dl);
25916}
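// For illustration, the generic path above (no stack probing, no split stack)
// is just pointer arithmetic on the stack pointer (rough DAG sketch):
//   t0 = CopyFromReg SP
//   t1 = sub t0, Size
//   t2 = and t1, ~(Alignment - 1)      ; only when over-aligned
//   CopyToReg SP, t2                   ; t2 is the returned address
// wrapped in CALLSEQ_START/CALLSEQ_END so nothing else touches SP in between.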
25917
25918SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25919 MachineFunction &MF = DAG.getMachineFunction();
25920 SDValue Ptr = Op.getOperand(1);
25921 EVT PtrVT = Ptr.getValueType();
25922
25923 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25924
25925 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25926 SDLoc DL(Op);
25927
25928 if (!Subtarget.is64Bit() ||
25929 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25930 // vastart just stores the address of the VarArgsFrameIndex slot into the
25931 // memory location argument.
25932 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25933 return DAG.getStore(Op.getOperand(0), DL, FR, Ptr, MachinePointerInfo(SV));
25934 }
25935
25936 // __va_list_tag:
25937 // gp_offset (0 - 6 * 8)
25938 // fp_offset (48 - 48 + 8 * 16)
25939 // overflow_arg_area (point to parameters coming in memory).
25940 // reg_save_area
25941  SmallVector<SDValue, 8> MemOps;
25942  SDValue FIN = Op.getOperand(1);
25943 // Store gp_offset
25944 SDValue Store = DAG.getStore(
25945 Op.getOperand(0), DL,
25946 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25947 MachinePointerInfo(SV));
25948 MemOps.push_back(Store);
25949
25950 // Store fp_offset
25951 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25952 Store = DAG.getStore(
25953 Op.getOperand(0), DL,
25954 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25955 MachinePointerInfo(SV, 4));
25956 MemOps.push_back(Store);
25957
25958 // Store ptr to overflow_arg_area
25959 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25960 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25961 Store =
25962 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25963 MemOps.push_back(Store);
25964
25965 // Store ptr to reg_save_area.
25966 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25967 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25968 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25969 Store = DAG.getStore(
25970 Op.getOperand(0), DL, RSFIN, FIN,
25971 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25972 MemOps.push_back(Store);
25973 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25974}
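// The stores above populate the SysV x86-64 va_list record; roughly this C
// layout (LP64 offsets shown; on X32 pointers are 4 bytes, so reg_save_area
// lands at offset 12):
//   struct __va_list_tag {
//     unsigned gp_offset;          //  0: next GPR slot, 0..48
//     unsigned fp_offset;          //  4: next XMM slot, 48..176
//     void    *overflow_arg_area;  //  8: arguments passed on the stack
//     void    *reg_save_area;      // 16: register save area
//   };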
25975
25976SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25977 assert(Subtarget.is64Bit() &&
25978 "LowerVAARG only handles 64-bit va_arg!");
25979 assert(Op.getNumOperands() == 4);
25980
25981 MachineFunction &MF = DAG.getMachineFunction();
25982 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25983 // The Win64 ABI uses char* instead of a structure.
25984 return DAG.expandVAArg(Op.getNode());
25985
25986 SDValue Chain = Op.getOperand(0);
25987 SDValue SrcPtr = Op.getOperand(1);
25988 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25989 unsigned Align = Op.getConstantOperandVal(3);
25990 SDLoc dl(Op);
25991
25992 EVT ArgVT = Op.getNode()->getValueType(0);
25993 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25994 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25995 uint8_t ArgMode;
25996
25997 // Decide which area this value should be read from.
25998 // TODO: Implement the AMD64 ABI in its entirety. This simple
25999 // selection mechanism works only for the basic types.
26000 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
26001 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
26002 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
26003 } else {
26004 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
26005 "Unhandled argument type in LowerVAARG");
26006 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
26007 }
26008
26009 if (ArgMode == 2) {
26010 // Make sure using fp_offset makes sense.
26011 assert(!Subtarget.useSoftFloat() &&
26012 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
26013 Subtarget.hasSSE1());
26014 }
26015
26016 // Insert VAARG node into the DAG
26017 // VAARG returns two values: Variable Argument Address, Chain
26018 SDValue InstOps[] = {Chain, SrcPtr,
26019 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
26020 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
26021 DAG.getTargetConstant(Align, dl, MVT::i32)};
26022 SDVTList VTs = DAG.getVTList(SrcPtr.getValueType(), MVT::Other);
26023 SDValue VAARG = DAG.getMemIntrinsicNode(
26024 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
26025 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
26026 /*Alignment=*/std::nullopt,
26027      MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
26028  Chain = VAARG.getValue(1);
26029
26030 // Load the next argument and return it
26031 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
26032}
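// Worked example of the ArgMode selection above (illustrative):
//   double d = va_arg(ap, double);   // ArgSize 8, FP     -> ArgMode 2, fp_offset
//   int    i = va_arg(ap, int);      // ArgSize 4, integer -> ArgMode 1, gp_offset
// The VAARG_64/VAARG_X32 pseudo is expanded later into code that checks the
// chosen offset against the register save area and falls back to
// overflow_arg_area once the registers are exhausted.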
26033
26034static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
26035 SelectionDAG &DAG) {
26036 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
26037 // where a va_list is still an i8*.
26038 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
26039 if (Subtarget.isCallingConvWin64(
26040          DAG.getMachineFunction().getFunction().getCallingConv()))
26041    // Probably a Win64 va_copy.
26042 return DAG.expandVACopy(Op.getNode());
26043
26044 SDValue Chain = Op.getOperand(0);
26045 SDValue DstPtr = Op.getOperand(1);
26046 SDValue SrcPtr = Op.getOperand(2);
26047 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
26048 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26049 SDLoc DL(Op);
26050
26051 return DAG.getMemcpy(
26052 Chain, DL, DstPtr, SrcPtr,
26053 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
26054 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
26055 /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV),
26056 MachinePointerInfo(SrcSV));
26057}
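// For illustration: on LP64 the va_copy above becomes a plain 24-byte memcpy
// of the __va_list_tag record (4 + 4 + 8 + 8 bytes) with align 8; on X32 it is
// a 16-byte copy with align 4, matching the constants passed to getMemcpy.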
26058
26059// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
26060static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
26061 switch (Opc) {
26062 case ISD::SHL:
26063 case X86ISD::VSHL:
26064 case X86ISD::VSHLI:
26065 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
26066 case ISD::SRL:
26067 case X86ISD::VSRL:
26068 case X86ISD::VSRLI:
26069 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
26070 case ISD::SRA:
26071 case X86ISD::VSRA:
26072 case X86ISD::VSRAI:
26073 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
26074 }
26075 llvm_unreachable("Unknown target vector shift node");
26076}
26077
26078/// Handle vector element shifts where the shift amount is a constant.
26079/// Takes immediate version of shift as input.
26080static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
26081 SDValue SrcOp, uint64_t ShiftAmt,
26082 SelectionDAG &DAG) {
26083 MVT ElementType = VT.getVectorElementType();
26084
26085 // Bitcast the source vector to the output type, this is mainly necessary for
26086 // vXi8/vXi64 shifts.
26087 if (VT != SrcOp.getSimpleValueType())
26088 SrcOp = DAG.getBitcast(VT, SrcOp);
26089
26090 // Fold this packed shift into its first operand if ShiftAmt is 0.
26091 if (ShiftAmt == 0)
26092 return SrcOp;
26093
26094 // Check for ShiftAmt >= element width
26095 if (ShiftAmt >= ElementType.getSizeInBits()) {
26096 if (Opc == X86ISD::VSRAI)
26097 ShiftAmt = ElementType.getSizeInBits() - 1;
26098 else
26099 return DAG.getConstant(0, dl, VT);
26100 }
26101
26102  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
26103         && "Unknown target vector shift-by-constant node");
26104
26105 // Fold this packed vector shift into a build vector if SrcOp is a
26106 // vector of Constants or UNDEFs.
26107  if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
26108    unsigned ShiftOpc;
26109 switch (Opc) {
26110 default: llvm_unreachable("Unknown opcode!");
26111 case X86ISD::VSHLI:
26112 ShiftOpc = ISD::SHL;
26113 break;
26114 case X86ISD::VSRLI:
26115 ShiftOpc = ISD::SRL;
26116 break;
26117 case X86ISD::VSRAI:
26118 ShiftOpc = ISD::SRA;
26119 break;
26120 }
26121
26122 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
26123 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
26124 return C;
26125 }
26126
26127 return DAG.getNode(Opc, dl, VT, SrcOp,
26128 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
26129}
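// Worked examples of the out-of-range handling above (illustrative):
//   VSHLI v4i32 X, 0   -> X                 (shift by zero folds away)
//   VSRLI v4i32 X, 32  -> zero vector       (logical shift >= element width)
//   VSRAI v4i32 X, 40  -> VSRAI v4i32 X, 31 (arithmetic shift clamps to 31,
//                                            i.e. a sign-bit splat)
// and for an all-constant SrcOp the shift is folded away entirely via
// FoldConstantArithmetic.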
26130
26131/// Handle vector element shifts by a splat shift amount
26132static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
26133 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
26134 const X86Subtarget &Subtarget,
26135 SelectionDAG &DAG) {
26136 MVT AmtVT = ShAmt.getSimpleValueType();
26137 assert(AmtVT.isVector() && "Vector shift type mismatch");
26138 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
26139 "Illegal vector splat index");
26140
26141 // Move the splat element to the bottom element.
26142 if (ShAmtIdx != 0) {
26143 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
26144 Mask[0] = ShAmtIdx;
26145 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
26146 }
26147
26148 // Peek through any zext node if we can get back to a 128-bit source.
26149 if (AmtVT.getScalarSizeInBits() == 64 &&
26150 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
26151        ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
26152       ShAmt.getOperand(0).getValueType().isSimple() &&
26153 ShAmt.getOperand(0).getValueType().is128BitVector()) {
26154 ShAmt = ShAmt.getOperand(0);
26155 AmtVT = ShAmt.getSimpleValueType();
26156 }
26157
26158 // See if we can mask off the upper elements using the existing source node.
26159 // The shift uses the entire lower 64-bits of the amount vector, so no need to
26160 // do this for vXi64 types.
26161 bool IsMasked = false;
26162 if (AmtVT.getScalarSizeInBits() < 64) {
26163 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
26164 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
26165 // If the shift amount has come from a scalar, then zero-extend the scalar
26166 // before moving to the vector.
26167 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
26168 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26169 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
26170 AmtVT = MVT::v4i32;
26171 IsMasked = true;
26172 } else if (ShAmt.getOpcode() == ISD::AND) {
26173 // See if the shift amount is already masked (e.g. for rotation modulo),
26174 // then we can zero-extend it by setting all the other mask elements to
26175 // zero.
26176 SmallVector<SDValue> MaskElts(
26177 AmtVT.getVectorNumElements(),
26178 DAG.getConstant(0, dl, AmtVT.getScalarType()));
26179 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
26180 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
26181 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
26182 {ShAmt.getOperand(1), Mask}))) {
26183 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
26184 IsMasked = true;
26185 }
26186 }
26187 }
26188
26189 // Extract if the shift amount vector is larger than 128-bits.
26190 if (AmtVT.getSizeInBits() > 128) {
26191 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
26192 AmtVT = ShAmt.getSimpleValueType();
26193 }
26194
26195 // Zero-extend bottom element to v2i64 vector type, either by extension or
26196 // shuffle masking.
26197 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
26198 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
26199 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
26200 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
26201 } else if (Subtarget.hasSSE41()) {
26202 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
26203 MVT::v2i64, ShAmt);
26204 } else {
26205 SDValue ByteShift = DAG.getTargetConstant(
26206 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
26207 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
26208 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26209 ByteShift);
26210 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26211 ByteShift);
26212 }
26213 }
26214
26215 // Change opcode to non-immediate version.
26216  Opc = getTargetVShiftUniformOpcode(Opc, true);
26217
26218 // The return type has to be a 128-bit type with the same element
26219 // type as the input type.
26220 MVT EltVT = VT.getVectorElementType();
26221 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
26222
26223 ShAmt = DAG.getBitcast(ShVT, ShAmt);
26224 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
26225}
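// Rough sketch of the normalization above for a v8i16 shift whose splatted
// amount lives in lane 3 of a v8i16 amount vector (SSE2 only, no SSE4.1):
//   t0 = vector_shuffle Amt, undef, <3,-1,-1,-1,-1,-1,-1,-1>  ; splat to lane 0
//   t1 = bitcast t0 to v16i8
//   t2 = VSRLDQ (VSHLDQ t1, 14), 14        ; byte shifts zero the upper lanes
//   res = X86ISD::VSRL Src, bitcast(t2)    ; HW reads only the low 64 bits
// With SSE4.1 the two byte shifts are replaced by a single
// zero_extend_vector_inreg to v2i64.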
26226
26227/// Return Mask with the necessary casting or extending
26228/// for \p Mask according to \p MaskVT when lowering masking intrinsics
26229static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
26230 const X86Subtarget &Subtarget, SelectionDAG &DAG,
26231 const SDLoc &dl) {
26232
26233 if (isAllOnesConstant(Mask))
26234 return DAG.getConstant(1, dl, MaskVT);
26235 if (X86::isZeroNode(Mask))
26236 return DAG.getConstant(0, dl, MaskVT);
26237
26238 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
26239
26240 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
26241 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
26242 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
26243     // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
26244 SDValue Lo, Hi;
26245 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
26246 Lo = DAG.getBitcast(MVT::v32i1, Lo);
26247 Hi = DAG.getBitcast(MVT::v32i1, Hi);
26248 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
26249 } else {
26250 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
26251 Mask.getSimpleValueType().getSizeInBits());
26252     // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements are
26253     // extracted by EXTRACT_SUBVECTOR.
26254 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
26255 DAG.getBitcast(BitcastVT, Mask),
26256 DAG.getVectorIdxConstant(0, dl));
26257 }
26258}
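// For illustration, two common shapes handled above:
//   i8 mask, MaskVT == v4i1:
//     t0  = bitcast i8 mask to v8i1
//     res = extract_subvector t0, 0               ; keep the low 4 bits
//   i64 mask on a 32-bit target, MaskVT == v64i1 (AVX512BW):
//     lo,hi = split the i64 into two i32 halves
//     res   = concat_vectors (bitcast lo to v32i1), (bitcast hi to v32i1)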
26259
26260/// Return (and \p Op, \p Mask) for compare instructions or
26261/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
26262/// necessary casting or extending for \p Mask when lowering masking intrinsics
26263 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
26264                                     SDValue PreservedSrc,
26265 const X86Subtarget &Subtarget,
26266 SelectionDAG &DAG) {
26267 MVT VT = Op.getSimpleValueType();
26268 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
26269 unsigned OpcodeSelect = ISD::VSELECT;
26270 SDLoc dl(Op);
26271
26272 if (isAllOnesConstant(Mask))
26273 return Op;
26274
26275 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26276
26277 if (PreservedSrc.isUndef())
26278 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26279 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
26280}
26281
26282/// Creates an SDNode for a predicated scalar operation.
26283/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
26284/// The mask is coming as MVT::i8 and it should be transformed
26285/// to MVT::v1i1 while lowering masking intrinsics.
26286/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
26287/// "X86select" instead of "vselect". We just can't create the "vselect" node
26288/// for a scalar instruction.
26289 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
26290                                     SDValue PreservedSrc,
26291 const X86Subtarget &Subtarget,
26292 SelectionDAG &DAG) {
26293 auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
26294 if (MaskConst && (MaskConst->getZExtValue() & 0x1))
26295 return Op;
26296
26297 MVT VT = Op.getSimpleValueType();
26298 SDLoc dl(Op);
26299
26300   assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
26301 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
26302 DAG.getBitcast(MVT::v8i1, Mask),
26303 DAG.getVectorIdxConstant(0, dl));
26304 if (Op.getOpcode() == X86ISD::FSETCCM ||
26305 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
26306 Op.getOpcode() == X86ISD::VFPCLASSS)
26307 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
26308
26309 if (PreservedSrc.isUndef())
26310 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26311
26312 if (MaskConst) {
26313 assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask");
26314 // Discard op and blend passthrough with scalar op src/dst.
26315    SmallVector<int> ShuffleMask(VT.getVectorNumElements());
26316    std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
26317 ShuffleMask[0] = VT.getVectorNumElements();
26318 return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc,
26319 ShuffleMask);
26320 }
26321
26322 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
26323}
26324
26325 static int getSEHRegistrationNodeSize(const Function *Fn) {
26326   if (!Fn->hasPersonalityFn())
26327     report_fatal_error(
26328         "querying registration node size for function without personality");
26329 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
26330 // WinEHStatePass for the full struct definition.
26331 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
26332 case EHPersonality::MSVC_X86SEH: return 24;
26333 case EHPersonality::MSVC_CXX: return 16;
26334 default: break;
26335 }
26336   report_fatal_error(
26337       "can only recover FP for 32-bit MSVC EH personality functions");
26338}
26339
26340/// When the MSVC runtime transfers control to us, either to an outlined
26341/// function or when returning to a parent frame after catching an exception, we
26342/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
26343/// Here's the math:
26344/// RegNodeBase = EntryEBP - RegNodeSize
26345/// ParentFP = RegNodeBase - ParentFrameOffset
26346/// Subtracting RegNodeSize takes us to the offset of the registration node, and
26347/// subtracting the offset (negative on x86) takes us back to the parent FP.
26348 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
26349                                    SDValue EntryEBP) {
26350   MachineFunction &MF = DAG.getMachineFunction();
26351   SDLoc dl;
26352
26353 // It's possible that the parent function no longer has a personality function
26354 // if the exceptional code was optimized away, in which case we just return
26355 // the incoming EBP.
26356 if (!Fn->hasPersonalityFn())
26357 return EntryEBP;
26358
26359 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
26360 // registration, or the .set_setframe offset.
26361   MCSymbol *OffsetSym = MF.getContext().getOrCreateParentFrameOffsetSymbol(
26362       GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26363   MVT PtrVT = EntryEBP.getValueType().getSimpleVT();
26364 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
26365 SDValue ParentFrameOffset =
26366 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
26367
26368 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
26369 // prologue to RBP in the parent function.
26370 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
26371 if (Subtarget.is64Bit())
26372 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
26373
26374 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
26375 // RegNodeBase = EntryEBP - RegNodeSize
26376 // ParentFP = RegNodeBase - ParentFrameOffset
26377 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
26378 DAG.getConstant(RegNodeSize, dl, PtrVT));
26379 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
26380}
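// Worked example of the math above (illustrative numbers): for 32-bit MSVC C++
// EH, RegNodeSize is 16, so with a ParentFrameOffset of -64:
//   RegNodeBase = EntryEBP - 16
//   ParentFP    = RegNodeBase - (-64) = EntryEBP + 48
// For the 32-bit SEH personality the only difference is RegNodeSize == 24.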
26381
26382SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26383 SelectionDAG &DAG) const {
26384 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26385 auto isRoundModeCurDirection = [](SDValue Rnd) {
26386 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26387 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26388
26389 return false;
26390 };
26391 auto isRoundModeSAE = [](SDValue Rnd) {
26392 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26393 unsigned RC = C->getZExtValue();
26394      if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26395        // Clear the NO_EXC bit and check remaining bits.
26396        RC ^= X86::STATIC_ROUNDING::NO_EXC;
26397 // As a convenience we allow no other bits or explicitly
26398 // current direction.
26399 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26400 }
26401 }
26402
26403 return false;
26404 };
26405 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26406 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26407 RC = C->getZExtValue();
26409 // Clear the NO_EXC bit and check remaining bits.
26415 }
26416 }
26417
26418 return false;
26419 };
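  // Worked examples for the three predicates above, assuming the usual
  // _MM_FROUND_* encoding (TO_NEAREST_INT=0, TO_NEG_INF=1, TO_POS_INF=2,
  // TO_ZERO=3, CUR_DIRECTION=4, NO_EXC=8):
  //   Rnd = 4      -> isRoundModeCurDirection (use MXCSR rounding)
  //   Rnd = 8 | 4  -> isRoundModeSAE          (suppress exceptions only)
  //   Rnd = 8 | 3  -> isRoundModeSAEToX       (SAE with RC = TO_ZERO)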
26420
26421 SDLoc dl(Op);
26422 unsigned IntNo = Op.getConstantOperandVal(0);
26423 MVT VT = Op.getSimpleValueType();
26424 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26425
26426 // Propagate flags from original node to transformed node(s).
26427 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26428
26429 if (IntrData) {
26430 switch(IntrData->Type) {
26431 case INTR_TYPE_1OP: {
26432 // We specify 2 possible opcodes for intrinsics with rounding modes.
26433 // First, we check if the intrinsic may have non-default rounding mode,
26434 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26435 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26436 if (IntrWithRoundingModeOpcode != 0) {
26437 SDValue Rnd = Op.getOperand(2);
26438 unsigned RC = 0;
26439 if (isRoundModeSAEToX(Rnd, RC))
26440 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26441 Op.getOperand(1),
26442 DAG.getTargetConstant(RC, dl, MVT::i32));
26443 if (!isRoundModeCurDirection(Rnd))
26444 return SDValue();
26445 }
26446 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26447 Op.getOperand(1));
26448 }
26449 case INTR_TYPE_1OP_SAE: {
26450 SDValue Sae = Op.getOperand(2);
26451
26452 unsigned Opc;
26453 if (isRoundModeCurDirection(Sae))
26454 Opc = IntrData->Opc0;
26455 else if (isRoundModeSAE(Sae))
26456 Opc = IntrData->Opc1;
26457 else
26458 return SDValue();
26459
26460 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26461 }
26462 case INTR_TYPE_2OP: {
26463 SDValue Src2 = Op.getOperand(2);
26464
26465 // We specify 2 possible opcodes for intrinsics with rounding modes.
26466 // First, we check if the intrinsic may have non-default rounding mode,
26467 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26468 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26469 if (IntrWithRoundingModeOpcode != 0) {
26470 SDValue Rnd = Op.getOperand(3);
26471 unsigned RC = 0;
26472 if (isRoundModeSAEToX(Rnd, RC))
26473 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26474 Op.getOperand(1), Src2,
26475 DAG.getTargetConstant(RC, dl, MVT::i32));
26476 if (!isRoundModeCurDirection(Rnd))
26477 return SDValue();
26478 }
26479
26480 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26481 Op.getOperand(1), Src2);
26482 }
26483 case INTR_TYPE_2OP_SAE: {
26484 SDValue Sae = Op.getOperand(3);
26485
26486 unsigned Opc;
26487 if (isRoundModeCurDirection(Sae))
26488 Opc = IntrData->Opc0;
26489 else if (isRoundModeSAE(Sae))
26490 Opc = IntrData->Opc1;
26491 else
26492 return SDValue();
26493
26494 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26495 Op.getOperand(2));
26496 }
26497 case INTR_TYPE_3OP:
26498 case INTR_TYPE_3OP_IMM8: {
26499 SDValue Src1 = Op.getOperand(1);
26500 SDValue Src2 = Op.getOperand(2);
26501 SDValue Src3 = Op.getOperand(3);
26502
26503 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26504 Src3.getValueType() != MVT::i8) {
26505 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
26506 }
26507
26508 // We specify 2 possible opcodes for intrinsics with rounding modes.
26509 // First, we check if the intrinsic may have non-default rounding mode,
26510 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26511 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26512 if (IntrWithRoundingModeOpcode != 0) {
26513 SDValue Rnd = Op.getOperand(4);
26514 unsigned RC = 0;
26515 if (isRoundModeSAEToX(Rnd, RC))
26516 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26517 Src1, Src2, Src3,
26518 DAG.getTargetConstant(RC, dl, MVT::i32));
26519 if (!isRoundModeCurDirection(Rnd))
26520 return SDValue();
26521 }
26522
26523 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26524 {Src1, Src2, Src3});
26525 }
26526 case INTR_TYPE_4OP_IMM8: {
26527 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26528 SDValue Src4 = Op.getOperand(4);
26529 if (Src4.getValueType() != MVT::i8) {
26530 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
26531 }
26532
26533 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26534 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26535 Src4);
26536 }
26537 case INTR_TYPE_1OP_MASK: {
26538 SDValue Src = Op.getOperand(1);
26539 SDValue PassThru = Op.getOperand(2);
26540 SDValue Mask = Op.getOperand(3);
26541 // We add rounding mode to the Node when
26542 // - RC Opcode is specified and
26543 // - RC is not "current direction".
26544 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26545 if (IntrWithRoundingModeOpcode != 0) {
26546 SDValue Rnd = Op.getOperand(4);
26547 unsigned RC = 0;
26548 if (isRoundModeSAEToX(Rnd, RC))
26549 return getVectorMaskingNode(
26550 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26551 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26552 Mask, PassThru, Subtarget, DAG);
26553 if (!isRoundModeCurDirection(Rnd))
26554 return SDValue();
26555 }
26556 return getVectorMaskingNode(
26557 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26558 Subtarget, DAG);
26559 }
26560    case INTR_TYPE_1OP_MASK_SAE: {
26561      SDValue Src = Op.getOperand(1);
26562 SDValue PassThru = Op.getOperand(2);
26563 SDValue Mask = Op.getOperand(3);
26564 SDValue Rnd = Op.getOperand(4);
26565
26566 unsigned Opc;
26567 if (isRoundModeCurDirection(Rnd))
26568 Opc = IntrData->Opc0;
26569 else if (isRoundModeSAE(Rnd))
26570 Opc = IntrData->Opc1;
26571 else
26572 return SDValue();
26573
26574 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26575 Subtarget, DAG);
26576 }
26577 case INTR_TYPE_SCALAR_MASK: {
26578 SDValue Src1 = Op.getOperand(1);
26579 SDValue Src2 = Op.getOperand(2);
26580 SDValue passThru = Op.getOperand(3);
26581 SDValue Mask = Op.getOperand(4);
26582 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26583 // There are 2 kinds of intrinsics in this group:
26584      // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
26585 // (2) With rounding mode and sae - 7 operands.
26586 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26587 if (Op.getNumOperands() == (5U + HasRounding)) {
26588 if (HasRounding) {
26589 SDValue Rnd = Op.getOperand(5);
26590 unsigned RC = 0;
26591 if (isRoundModeSAEToX(Rnd, RC))
26592 return getScalarMaskingNode(
26593 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26594 DAG.getTargetConstant(RC, dl, MVT::i32)),
26595 Mask, passThru, Subtarget, DAG);
26596 if (!isRoundModeCurDirection(Rnd))
26597 return SDValue();
26598 }
26599 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26600 Src2),
26601 Mask, passThru, Subtarget, DAG);
26602 }
26603
26604 assert(Op.getNumOperands() == (6U + HasRounding) &&
26605 "Unexpected intrinsic form");
26606 SDValue RoundingMode = Op.getOperand(5);
26607 unsigned Opc = IntrData->Opc0;
26608 if (HasRounding) {
26609 SDValue Sae = Op.getOperand(6);
26610 if (isRoundModeSAE(Sae))
26611 Opc = IntrWithRoundingModeOpcode;
26612 else if (!isRoundModeCurDirection(Sae))
26613 return SDValue();
26614 }
26615 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26616 Src2, RoundingMode),
26617 Mask, passThru, Subtarget, DAG);
26618 }
26619    case INTR_TYPE_SCALAR_MASK_RND: {
26620      SDValue Src1 = Op.getOperand(1);
26621 SDValue Src2 = Op.getOperand(2);
26622 SDValue passThru = Op.getOperand(3);
26623 SDValue Mask = Op.getOperand(4);
26624 SDValue Rnd = Op.getOperand(5);
26625
26626 SDValue NewOp;
26627 unsigned RC = 0;
26628 if (isRoundModeCurDirection(Rnd))
26629 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26630 else if (isRoundModeSAEToX(Rnd, RC))
26631 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26632 DAG.getTargetConstant(RC, dl, MVT::i32));
26633 else
26634 return SDValue();
26635
26636 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26637 }
26638    case INTR_TYPE_SCALAR_MASK_SAE: {
26639      SDValue Src1 = Op.getOperand(1);
26640 SDValue Src2 = Op.getOperand(2);
26641 SDValue passThru = Op.getOperand(3);
26642 SDValue Mask = Op.getOperand(4);
26643 SDValue Sae = Op.getOperand(5);
26644 unsigned Opc;
26645 if (isRoundModeCurDirection(Sae))
26646 Opc = IntrData->Opc0;
26647 else if (isRoundModeSAE(Sae))
26648 Opc = IntrData->Opc1;
26649 else
26650 return SDValue();
26651
26652 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26653 Mask, passThru, Subtarget, DAG);
26654 }
26655 case INTR_TYPE_2OP_MASK: {
26656 SDValue Src1 = Op.getOperand(1);
26657 SDValue Src2 = Op.getOperand(2);
26658 SDValue PassThru = Op.getOperand(3);
26659 SDValue Mask = Op.getOperand(4);
26660 SDValue NewOp;
26661 if (IntrData->Opc1 != 0) {
26662 SDValue Rnd = Op.getOperand(5);
26663 unsigned RC = 0;
26664 if (isRoundModeSAEToX(Rnd, RC))
26665 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26666 DAG.getTargetConstant(RC, dl, MVT::i32));
26667 else if (!isRoundModeCurDirection(Rnd))
26668 return SDValue();
26669 }
26670 if (!NewOp)
26671 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26672 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26673 }
26674    case INTR_TYPE_2OP_MASK_SAE: {
26675      SDValue Src1 = Op.getOperand(1);
26676 SDValue Src2 = Op.getOperand(2);
26677 SDValue PassThru = Op.getOperand(3);
26678 SDValue Mask = Op.getOperand(4);
26679
26680 unsigned Opc = IntrData->Opc0;
26681 if (IntrData->Opc1 != 0) {
26682 SDValue Sae = Op.getOperand(5);
26683 if (isRoundModeSAE(Sae))
26684 Opc = IntrData->Opc1;
26685 else if (!isRoundModeCurDirection(Sae))
26686 return SDValue();
26687 }
26688
26689 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26690 Mask, PassThru, Subtarget, DAG);
26691 }
26692    case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26693      SDValue Src1 = Op.getOperand(1);
26694 SDValue Src2 = Op.getOperand(2);
26695 SDValue Src3 = Op.getOperand(3);
26696 SDValue PassThru = Op.getOperand(4);
26697 SDValue Mask = Op.getOperand(5);
26698 SDValue Sae = Op.getOperand(6);
26699 unsigned Opc;
26700 if (isRoundModeCurDirection(Sae))
26701 Opc = IntrData->Opc0;
26702 else if (isRoundModeSAE(Sae))
26703 Opc = IntrData->Opc1;
26704 else
26705 return SDValue();
26706
26707 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26708 Mask, PassThru, Subtarget, DAG);
26709 }
26710    case INTR_TYPE_3OP_MASK_SAE: {
26711      SDValue Src1 = Op.getOperand(1);
26712 SDValue Src2 = Op.getOperand(2);
26713 SDValue Src3 = Op.getOperand(3);
26714 SDValue PassThru = Op.getOperand(4);
26715 SDValue Mask = Op.getOperand(5);
26716
26717 unsigned Opc = IntrData->Opc0;
26718 if (IntrData->Opc1 != 0) {
26719 SDValue Sae = Op.getOperand(6);
26720 if (isRoundModeSAE(Sae))
26721 Opc = IntrData->Opc1;
26722 else if (!isRoundModeCurDirection(Sae))
26723 return SDValue();
26724 }
26725 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26726 Mask, PassThru, Subtarget, DAG);
26727 }
26728 case BLENDV: {
26729 SDValue Src1 = Op.getOperand(1);
26730 SDValue Src2 = Op.getOperand(2);
26731 SDValue Src3 = Op.getOperand(3);
26732
26733 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26734 Src3 = DAG.getBitcast(MaskVT, Src3);
26735
26736 // Reverse the operands to match VSELECT order.
26737 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26738 }
26739 case VPERM_2OP : {
26740 SDValue Src1 = Op.getOperand(1);
26741 SDValue Src2 = Op.getOperand(2);
26742
26743 // Swap Src1 and Src2 in the node creation
26744 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26745 }
26746 case CFMA_OP_MASKZ:
26747 case CFMA_OP_MASK: {
26748 SDValue Src1 = Op.getOperand(1);
26749 SDValue Src2 = Op.getOperand(2);
26750 SDValue Src3 = Op.getOperand(3);
26751 SDValue Mask = Op.getOperand(4);
26752 MVT VT = Op.getSimpleValueType();
26753
26754 SDValue PassThru = Src3;
26755 if (IntrData->Type == CFMA_OP_MASKZ)
26756 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26757
26758 // We add rounding mode to the Node when
26759 // - RC Opcode is specified and
26760 // - RC is not "current direction".
26761 SDValue NewOp;
26762 if (IntrData->Opc1 != 0) {
26763 SDValue Rnd = Op.getOperand(5);
26764 unsigned RC = 0;
26765 if (isRoundModeSAEToX(Rnd, RC))
26766 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26767 DAG.getTargetConstant(RC, dl, MVT::i32));
26768 else if (!isRoundModeCurDirection(Rnd))
26769 return SDValue();
26770 }
26771 if (!NewOp)
26772 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26773 if (IntrData->Opc0 == X86ISD::VFMADDCSH ||
26774 IntrData->Opc0 == X86ISD::VFCMADDCSH)
26775 return getScalarMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26776 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26777 }
26778 case IFMA_OP:
26779 // NOTE: We need to swizzle the operands to pass the multiply operands
26780 // first.
26781 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26782 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26783 case FPCLASSS: {
26784 SDValue Src1 = Op.getOperand(1);
26785 SDValue Imm = Op.getOperand(2);
26786 SDValue Mask = Op.getOperand(3);
26787 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26788 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26789 Subtarget, DAG);
26790 // Need to fill with zeros to ensure the bitcast will produce zeroes
26791 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26792 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26793 DAG.getConstant(0, dl, MVT::v8i1), FPclassMask,
26794 DAG.getVectorIdxConstant(0, dl));
26795 return DAG.getBitcast(MVT::i8, Ins);
26796 }
26797
26798 case CMP_MASK_CC: {
26799 MVT MaskVT = Op.getSimpleValueType();
26800 SDValue CC = Op.getOperand(3);
26801 SDValue Mask = Op.getOperand(4);
26802 // We specify 2 possible opcodes for intrinsics with rounding modes.
26803 // First, we check if the intrinsic may have non-default rounding mode,
26804 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26805 if (IntrData->Opc1 != 0) {
26806 SDValue Sae = Op.getOperand(5);
26807 if (isRoundModeSAE(Sae))
26808 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26809 Op.getOperand(2), CC, Mask, Sae);
26810 if (!isRoundModeCurDirection(Sae))
26811 return SDValue();
26812 }
26813      // Default rounding mode.
26814 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26815 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26816 }
26817 case CMP_MASK_SCALAR_CC: {
26818 SDValue Src1 = Op.getOperand(1);
26819 SDValue Src2 = Op.getOperand(2);
26820 SDValue CC = Op.getOperand(3);
26821 SDValue Mask = Op.getOperand(4);
26822
26823 SDValue Cmp;
26824 if (IntrData->Opc1 != 0) {
26825 SDValue Sae = Op.getOperand(5);
26826 if (isRoundModeSAE(Sae))
26827 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26828 else if (!isRoundModeCurDirection(Sae))
26829 return SDValue();
26830 }
26831      // Default rounding mode.
26832 if (!Cmp.getNode())
26833 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26834
26835 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26836 Subtarget, DAG);
26837 // Need to fill with zeros to ensure the bitcast will produce zeroes
26838 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26839 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26840 DAG.getConstant(0, dl, MVT::v8i1), CmpMask,
26841 DAG.getVectorIdxConstant(0, dl));
26842 return DAG.getBitcast(MVT::i8, Ins);
26843 }
26844 case COMI: { // Comparison intrinsics
26845 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26846 SDValue LHS = Op.getOperand(1);
26847 SDValue RHS = Op.getOperand(2);
26848 // Some conditions require the operands to be swapped.
26849 if (CC == ISD::SETLT || CC == ISD::SETLE)
26850 std::swap(LHS, RHS);
26851
26852      // For AVX10.2, support EQ and NE.
26853 bool HasAVX10_2_COMX =
26854 Subtarget.hasAVX10_2() && (CC == ISD::SETEQ || CC == ISD::SETNE);
26855
26856 // AVX10.2 COMPARE supports only v2f64, v4f32 or v8f16.
26857 // For BF type we need to fall back.
26858 bool HasAVX10_2_COMX_Ty = (LHS.getSimpleValueType() != MVT::v8bf16);
26859
26860 auto ComiOpCode = IntrData->Opc0;
26861 auto isUnordered = (ComiOpCode == X86ISD::UCOMI);
26862
26863 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty)
26864 ComiOpCode = isUnordered ? X86ISD::UCOMX : X86ISD::COMX;
26865
26866 SDValue Comi = DAG.getNode(ComiOpCode, dl, MVT::i32, LHS, RHS);
26867
26868 SDValue SetCC;
26869 switch (CC) {
26870 case ISD::SETEQ: {
26871 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26872 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1
26873 break;
26874 // (ZF = 1 and PF = 0)
26875 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26876 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26877 break;
26878 }
26879 case ISD::SETNE: {
26880 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26881 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0
26882 break;
26883 // (ZF = 0 or PF = 1)
26884 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26885 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26886 break;
26887 }
26888 case ISD::SETGT: // (CF = 0 and ZF = 0)
26889 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26890 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26891 break;
26892 }
26893 case ISD::SETGE: // CF = 0
26894 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26895 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26896 break;
26897 default:
26898 llvm_unreachable("Unexpected illegal condition!");
26899 }
26900 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26901 }
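    // For reference, the (U)COMIS flag mapping the code above relies on:
    //   EQ : ZF == 1 && PF == 0   -> sete AND'ed with setnp
    //   NE : ZF == 0 || PF == 1   -> setne OR'ed with setp
    //   GT : CF == 0 && ZF == 0   -> COND_A   (LT swaps the operands first)
    //   GE : CF == 0              -> COND_AE  (LE swaps the operands first)
    // With AVX10.2's COMX/UCOMX, EQ/NE map directly onto ZF with no PF fixup.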
26902 case COMI_RM: { // Comparison intrinsics with Sae
26903 SDValue LHS = Op.getOperand(1);
26904 SDValue RHS = Op.getOperand(2);
26905 unsigned CondVal = Op.getConstantOperandVal(3);
26906 SDValue Sae = Op.getOperand(4);
26907
26908 SDValue FCmp;
26909 if (isRoundModeCurDirection(Sae))
26910 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26911 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26912 else if (isRoundModeSAE(Sae))
26913 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26914 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26915 else
26916 return SDValue();
26917 // Need to fill with zeros to ensure the bitcast will produce zeroes
26918 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26919 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26920 DAG.getConstant(0, dl, MVT::v16i1), FCmp,
26921 DAG.getVectorIdxConstant(0, dl));
26922 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26923 DAG.getBitcast(MVT::i16, Ins));
26924 }
26925 case VSHIFT: {
26926 SDValue SrcOp = Op.getOperand(1);
26927 SDValue ShAmt = Op.getOperand(2);
26928 assert(ShAmt.getValueType() == MVT::i32 &&
26929 "Unexpected VSHIFT amount type");
26930
26931 // Catch shift-by-constant.
26932 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26933 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26934 Op.getSimpleValueType(), SrcOp,
26935 CShAmt->getZExtValue(), DAG);
26936
26937 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26938 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26939 SrcOp, ShAmt, 0, Subtarget, DAG);
26940 }
26941    case COMPRESS_EXPAND_IN_REG: {
26942      SDValue Mask = Op.getOperand(3);
26943 SDValue DataToCompress = Op.getOperand(1);
26944 SDValue PassThru = Op.getOperand(2);
26945 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26946 return Op.getOperand(1);
26947
26948 // Avoid false dependency.
26949 if (PassThru.isUndef())
26950 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26951
26952 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26953 Mask);
26954 }
26955 case FIXUPIMM:
26956 case FIXUPIMM_MASKZ: {
26957 SDValue Src1 = Op.getOperand(1);
26958 SDValue Src2 = Op.getOperand(2);
26959 SDValue Src3 = Op.getOperand(3);
26960 SDValue Imm = Op.getOperand(4);
26961 SDValue Mask = Op.getOperand(5);
26962 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26963 ? Src1
26964 : getZeroVector(VT, Subtarget, DAG, dl);
26965
26966 unsigned Opc = IntrData->Opc0;
26967 if (IntrData->Opc1 != 0) {
26968 SDValue Sae = Op.getOperand(6);
26969 if (isRoundModeSAE(Sae))
26970 Opc = IntrData->Opc1;
26971 else if (!isRoundModeCurDirection(Sae))
26972 return SDValue();
26973 }
26974
26975 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26976
26977      if (VT.isVector())
26978        return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26979
26980 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26981 }
26982 case ROUNDP: {
26983 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26984 // Clear the upper bits of the rounding immediate so that the legacy
26985 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26986 uint64_t Round = Op.getConstantOperandVal(2);
26987 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26988 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26989 Op.getOperand(1), RoundingMode);
26990 }
26991 case ROUNDS: {
26992 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26993 // Clear the upper bits of the rounding immediate so that the legacy
26994 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26995 uint64_t Round = Op.getConstantOperandVal(3);
26996 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26997 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26998 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26999 }
27000 case BEXTRI: {
27001 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
27002
27003 uint64_t Imm = Op.getConstantOperandVal(2);
27004 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
27005 Op.getValueType());
27006 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27007 Op.getOperand(1), Control);
27008 }
27009 // ADC/SBB
27010 case ADX: {
27011 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
27012 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
27013
27014 SDValue Res;
27015 // If the carry in is zero, then we should just use ADD/SUB instead of
27016 // ADC/SBB.
27017 if (isNullConstant(Op.getOperand(1))) {
27018 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
27019 Op.getOperand(3));
27020 } else {
27021 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
27022 DAG.getAllOnesConstant(dl, MVT::i8));
27023 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
27024 Op.getOperand(3), GenCF.getValue(1));
27025 }
27026 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
27027 SDValue Results[] = { SetCC, Res };
27028 return DAG.getMergeValues(Results, dl);
27029 }
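    // For illustration: when the incoming carry is not a constant zero, the
    // code above rematerializes CF by adding 0xff to the i8 carry-in -- any
    // non-zero byte produces a carry -- and feeds that CF into ADC/SBB:
    //   t0  = X86ISD::ADD carry_in, -1     ; sets CF iff carry_in != 0
    //   res = ADC/SBB a, b, t0.CF
    // A constant-zero carry skips this and emits a plain ADD/SUB instead.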
27030 case CVTPD2PS_MASK:
27031 case CVTPD2DQ_MASK:
27032 case CVTQQ2PS_MASK:
27033 case TRUNCATE_TO_REG: {
27034 SDValue Src = Op.getOperand(1);
27035 SDValue PassThru = Op.getOperand(2);
27036 SDValue Mask = Op.getOperand(3);
27037
27038 if (isAllOnesConstant(Mask))
27039 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27040
27041 MVT SrcVT = Src.getSimpleValueType();
27042 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27043 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27044 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27045 {Src, PassThru, Mask});
27046 }
27047 case TRUNCATE2_TO_REG: {
27048 SDValue Src = Op.getOperand(1);
27049 SDValue Src2 = Op.getOperand(2);
27050 SDValue PassThru = Op.getOperand(3);
27051 SDValue Mask = Op.getOperand(4);
27052
27053 if (isAllOnesConstant(Mask))
27054 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2});
27055
27056 MVT Src2VT = Src2.getSimpleValueType();
27057 MVT MaskVT = MVT::getVectorVT(MVT::i1, Src2VT.getVectorNumElements());
27058 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27059 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27060 {Src, Src2, PassThru, Mask});
27061 }
27062 case CVTPS2PH_MASK: {
27063 SDValue Src = Op.getOperand(1);
27064 SDValue Rnd = Op.getOperand(2);
27065 SDValue PassThru = Op.getOperand(3);
27066 SDValue Mask = Op.getOperand(4);
27067
27068 unsigned RC = 0;
27069 unsigned Opc = IntrData->Opc0;
27070 bool SAE = Src.getValueType().is512BitVector() &&
27071 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
27072 if (SAE) {
27074 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
27075 }
27076
27077 if (isAllOnesConstant(Mask))
27078 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
27079
27080 if (SAE)
27082 else
27083 Opc = IntrData->Opc1;
27084 MVT SrcVT = Src.getSimpleValueType();
27085 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27086 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27087 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
27088 }
27089 case CVTNEPS2BF16_MASK: {
27090 SDValue Src = Op.getOperand(1);
27091 SDValue PassThru = Op.getOperand(2);
27092 SDValue Mask = Op.getOperand(3);
27093
27094 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
27095 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27096
27097 // Break false dependency.
27098 if (PassThru.isUndef())
27099 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
27100
27101 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
27102 Mask);
27103 }
27104 default:
27105 break;
27106 }
27107 }
27108
27109 switch (IntNo) {
27110 default: return SDValue(); // Don't custom lower most intrinsics.
27111
27112 // ptest and testp intrinsics. The intrinsic these come from are designed to
27113 // return an integer value, not just an instruction so lower it to the ptest
27114 // or testp pattern and a setcc for the result.
27115 case Intrinsic::x86_avx512_ktestc_b:
27116 case Intrinsic::x86_avx512_ktestc_w:
27117 case Intrinsic::x86_avx512_ktestc_d:
27118 case Intrinsic::x86_avx512_ktestc_q:
27119 case Intrinsic::x86_avx512_ktestz_b:
27120 case Intrinsic::x86_avx512_ktestz_w:
27121 case Intrinsic::x86_avx512_ktestz_d:
27122 case Intrinsic::x86_avx512_ktestz_q:
27123 case Intrinsic::x86_sse41_ptestz:
27124 case Intrinsic::x86_sse41_ptestc:
27125 case Intrinsic::x86_sse41_ptestnzc:
27126 case Intrinsic::x86_avx_ptestz_256:
27127 case Intrinsic::x86_avx_ptestc_256:
27128 case Intrinsic::x86_avx_ptestnzc_256:
27129 case Intrinsic::x86_avx_vtestz_ps:
27130 case Intrinsic::x86_avx_vtestc_ps:
27131 case Intrinsic::x86_avx_vtestnzc_ps:
27132 case Intrinsic::x86_avx_vtestz_pd:
27133 case Intrinsic::x86_avx_vtestc_pd:
27134 case Intrinsic::x86_avx_vtestnzc_pd:
27135 case Intrinsic::x86_avx_vtestz_ps_256:
27136 case Intrinsic::x86_avx_vtestc_ps_256:
27137 case Intrinsic::x86_avx_vtestnzc_ps_256:
27138 case Intrinsic::x86_avx_vtestz_pd_256:
27139 case Intrinsic::x86_avx_vtestc_pd_256:
27140 case Intrinsic::x86_avx_vtestnzc_pd_256: {
27141 unsigned TestOpc = X86ISD::PTEST;
27142 X86::CondCode X86CC;
27143 switch (IntNo) {
27144 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
27145 case Intrinsic::x86_avx512_ktestc_b:
27146 case Intrinsic::x86_avx512_ktestc_w:
27147 case Intrinsic::x86_avx512_ktestc_d:
27148 case Intrinsic::x86_avx512_ktestc_q:
27149 // CF = 1
27150 TestOpc = X86ISD::KTEST;
27151 X86CC = X86::COND_B;
27152 break;
27153 case Intrinsic::x86_avx512_ktestz_b:
27154 case Intrinsic::x86_avx512_ktestz_w:
27155 case Intrinsic::x86_avx512_ktestz_d:
27156 case Intrinsic::x86_avx512_ktestz_q:
27157 TestOpc = X86ISD::KTEST;
27158 X86CC = X86::COND_E;
27159 break;
27160 case Intrinsic::x86_avx_vtestz_ps:
27161 case Intrinsic::x86_avx_vtestz_pd:
27162 case Intrinsic::x86_avx_vtestz_ps_256:
27163 case Intrinsic::x86_avx_vtestz_pd_256:
27164 TestOpc = X86ISD::TESTP;
27165 [[fallthrough]];
27166 case Intrinsic::x86_sse41_ptestz:
27167 case Intrinsic::x86_avx_ptestz_256:
27168 // ZF = 1
27169 X86CC = X86::COND_E;
27170 break;
27171 case Intrinsic::x86_avx_vtestc_ps:
27172 case Intrinsic::x86_avx_vtestc_pd:
27173 case Intrinsic::x86_avx_vtestc_ps_256:
27174 case Intrinsic::x86_avx_vtestc_pd_256:
27175 TestOpc = X86ISD::TESTP;
27176 [[fallthrough]];
27177 case Intrinsic::x86_sse41_ptestc:
27178 case Intrinsic::x86_avx_ptestc_256:
27179 // CF = 1
27180 X86CC = X86::COND_B;
27181 break;
27182 case Intrinsic::x86_avx_vtestnzc_ps:
27183 case Intrinsic::x86_avx_vtestnzc_pd:
27184 case Intrinsic::x86_avx_vtestnzc_ps_256:
27185 case Intrinsic::x86_avx_vtestnzc_pd_256:
27186 TestOpc = X86ISD::TESTP;
27187 [[fallthrough]];
27188 case Intrinsic::x86_sse41_ptestnzc:
27189 case Intrinsic::x86_avx_ptestnzc_256:
27190 // ZF and CF = 0
27191 X86CC = X86::COND_A;
27192 break;
27193 }
27194
27195 SDValue LHS = Op.getOperand(1);
27196 SDValue RHS = Op.getOperand(2);
27197 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
27198 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
27199 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27200 }
27201
27202 case Intrinsic::x86_sse42_pcmpistria128:
27203 case Intrinsic::x86_sse42_pcmpestria128:
27204 case Intrinsic::x86_sse42_pcmpistric128:
27205 case Intrinsic::x86_sse42_pcmpestric128:
27206 case Intrinsic::x86_sse42_pcmpistrio128:
27207 case Intrinsic::x86_sse42_pcmpestrio128:
27208 case Intrinsic::x86_sse42_pcmpistris128:
27209 case Intrinsic::x86_sse42_pcmpestris128:
27210 case Intrinsic::x86_sse42_pcmpistriz128:
27211 case Intrinsic::x86_sse42_pcmpestriz128: {
27212 unsigned Opcode;
27213 X86::CondCode X86CC;
27214 switch (IntNo) {
27215 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27216 case Intrinsic::x86_sse42_pcmpistria128:
27217 Opcode = X86ISD::PCMPISTR;
27218 X86CC = X86::COND_A;
27219 break;
27220 case Intrinsic::x86_sse42_pcmpestria128:
27221 Opcode = X86ISD::PCMPESTR;
27222 X86CC = X86::COND_A;
27223 break;
27224 case Intrinsic::x86_sse42_pcmpistric128:
27225 Opcode = X86ISD::PCMPISTR;
27226 X86CC = X86::COND_B;
27227 break;
27228 case Intrinsic::x86_sse42_pcmpestric128:
27229 Opcode = X86ISD::PCMPESTR;
27230 X86CC = X86::COND_B;
27231 break;
27232 case Intrinsic::x86_sse42_pcmpistrio128:
27233 Opcode = X86ISD::PCMPISTR;
27234 X86CC = X86::COND_O;
27235 break;
27236 case Intrinsic::x86_sse42_pcmpestrio128:
27237 Opcode = X86ISD::PCMPESTR;
27238 X86CC = X86::COND_O;
27239 break;
27240 case Intrinsic::x86_sse42_pcmpistris128:
27241 Opcode = X86ISD::PCMPISTR;
27242 X86CC = X86::COND_S;
27243 break;
27244 case Intrinsic::x86_sse42_pcmpestris128:
27245 Opcode = X86ISD::PCMPESTR;
27246 X86CC = X86::COND_S;
27247 break;
27248 case Intrinsic::x86_sse42_pcmpistriz128:
27249 Opcode = X86ISD::PCMPISTR;
27250 X86CC = X86::COND_E;
27251 break;
27252 case Intrinsic::x86_sse42_pcmpestriz128:
27253 Opcode = X86ISD::PCMPESTR;
27254 X86CC = X86::COND_E;
27255 break;
27256 }
27257 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27258 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27259 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
27260 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
27261 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27262 }
27263
27264 case Intrinsic::x86_sse42_pcmpistri128:
27265 case Intrinsic::x86_sse42_pcmpestri128: {
27266 unsigned Opcode;
27267 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
27268 Opcode = X86ISD::PCMPISTR;
27269 else
27270 Opcode = X86ISD::PCMPESTR;
27271
27272 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27273 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27274 return DAG.getNode(Opcode, dl, VTs, NewOps);
27275 }
27276
27277 case Intrinsic::x86_sse42_pcmpistrm128:
27278 case Intrinsic::x86_sse42_pcmpestrm128: {
27279 unsigned Opcode;
27280 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
27281 Opcode = X86ISD::PCMPISTR;
27282 else
27283 Opcode = X86ISD::PCMPESTR;
27284
27285 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27286 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27287 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
27288 }
27289
27290 case Intrinsic::eh_sjlj_lsda: {
27291 MachineFunction &MF = DAG.getMachineFunction();
27292 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27293 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27294 auto &Context = MF.getContext();
27295 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
27296 Twine(MF.getFunctionNumber()));
27297 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
27298 DAG.getMCSymbol(S, PtrVT));
27299 }
27300
27301 case Intrinsic::x86_seh_lsda: {
27302 // Compute the symbol for the LSDA. We know it'll get emitted later.
27303 MachineFunction &MF = DAG.getMachineFunction();
27304 SDValue Op1 = Op.getOperand(1);
27305 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
27308
27309 // Generate a simple absolute symbol reference. This intrinsic is only
27310 // supported on 32-bit Windows, which isn't PIC.
27311 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
27312 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
27313 }
27314
27315 case Intrinsic::eh_recoverfp: {
27316 SDValue FnOp = Op.getOperand(1);
27317 SDValue IncomingFPOp = Op.getOperand(2);
27318 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
27319 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
27320 if (!Fn)
27321 report_fatal_error(
27322 "llvm.eh.recoverfp must take a function as the first argument");
27323 return recoverFramePointer(DAG, Fn, IncomingFPOp);
27324 }
27325
27326 case Intrinsic::localaddress: {
27327 // Returns one of the stack, base, or frame pointer registers, depending on
27328 // which is used to reference local variables.
27329 MachineFunction &MF = DAG.getMachineFunction();
27330 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27331 Register Reg;
27332 if (RegInfo->hasBasePointer(MF))
27333 Reg = RegInfo->getBaseRegister();
27334 else { // Handles the SP or FP case.
27335 bool CantUseFP = RegInfo->hasStackRealignment(MF);
27336 if (CantUseFP)
27337 Reg = RegInfo->getPtrSizedStackRegister(MF);
27338 else
27339 Reg = RegInfo->getPtrSizedFrameRegister(MF);
27340 }
27341 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
27342 }
27343 case Intrinsic::x86_avx512_vp2intersect_q_512:
27344 case Intrinsic::x86_avx512_vp2intersect_q_256:
27345 case Intrinsic::x86_avx512_vp2intersect_q_128:
27346 case Intrinsic::x86_avx512_vp2intersect_d_512:
27347 case Intrinsic::x86_avx512_vp2intersect_d_256:
27348 case Intrinsic::x86_avx512_vp2intersect_d_128: {
27349 SDLoc DL(Op);
27350 MVT MaskVT = Op.getSimpleValueType();
27351 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27352 SDValue Operation = DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
27353 Op.getOperand(1), Op.getOperand(2));
27354 SDValue Result0 =
27355 DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, MaskVT, Operation);
27356 SDValue Result1 =
27357 DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, MaskVT, Operation);
27358 return DAG.getMergeValues({Result0, Result1}, DL);
27359 }
27360 case Intrinsic::x86_mmx_pslli_w:
27361 case Intrinsic::x86_mmx_pslli_d:
27362 case Intrinsic::x86_mmx_pslli_q:
27363 case Intrinsic::x86_mmx_psrli_w:
27364 case Intrinsic::x86_mmx_psrli_d:
27365 case Intrinsic::x86_mmx_psrli_q:
27366 case Intrinsic::x86_mmx_psrai_w:
27367 case Intrinsic::x86_mmx_psrai_d: {
27368 SDLoc DL(Op);
27369 SDValue ShAmt = Op.getOperand(2);
27370 // If the argument is a constant, convert it to a target constant.
27371 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27372 // Clamp out-of-bounds shift amounts since they will otherwise be masked
27373 // to 8 bits, which may make them no longer out of bounds.
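// For example, a shift amount of 256 (0x100) would be masked down to 0, which
// is back in bounds; clamping to 255 keeps it out of bounds for every MMX
// element width, so the shift still produces the expected zero / sign-fill
// result.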
27374 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27375 if (ShiftAmount == 0)
27376 return Op.getOperand(1);
27377
27378 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27379 Op.getOperand(0), Op.getOperand(1),
27380 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27381 }
27382
27383 unsigned NewIntrinsic;
27384 switch (IntNo) {
27385 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27386 case Intrinsic::x86_mmx_pslli_w:
27387 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27388 break;
27389 case Intrinsic::x86_mmx_pslli_d:
27390 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27391 break;
27392 case Intrinsic::x86_mmx_pslli_q:
27393 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27394 break;
27395 case Intrinsic::x86_mmx_psrli_w:
27396 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27397 break;
27398 case Intrinsic::x86_mmx_psrli_d:
27399 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27400 break;
27401 case Intrinsic::x86_mmx_psrli_q:
27402 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27403 break;
27404 case Intrinsic::x86_mmx_psrai_w:
27405 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27406 break;
27407 case Intrinsic::x86_mmx_psrai_d:
27408 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27409 break;
27410 }
27411
27412 // The vector shift intrinsics with scalar shift amounts use 32-bit values,
27413 // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
27414 // MMX register.
27415 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27416 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27417 DAG.getTargetConstant(NewIntrinsic, DL,
27419 Op.getOperand(1), ShAmt);
27420 }
27421 case Intrinsic::thread_pointer: {
27422 if (Subtarget.isTargetELF()) {
27423 SDLoc dl(Op);
27424 EVT PtrVT = Op.getValueType();
27425 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27426 Value *Ptr = Constant::getNullValue(PointerType::get(
27427 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27428 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27429 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27430 }
27431 report_fatal_error(
27432 "Target OS doesn't support __builtin_thread_pointer() yet.");
27433 }
27434 }
27435}
27436
27437 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27438 SDValue Src, SDValue Mask, SDValue Base,
27439 SDValue Index, SDValue ScaleOp, SDValue Chain,
27440 const X86Subtarget &Subtarget) {
27441 SDLoc dl(Op);
27442 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27443 // Scale must be constant.
27444 if (!C)
27445 return SDValue();
27446 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27447 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27448 TLI.getPointerTy(DAG.getDataLayout()));
27449 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27450 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27451 // If source is undef or we know it won't be used, use a zero vector
27452 // to break register dependency.
27453 // TODO: use undef instead and let BreakFalseDeps deal with it?
27454 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27455 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27456
27457 // Cast mask to an integer type.
27458 Mask = DAG.getBitcast(MaskVT, Mask);
27459
27460 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27461
27462 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27463 SDValue Res =
27464 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27465 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27466 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27467}
27468
27469 static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
27470 SDValue Src, SDValue Mask, SDValue Base,
27471 SDValue Index, SDValue ScaleOp, SDValue Chain,
27472 const X86Subtarget &Subtarget) {
27473 MVT VT = Op.getSimpleValueType();
27474 SDLoc dl(Op);
27475 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27476 // Scale must be constant.
27477 if (!C)
27478 return SDValue();
27479 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27480 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27481 TLI.getPointerTy(DAG.getDataLayout()));
27482 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27483 Src.getSimpleValueType().getVectorNumElements());
27484 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27485
27486 // We support two versions of the gather intrinsics. One with scalar mask and
27487 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27488 if (Mask.getValueType() != MaskVT)
27489 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27490
27491 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27492 // If source is undef or we know it won't be used, use a zero vector
27493 // to break register dependency.
27494 // TODO: use undef instead and let BreakFalseDeps deal with it?
27495 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27496 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27497
27498 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27499
27500 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27501 SDValue Res =
27502 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27503 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27504 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27505}
27506
27507 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27508 SDValue Src, SDValue Mask, SDValue Base,
27509 SDValue Index, SDValue ScaleOp, SDValue Chain,
27510 const X86Subtarget &Subtarget) {
27511 SDLoc dl(Op);
27512 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27513 // Scale must be constant.
27514 if (!C)
27515 return SDValue();
27516 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27517 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27518 TLI.getPointerTy(DAG.getDataLayout()));
27519 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27520 Src.getSimpleValueType().getVectorNumElements());
27521 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27522
27523 // We support two versions of the scatter intrinsics. One with scalar mask and
27524 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27525 if (Mask.getValueType() != MaskVT)
27526 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27527
27528 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27529
27530 SDVTList VTs = DAG.getVTList(MVT::Other);
27531 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27532 SDValue Res =
27533 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
27534 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27535 return Res;
27536}
27537
27538 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27539 SDValue Mask, SDValue Base, SDValue Index,
27540 SDValue ScaleOp, SDValue Chain,
27541 const X86Subtarget &Subtarget) {
27542 SDLoc dl(Op);
27543 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27544 // Scale must be constant.
27545 if (!C)
27546 return SDValue();
27547 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27548 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27549 TLI.getPointerTy(DAG.getDataLayout()));
27550 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27551 SDValue Segment = DAG.getRegister(0, MVT::i32);
27552 MVT MaskVT =
27553 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27554 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27555 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27556 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27557 return SDValue(Res, 0);
27558}
27559
27560/// Handles the lowering of builtin intrinsics with chain that return their
27561/// value into registers EDX:EAX.
27562 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
27563/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
27564/// TargetOpcode.
27565/// Returns a Glue value which can be used to add extra copy-from-reg if the
27566 /// expanded intrinsic implicitly defines extra registers (i.e. not just
27567 /// EDX:EAX).
27568 static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
27569 SelectionDAG &DAG,
27570 unsigned TargetOpcode,
27571 unsigned SrcReg,
27572 const X86Subtarget &Subtarget,
27573 SmallVectorImpl<SDValue> &Results) {
27574 SDValue Chain = N->getOperand(0);
27575 SDValue Glue;
27576
27577 if (SrcReg) {
27578 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27579 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27580 Glue = Chain.getValue(1);
27581 }
27582
27583 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27584 SDValue N1Ops[] = {Chain, Glue};
27585 SDNode *N1 = DAG.getMachineNode(
27586 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27587 Chain = SDValue(N1, 0);
27588
27589 // Reads the content of XCR and returns it in registers EDX:EAX.
27590 SDValue LO, HI;
27591 if (Subtarget.is64Bit()) {
27592 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27593 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27594 LO.getValue(2));
27595 } else {
27596 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27597 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27598 LO.getValue(2));
27599 }
27600 Chain = HI.getValue(1);
27601 Glue = HI.getValue(2);
27602
27603 if (Subtarget.is64Bit()) {
27604 // Merge the two 32-bit values into a 64-bit one.
27605 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27606 DAG.getConstant(32, DL, MVT::i8));
27607 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27608 Results.push_back(Chain);
27609 return Glue;
27610 }
27611
27612 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27613 SDValue Ops[] = { LO, HI };
27614 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27615 Results.push_back(Pair);
27616 Results.push_back(Chain);
27617 return Glue;
27618}
27619
27620/// Handles the lowering of builtin intrinsics that read the time stamp counter
27621/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
27622/// READCYCLECOUNTER nodes.
27623static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
27624 SelectionDAG &DAG,
27625 const X86Subtarget &Subtarget,
27626 SmallVectorImpl<SDValue> &Results) {
27627 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27628 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27629 // and the EAX register is loaded with the low-order 32 bits.
27630 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
27631 /* NoRegister */0, Subtarget,
27632 Results);
27633 if (Opcode != X86::RDTSCP)
27634 return;
27635
27636 SDValue Chain = Results[1];
27637 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
27638 // the ECX register. Add 'ecx' explicitly to the chain.
27639 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
27640 Results[1] = ecx;
27641 Results.push_back(ecx.getValue(1));
27642}
27643
27644 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
27645 SelectionDAG &DAG) {
27646 SmallVector<SDValue, 3> Results;
27647 SDLoc DL(Op);
27648 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27649 Results);
27650 return DAG.getMergeValues(Results, DL);
27651}
27652
27653 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
27654 MachineFunction &MF = DAG.getMachineFunction();
27655 SDValue Chain = Op.getOperand(0);
27656 SDValue RegNode = Op.getOperand(2);
27657 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27658 if (!EHInfo)
27659 report_fatal_error("EH registrations only live in functions using WinEH");
27660
27661 // Cast the operand to an alloca, and remember the frame index.
27662 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27663 if (!FINode)
27664 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27665 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27666
27667 // Return the chain operand without making any DAG nodes.
27668 return Chain;
27669}
27670
27671 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
27672 MachineFunction &MF = DAG.getMachineFunction();
27673 SDValue Chain = Op.getOperand(0);
27674 SDValue EHGuard = Op.getOperand(2);
27675 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27676 if (!EHInfo)
27677 report_fatal_error("EHGuard only live in functions using WinEH");
27678
27679 // Cast the operand to an alloca, and remember the frame index.
27680 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27681 if (!FINode)
27682 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27683 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27684
27685 // Return the chain operand without making any DAG nodes.
27686 return Chain;
27687}
27688
27689/// Emit Truncating Store with signed or unsigned saturation.
27690static SDValue
27691EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
27692 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27693 SelectionDAG &DAG) {
27694 SDVTList VTs = DAG.getVTList(MVT::Other);
27695 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27696 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27697 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27698 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27699}
27700
27701/// Emit Masked Truncating Store with signed or unsigned saturation.
27702static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
27703 const SDLoc &DL,
27704 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27705 MachineMemOperand *MMO, SelectionDAG &DAG) {
27706 SDVTList VTs = DAG.getVTList(MVT::Other);
27707 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27708 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27709 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27710}
27711
27712 bool X86::isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
27713 const MachineFunction &MF) {
27714 if (!Subtarget.is64Bit())
27715 return false;
27716 // 64-bit targets support extended Swift async frame setup,
27717 // except for targets that use the Windows 64 prologue.
27718 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
27719}
27720
27721 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
27722 SelectionDAG &DAG) {
27723 unsigned IntNo = Op.getConstantOperandVal(1);
27724 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27725 if (!IntrData) {
27726 switch (IntNo) {
27727
27728 case Intrinsic::swift_async_context_addr: {
27729 SDLoc dl(Op);
27730 auto &MF = DAG.getMachineFunction();
27731 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
27732 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
27734 X86FI->setHasSwiftAsyncContext(true);
27735 SDValue Chain = Op->getOperand(0);
27736 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
27737 SDValue Result =
27738 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
27739 DAG.getTargetConstant(8, dl, MVT::i32)),
27740 0);
27741 // Return { result, chain }.
27742 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27743 CopyRBP.getValue(1));
27744 } else {
27745 // No special extended frame, create or reuse an existing stack slot.
27746 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
27747 if (!X86FI->getSwiftAsyncContextFrameIdx())
27748 X86FI->setSwiftAsyncContextFrameIdx(
27749 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
27750 false));
27751 SDValue Result =
27752 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
27753 PtrSize == 8 ? MVT::i64 : MVT::i32);
27754 // Return { result, chain }.
27755 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27756 Op->getOperand(0));
27757 }
27758 }
27759
27760 case llvm::Intrinsic::x86_seh_ehregnode:
27761 return MarkEHRegistrationNode(Op, DAG);
27762 case llvm::Intrinsic::x86_seh_ehguard:
27763 return MarkEHGuard(Op, DAG);
27764 case llvm::Intrinsic::x86_rdpkru: {
27765 SDLoc dl(Op);
27766 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27767 // Create a RDPKRU node and pass 0 to the ECX parameter.
27768 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27769 DAG.getConstant(0, dl, MVT::i32));
27770 }
27771 case llvm::Intrinsic::x86_wrpkru: {
27772 SDLoc dl(Op);
27773 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27774 // to the EDX and ECX parameters.
27775 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27776 Op.getOperand(0), Op.getOperand(2),
27777 DAG.getConstant(0, dl, MVT::i32),
27778 DAG.getConstant(0, dl, MVT::i32));
27779 }
27780 case llvm::Intrinsic::asan_check_memaccess: {
27781 // Mark this as adjustsStack because it will be lowered to a call.
27782 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
27783 // Don't do anything here, we will expand these intrinsics out later.
27784 return Op;
27785 }
27786 case llvm::Intrinsic::x86_flags_read_u32:
27787 case llvm::Intrinsic::x86_flags_read_u64:
27788 case llvm::Intrinsic::x86_flags_write_u32:
27789 case llvm::Intrinsic::x86_flags_write_u64: {
27790 // We need a frame pointer because this will get lowered to a PUSH/POP
27791 // sequence.
27792 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27793 MFI.setHasCopyImplyingStackAdjustment(true);
27794 // Don't do anything here, we will expand these intrinsics out later
27795 // during FinalizeISel in EmitInstrWithCustomInserter.
27796 return Op;
27797 }
27798 case Intrinsic::x86_lwpins32:
27799 case Intrinsic::x86_lwpins64:
27800 case Intrinsic::x86_umwait:
27801 case Intrinsic::x86_tpause: {
27802 SDLoc dl(Op);
27803 SDValue Chain = Op->getOperand(0);
27804 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27805 unsigned Opcode;
27806
27807 switch (IntNo) {
27808 default: llvm_unreachable("Impossible intrinsic");
27809 case Intrinsic::x86_umwait:
27810 Opcode = X86ISD::UMWAIT;
27811 break;
27812 case Intrinsic::x86_tpause:
27813 Opcode = X86ISD::TPAUSE;
27814 break;
27815 case Intrinsic::x86_lwpins32:
27816 case Intrinsic::x86_lwpins64:
27817 Opcode = X86ISD::LWPINS;
27818 break;
27819 }
27820
27821 SDValue Operation =
27822 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27823 Op->getOperand(3), Op->getOperand(4));
27824 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27825 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27826 Operation.getValue(1));
27827 }
27828 case Intrinsic::x86_enqcmd:
27829 case Intrinsic::x86_enqcmds: {
27830 SDLoc dl(Op);
27831 SDValue Chain = Op.getOperand(0);
27832 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27833 unsigned Opcode;
27834 switch (IntNo) {
27835 default: llvm_unreachable("Impossible intrinsic!");
27836 case Intrinsic::x86_enqcmd:
27837 Opcode = X86ISD::ENQCMD;
27838 break;
27839 case Intrinsic::x86_enqcmds:
27840 Opcode = X86ISD::ENQCMDS;
27841 break;
27842 }
27843 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27844 Op.getOperand(3));
27845 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27846 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27847 Operation.getValue(1));
27848 }
27849 case Intrinsic::x86_aesenc128kl:
27850 case Intrinsic::x86_aesdec128kl:
27851 case Intrinsic::x86_aesenc256kl:
27852 case Intrinsic::x86_aesdec256kl: {
27853 SDLoc DL(Op);
27854 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27855 SDValue Chain = Op.getOperand(0);
27856 unsigned Opcode;
27857
27858 switch (IntNo) {
27859 default: llvm_unreachable("Impossible intrinsic");
27860 case Intrinsic::x86_aesenc128kl:
27861 Opcode = X86ISD::AESENC128KL;
27862 break;
27863 case Intrinsic::x86_aesdec128kl:
27864 Opcode = X86ISD::AESDEC128KL;
27865 break;
27866 case Intrinsic::x86_aesenc256kl:
27867 Opcode = X86ISD::AESENC256KL;
27868 break;
27869 case Intrinsic::x86_aesdec256kl:
27870 Opcode = X86ISD::AESDEC256KL;
27871 break;
27872 }
27873
27874 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27875 MachineMemOperand *MMO = MemIntr->getMemOperand();
27876 EVT MemVT = MemIntr->getMemoryVT();
27877 SDValue Operation = DAG.getMemIntrinsicNode(
27878 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27879 MMO);
27880 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27881
27882 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27883 {ZF, Operation.getValue(0), Operation.getValue(2)});
27884 }
27885 case Intrinsic::x86_aesencwide128kl:
27886 case Intrinsic::x86_aesdecwide128kl:
27887 case Intrinsic::x86_aesencwide256kl:
27888 case Intrinsic::x86_aesdecwide256kl: {
27889 SDLoc DL(Op);
27890 SDVTList VTs = DAG.getVTList(
27891 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27892 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27893 SDValue Chain = Op.getOperand(0);
27894 unsigned Opcode;
27895
27896 switch (IntNo) {
27897 default: llvm_unreachable("Impossible intrinsic");
27898 case Intrinsic::x86_aesencwide128kl:
27899 Opcode = X86ISD::AESENCWIDE128KL;
27900 break;
27901 case Intrinsic::x86_aesdecwide128kl:
27902 Opcode = X86ISD::AESDECWIDE128KL;
27903 break;
27904 case Intrinsic::x86_aesencwide256kl:
27905 Opcode = X86ISD::AESENCWIDE256KL;
27906 break;
27907 case Intrinsic::x86_aesdecwide256kl:
27908 Opcode = X86ISD::AESDECWIDE256KL;
27909 break;
27910 }
27911
27912 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27913 MachineMemOperand *MMO = MemIntr->getMemOperand();
27914 EVT MemVT = MemIntr->getMemoryVT();
27915 SDValue Operation = DAG.getMemIntrinsicNode(
27916 Opcode, DL, VTs,
27917 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27918 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27919 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27920 MemVT, MMO);
27921 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27922
27923 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27924 {ZF, Operation.getValue(1), Operation.getValue(2),
27925 Operation.getValue(3), Operation.getValue(4),
27926 Operation.getValue(5), Operation.getValue(6),
27927 Operation.getValue(7), Operation.getValue(8),
27928 Operation.getValue(9)});
27929 }
27930 case Intrinsic::x86_testui: {
27931 SDLoc dl(Op);
27932 SDValue Chain = Op.getOperand(0);
27933 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27934 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27935 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27936 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27937 Operation.getValue(1));
27938 }
27939 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27940 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27941 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27942 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27943 case Intrinsic::x86_t2rpntlvwz0_internal:
27944 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27945 case Intrinsic::x86_t2rpntlvwz1_internal:
27946 case Intrinsic::x86_t2rpntlvwz1t1_internal: {
27947 auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
27949 unsigned IntNo = Op.getConstantOperandVal(1);
27950 unsigned Opc = 0;
27951 switch (IntNo) {
27952 default:
27953 llvm_unreachable("Unexpected intrinsic!");
27954 case Intrinsic::x86_t2rpntlvwz0_internal:
27955 Opc = X86::PT2RPNTLVWZ0V;
27956 break;
27957 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27958 Opc = X86::PT2RPNTLVWZ0T1V;
27959 break;
27960 case Intrinsic::x86_t2rpntlvwz1_internal:
27961 Opc = X86::PT2RPNTLVWZ1V;
27962 break;
27963 case Intrinsic::x86_t2rpntlvwz1t1_internal:
27964 Opc = X86::PT2RPNTLVWZ1T1V;
27965 break;
27966 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27967 Opc = X86::PT2RPNTLVWZ0RSV;
27968 break;
27969 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27970 Opc = X86::PT2RPNTLVWZ0RST1V;
27971 break;
27972 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27973 Opc = X86::PT2RPNTLVWZ1RSV;
27974 break;
27975 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27976 Opc = X86::PT2RPNTLVWZ1RST1V;
27977 break;
27978 }
27979
27980 SDLoc DL(Op);
27981 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27982
27983 SDValue Ops[] = {Op.getOperand(2), // Row
27984 Op.getOperand(3), // Col0
27985 Op.getOperand(4), // Col1
27986 Op.getOperand(5), // Base
27987 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27988 Op.getOperand(6), // Index
27989 DAG.getTargetConstant(0, DL, MVT::i32), // Disp
27990 DAG.getRegister(0, MVT::i16), // Segment
27991 Op.getOperand(0)}; // Chain
27992
27993 MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
27994 SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
27995 SDValue(Res, 0));
27996 SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
27997 SDValue(Res, 0));
27998 return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
27999 }
28000 case Intrinsic::x86_atomic_bts_rm:
28001 case Intrinsic::x86_atomic_btc_rm:
28002 case Intrinsic::x86_atomic_btr_rm: {
28003 SDLoc DL(Op);
28004 MVT VT = Op.getSimpleValueType();
28005 SDValue Chain = Op.getOperand(0);
28006 SDValue Op1 = Op.getOperand(2);
28007 SDValue Op2 = Op.getOperand(3);
28008 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
28009 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
28010 : X86ISD::LBTR_RM;
28011 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28012 SDValue Res =
28013 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28014 {Chain, Op1, Op2}, VT, MMO);
28015 Chain = Res.getValue(1);
28016 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28017 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28018 }
28019 case Intrinsic::x86_atomic_bts:
28020 case Intrinsic::x86_atomic_btc:
28021 case Intrinsic::x86_atomic_btr: {
28022 SDLoc DL(Op);
28023 MVT VT = Op.getSimpleValueType();
28024 SDValue Chain = Op.getOperand(0);
28025 SDValue Op1 = Op.getOperand(2);
28026 SDValue Op2 = Op.getOperand(3);
28027 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
28028 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
28029 : X86ISD::LBTR;
28030 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
28031 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28032 SDValue Res =
28033 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28034 {Chain, Op1, Op2, Size}, VT, MMO);
28035 Chain = Res.getValue(1);
28036 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28037 unsigned Imm = Op2->getAsZExtVal();
28038 if (Imm)
28039 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
28040 DAG.getShiftAmountConstant(Imm, VT, DL));
28041 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28042 }
28043 case Intrinsic::x86_cmpccxadd32:
28044 case Intrinsic::x86_cmpccxadd64: {
28045 SDLoc DL(Op);
28046 SDValue Chain = Op.getOperand(0);
28047 SDValue Addr = Op.getOperand(2);
28048 SDValue Src1 = Op.getOperand(3);
28049 SDValue Src2 = Op.getOperand(4);
28050 SDValue CC = Op.getOperand(5);
28051 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28052 SDValue Operation = DAG.getMemIntrinsicNode(
28053 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
28054 MVT::i32, MMO);
28055 return Operation;
28056 }
28057 case Intrinsic::x86_aadd32:
28058 case Intrinsic::x86_aadd64:
28059 case Intrinsic::x86_aand32:
28060 case Intrinsic::x86_aand64:
28061 case Intrinsic::x86_aor32:
28062 case Intrinsic::x86_aor64:
28063 case Intrinsic::x86_axor32:
28064 case Intrinsic::x86_axor64: {
28065 SDLoc DL(Op);
28066 SDValue Chain = Op.getOperand(0);
28067 SDValue Op1 = Op.getOperand(2);
28068 SDValue Op2 = Op.getOperand(3);
28069 MVT VT = Op2.getSimpleValueType();
28070 unsigned Opc = 0;
28071 switch (IntNo) {
28072 default:
28073 llvm_unreachable("Unknown Intrinsic");
28074 case Intrinsic::x86_aadd32:
28075 case Intrinsic::x86_aadd64:
28076 Opc = X86ISD::AADD;
28077 break;
28078 case Intrinsic::x86_aand32:
28079 case Intrinsic::x86_aand64:
28080 Opc = X86ISD::AAND;
28081 break;
28082 case Intrinsic::x86_aor32:
28083 case Intrinsic::x86_aor64:
28084 Opc = X86ISD::AOR;
28085 break;
28086 case Intrinsic::x86_axor32:
28087 case Intrinsic::x86_axor64:
28088 Opc = X86ISD::AXOR;
28089 break;
28090 }
28091 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
28092 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
28093 {Chain, Op1, Op2}, VT, MMO);
28094 }
28095 case Intrinsic::x86_atomic_add_cc:
28096 case Intrinsic::x86_atomic_sub_cc:
28097 case Intrinsic::x86_atomic_or_cc:
28098 case Intrinsic::x86_atomic_and_cc:
28099 case Intrinsic::x86_atomic_xor_cc: {
28100 SDLoc DL(Op);
28101 SDValue Chain = Op.getOperand(0);
28102 SDValue Op1 = Op.getOperand(2);
28103 SDValue Op2 = Op.getOperand(3);
28104 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
28105 MVT VT = Op2.getSimpleValueType();
28106 unsigned Opc = 0;
28107 switch (IntNo) {
28108 default:
28109 llvm_unreachable("Unknown Intrinsic");
28110 case Intrinsic::x86_atomic_add_cc:
28111 Opc = X86ISD::LADD;
28112 break;
28113 case Intrinsic::x86_atomic_sub_cc:
28114 Opc = X86ISD::LSUB;
28115 break;
28116 case Intrinsic::x86_atomic_or_cc:
28117 Opc = X86ISD::LOR;
28118 break;
28119 case Intrinsic::x86_atomic_and_cc:
28120 Opc = X86ISD::LAND;
28121 break;
28122 case Intrinsic::x86_atomic_xor_cc:
28123 Opc = X86ISD::LXOR;
28124 break;
28125 }
28126 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28127 SDValue LockArith =
28128 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28129 {Chain, Op1, Op2}, VT, MMO);
28130 Chain = LockArith.getValue(1);
28131 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
28132 }
28133 }
28134 return SDValue();
28135 }
28136
28137 SDLoc dl(Op);
28138 switch(IntrData->Type) {
28139 default: llvm_unreachable("Unknown Intrinsic Type");
28140 case RDSEED:
28141 case RDRAND: {
28142 // Emit the node with the right value type.
28143 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
28144 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28145
28146 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
28147 // Otherwise return the value from Rand, which is always 0, cast to i32.
28148 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
28149 DAG.getConstant(1, dl, Op->getValueType(1)),
28150 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
28151 SDValue(Result.getNode(), 1)};
28152 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
28153
28154 // Return { result, isValid, chain }.
28155 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
28156 SDValue(Result.getNode(), 2));
28157 }
28158 case GATHER_AVX2: {
28159 SDValue Chain = Op.getOperand(0);
28160 SDValue Src = Op.getOperand(2);
28161 SDValue Base = Op.getOperand(3);
28162 SDValue Index = Op.getOperand(4);
28163 SDValue Mask = Op.getOperand(5);
28164 SDValue Scale = Op.getOperand(6);
28165 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28166 Scale, Chain, Subtarget);
28167 }
28168 case GATHER: {
28169 //gather(v1, mask, index, base, scale);
28170 SDValue Chain = Op.getOperand(0);
28171 SDValue Src = Op.getOperand(2);
28172 SDValue Base = Op.getOperand(3);
28173 SDValue Index = Op.getOperand(4);
28174 SDValue Mask = Op.getOperand(5);
28175 SDValue Scale = Op.getOperand(6);
28176 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
28177 Chain, Subtarget);
28178 }
28179 case SCATTER: {
28180 //scatter(base, mask, index, v1, scale);
28181 SDValue Chain = Op.getOperand(0);
28182 SDValue Base = Op.getOperand(2);
28183 SDValue Mask = Op.getOperand(3);
28184 SDValue Index = Op.getOperand(4);
28185 SDValue Src = Op.getOperand(5);
28186 SDValue Scale = Op.getOperand(6);
28187 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28188 Scale, Chain, Subtarget);
28189 }
28190 case PREFETCH: {
28191 const APInt &HintVal = Op.getConstantOperandAPInt(6);
28192 assert((HintVal == 2 || HintVal == 3) &&
28193 "Wrong prefetch hint in intrinsic: should be 2 or 3");
28194 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
28195 SDValue Chain = Op.getOperand(0);
28196 SDValue Mask = Op.getOperand(2);
28197 SDValue Index = Op.getOperand(3);
28198 SDValue Base = Op.getOperand(4);
28199 SDValue Scale = Op.getOperand(5);
28200 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
28201 Subtarget);
28202 }
28203 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
28204 case RDTSC: {
28205 SmallVector<SDValue, 2> Results;
28206 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
28207 Results);
28208 return DAG.getMergeValues(Results, dl);
28209 }
28210 // Read Performance Monitoring Counters.
28211 case RDPMC:
28212 // Read Processor Register.
28213 case RDPRU:
28214 // Get Extended Control Register.
28215 case XGETBV: {
28216 SmallVector<SDValue, 2> Results;
28217
28218 // RDPMC uses ECX to select the index of the performance counter to read.
28219 // RDPRU uses ECX to select the processor register to read.
28220 // XGETBV uses ECX to select the index of the XCR register to return.
28221 // The result is stored into registers EDX:EAX.
28222 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
28223 Subtarget, Results);
28224 return DAG.getMergeValues(Results, dl);
28225 }
28226 // XTEST intrinsics.
28227 case XTEST: {
28228 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
28229 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28230
28231 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
28232 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
28233 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
28234 Ret, SDValue(InTrans.getNode(), 1));
28235 }
28236 case TRUNCATE_TO_MEM_VI8:
28237 case TRUNCATE_TO_MEM_VI16:
28238 case TRUNCATE_TO_MEM_VI32: {
28239 SDValue Mask = Op.getOperand(4);
28240 SDValue DataToTruncate = Op.getOperand(3);
28241 SDValue Addr = Op.getOperand(2);
28242 SDValue Chain = Op.getOperand(0);
28243
28244 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
28245 assert(MemIntr && "Expected MemIntrinsicSDNode!");
28246
28247 EVT MemVT = MemIntr->getMemoryVT();
28248
28249 uint16_t TruncationOp = IntrData->Opc0;
28250 switch (TruncationOp) {
28251 case X86ISD::VTRUNC: {
28252 if (isAllOnesConstant(Mask)) // return just a truncate store
28253 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
28254 MemIntr->getMemOperand());
28255
28256 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28257 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28258 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
28259
28260 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
28261 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
28262 true /* truncating */);
28263 }
28264 case X86ISD::VTRUNCUS:
28265 case X86ISD::VTRUNCS: {
28266 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
28267 if (isAllOnesConstant(Mask))
28268 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
28269 MemIntr->getMemOperand(), DAG);
28270
28271 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28272 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28273
28274 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
28275 VMask, MemVT, MemIntr->getMemOperand(), DAG);
28276 }
28277 default:
28278 llvm_unreachable("Unsupported truncstore intrinsic");
28279 }
28280 }
28281 case INTR_TYPE_CAST_MMX:
28282 return SDValue(); // handled in combineINTRINSIC_*
28283 }
28284}
28285
28286SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
28287 SelectionDAG &DAG) const {
28288 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28289 MFI.setReturnAddressIsTaken(true);
28290
28291 unsigned Depth = Op.getConstantOperandVal(0);
28292 SDLoc dl(Op);
28293 EVT PtrVT = Op.getValueType();
28294
28295 if (Depth > 0) {
28296 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
28297 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28298 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
28299 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28300 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
28301 MachinePointerInfo());
28302 }
28303
28304 // Just load the return address.
28305 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
28306 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
28307 MachinePointerInfo());
28308}
28309
28310SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
28311 SelectionDAG &DAG) const {
28312 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
28313 return getReturnAddressFrameIndex(DAG);
28314}
28315
28316SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
28317 MachineFunction &MF = DAG.getMachineFunction();
28318 MachineFrameInfo &MFI = MF.getFrameInfo();
28319 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
28320 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28321 EVT VT = Op.getValueType();
28322
28323 MFI.setFrameAddressIsTaken(true);
28324
28325 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
28326 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
28327 // is not possible to crawl up the stack without looking at the unwind codes
28328 // simultaneously.
28329 int FrameAddrIndex = FuncInfo->getFAIndex();
28330 if (!FrameAddrIndex) {
28331 // Set up a frame object for the return address.
28332 unsigned SlotSize = RegInfo->getSlotSize();
28333 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
28334 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
28335 FuncInfo->setFAIndex(FrameAddrIndex);
28336 }
28337 return DAG.getFrameIndex(FrameAddrIndex, VT);
28338 }
28339
28340 Register FrameReg =
28341 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
28342 SDLoc dl(Op); // FIXME probably not meaningful
28343 unsigned Depth = Op.getConstantOperandVal(0);
28344 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
28345 (FrameReg == X86::EBP && VT == MVT::i32)) &&
28346 "Invalid Frame Register!");
28347 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
28348 while (Depth--)
28349 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
28350 MachinePointerInfo());
28351 return FrameAddr;
28352}
28353
28354// FIXME? Maybe this could be a TableGen attribute on some registers and
28355// this table could be generated automatically from RegInfo.
28356 Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
28357 const MachineFunction &MF) const {
28358 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
28359
28360 Register Reg = StringSwitch<unsigned>(RegName)
28361 .Case("esp", X86::ESP)
28362 .Case("rsp", X86::RSP)
28363 .Case("ebp", X86::EBP)
28364 .Case("rbp", X86::RBP)
28365 .Case("r14", X86::R14)
28366 .Case("r15", X86::R15)
28367 .Default(0);
28368
28369 if (Reg == X86::EBP || Reg == X86::RBP) {
28370 if (!TFI.hasFP(MF))
28371 report_fatal_error("register " + StringRef(RegName) +
28372 " is allocatable: function has no frame pointer");
28373#ifndef NDEBUG
28374 else {
28375 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28376 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
28377 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
28378 "Invalid Frame Register!");
28379 }
28380#endif
28381 }
28382
28383 return Reg;
28384}
28385
28386SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
28387 SelectionDAG &DAG) const {
28388 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28389 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28390}
28391
28392 Register X86TargetLowering::getExceptionPointerRegister(
28393 const Constant *PersonalityFn) const {
28394 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
28395 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28396
28397 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
28398}
28399
28400 Register X86TargetLowering::getExceptionSelectorRegister(
28401 const Constant *PersonalityFn) const {
28402 // Funclet personalities don't use selectors (the runtime does the selection).
28403 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
28404 return X86::NoRegister;
28405 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28406}
28407
28408 bool X86TargetLowering::needsFixedCatchObjects() const {
28409 return Subtarget.isTargetWin64();
28410}
28411
28412SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
28413 SDValue Chain = Op.getOperand(0);
28414 SDValue Offset = Op.getOperand(1);
28415 SDValue Handler = Op.getOperand(2);
28416 SDLoc dl (Op);
28417
28418 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28419 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28420 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28421 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
28422 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
28423 "Invalid Frame Register!");
28424 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
28425 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
28426
28427 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
28428 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28429 dl));
28430 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
28431 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
28432 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
28433
28434 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
28435 DAG.getRegister(StoreAddrReg, PtrVT));
28436}
28437
28438SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28439 SelectionDAG &DAG) const {
28440 SDLoc DL(Op);
28441 // If the subtarget is not 64-bit, we may need the global base reg
28442 // after the pseudo is expanded post-isel, i.e., after the CGBR pass has run.
28443 // Therefore, ask for the GlobalBaseReg now, so that the pass
28444 // inserts the code for us in case we need it.
28445 // Otherwise, we would end up referencing a virtual register
28446 // that is not defined!
28447 if (!Subtarget.is64Bit()) {
28448 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28449 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28450 }
28451 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28452 DAG.getVTList(MVT::i32, MVT::Other),
28453 Op.getOperand(0), Op.getOperand(1));
28454}
28455
28456SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28457 SelectionDAG &DAG) const {
28458 SDLoc DL(Op);
28459 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28460 Op.getOperand(0), Op.getOperand(1));
28461}
28462
28463SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28464 SelectionDAG &DAG) const {
28465 SDLoc DL(Op);
28466 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28467 Op.getOperand(0));
28468}
28469
28470 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
28471 return Op.getOperand(0);
28472}
28473
28474SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
28475 SelectionDAG &DAG) const {
28476 SDValue Root = Op.getOperand(0);
28477 SDValue Trmp = Op.getOperand(1); // trampoline
28478 SDValue FPtr = Op.getOperand(2); // nested function
28479 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
28480 SDLoc dl (Op);
28481
28482 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
28483 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
28484
28485 if (Subtarget.is64Bit()) {
28486 SDValue OutChains[6];
28487
28488 // Large code-model.
28489 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
28490 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
28491
28492 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
28493 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
28494
28495 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
28496
28497 // Load the pointer to the nested function into R11.
28498 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
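// (With REX_WB = 0x49 and MOV64ri | N86R11 = 0xbb, the 16-bit store below
// places the bytes 0x49 0xbb in memory, the little-endian start of
// "movabs $imm64, %r11"; the 8-byte target address is stored at offset 2.)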
28499 SDValue Addr = Trmp;
28500 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28501 Addr, MachinePointerInfo(TrmpAddr));
28502
28503 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28504 DAG.getConstant(2, dl, MVT::i64));
28505 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
28506 MachinePointerInfo(TrmpAddr, 2), Align(2));
28507
28508 // Load the 'nest' parameter value into R10.
28509 // R10 is specified in X86CallingConv.td
28510 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
28511 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28512 DAG.getConstant(10, dl, MVT::i64));
28513 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28514 Addr, MachinePointerInfo(TrmpAddr, 10));
28515
28516 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28517 DAG.getConstant(12, dl, MVT::i64));
28518 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
28519 MachinePointerInfo(TrmpAddr, 12), Align(2));
28520
28521 // Jump to the nested function.
28522 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
28523 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28524 DAG.getConstant(20, dl, MVT::i64));
28525 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28526 Addr, MachinePointerInfo(TrmpAddr, 20));
28527
28528 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
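// (ModRM here is 0xe3: mod = 0b11, reg = 4 selects the JMP r/m form of the
// 0xFF opcode stored above, and rm holds r11's low bits, so the sequence
// decodes as "jmp *%r11".)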
28529 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28530 DAG.getConstant(22, dl, MVT::i64));
28531 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
28532 Addr, MachinePointerInfo(TrmpAddr, 22));
28533
28534 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28535 } else {
28536 const Function *Func =
28537 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28538 CallingConv::ID CC = Func->getCallingConv();
28539 unsigned NestReg;
28540
28541 switch (CC) {
28542 default:
28543 llvm_unreachable("Unsupported calling convention");
28544 case CallingConv::C:
28545 case CallingConv::X86_StdCall: {
28546 // Pass 'nest' parameter in ECX.
28547 // Must be kept in sync with X86CallingConv.td
28548 NestReg = X86::ECX;
28549
28550 // Check that ECX wasn't needed by an 'inreg' parameter.
28551 FunctionType *FTy = Func->getFunctionType();
28552 const AttributeList &Attrs = Func->getAttributes();
28553
28554 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28555 unsigned InRegCount = 0;
28556 unsigned Idx = 0;
28557
28558 for (FunctionType::param_iterator I = FTy->param_begin(),
28559 E = FTy->param_end(); I != E; ++I, ++Idx)
28560 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28561 const DataLayout &DL = DAG.getDataLayout();
28562 // FIXME: should only count parameters that are lowered to integers.
28563 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
28564 }
28565
28566 if (InRegCount > 2) {
28567 report_fatal_error("Nest register in use - reduce number of inreg"
28568 " parameters!");
28569 }
28570 }
28571 break;
28572 }
28573 case CallingConv::X86_FastCall:
28574 case CallingConv::X86_ThisCall:
28575 case CallingConv::Fast:
28576 case CallingConv::Tail:
28577 case CallingConv::SwiftTail:
28578 // Pass 'nest' parameter in EAX.
28579 // Must be kept in sync with X86CallingConv.td
28580 NestReg = X86::EAX;
28581 break;
28582 }
28583
28584 SDValue OutChains[4];
28585 SDValue Addr, Disp;
28586
28587 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28588 DAG.getConstant(10, dl, MVT::i32));
28589 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
28590
28591 // This is storing the opcode for MOV32ri.
28592 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
28593 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28594 OutChains[0] =
28595 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
28596 Trmp, MachinePointerInfo(TrmpAddr));
28597
28598 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28599 DAG.getConstant(1, dl, MVT::i32));
28600 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
28601 MachinePointerInfo(TrmpAddr, 1), Align(1));
28602
28603 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
28604 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28605 DAG.getConstant(5, dl, MVT::i32));
28606 OutChains[2] =
28607 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
28608 MachinePointerInfo(TrmpAddr, 5), Align(1));
28609
28610 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28611 DAG.getConstant(6, dl, MVT::i32));
28612 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
28613 MachinePointerInfo(TrmpAddr, 6), Align(1));
28614
28615 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28616 }
28617}
28618
28619SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
28620 SelectionDAG &DAG) const {
28621 /*
28622 The rounding mode is in bits 11:10 of FPSR, and has the following
28623 settings:
28624 00 Round to nearest
28625 01 Round to -inf
28626 10 Round to +inf
28627 11 Round to 0
28628
28629 GET_ROUNDING, on the other hand, expects the following:
28630 -1 Undefined
28631 0 Round to 0
28632 1 Round to nearest
28633 2 Round to +inf
28634 3 Round to -inf
28635
28636 To perform the conversion, we use a packed lookup table of the four 2-bit
28637 values that we can index by FPSR[11:10]
28638 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
28639
28640 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
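 For example, FPSR[11:10] = 01 (round toward -inf) gives FPSR & 0xc00 = 0x400,
 0x400 >> 9 = 2, and (0x2d >> 2) & 3 = 3, which is GET_ROUNDING's encoding of
 "Round to -inf".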
28641 */
28642
28643 MachineFunction &MF = DAG.getMachineFunction();
28644 MVT VT = Op.getSimpleValueType();
28645 SDLoc DL(Op);
28646
28647 // Save FP Control Word to stack slot
28648 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
28649 SDValue StackSlot =
28650 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
28651
28652 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
28653
28654 SDValue Chain = Op.getOperand(0);
28655 SDValue Ops[] = {Chain, StackSlot};
28656 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
28657 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
28658 Align(2), MachineMemOperand::MOStore);
28659
28660 // Load FP Control Word from stack slot
28661 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
28662 Chain = CWD.getValue(1);
28663
28664 // Mask and turn the control bits into a shift for the lookup table.
28665 SDValue Shift =
28666 DAG.getNode(ISD::SRL, DL, MVT::i16,
28667 DAG.getNode(ISD::AND, DL, MVT::i16,
28668 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
28669 DAG.getConstant(9, DL, MVT::i8));
28670 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
28671
28672 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
28673 SDValue RetVal =
28674 DAG.getNode(ISD::AND, DL, MVT::i32,
28675 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
28676 DAG.getConstant(3, DL, MVT::i32));
28677
28678 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
28679
28680 return DAG.getMergeValues({RetVal, Chain}, DL);
28681}
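// A minimal compile-time sketch of the 0x2d lookup used above, assuming a
// plain 16-bit x87 control-word value; the helper name is illustrative and
// not used by the lowering.
static constexpr unsigned roundingFromFPCWForDocs(unsigned CW) {
  return (0x2d >> ((CW & 0xc00) >> 9)) & 3;
}
static_assert(roundingFromFPCWForDocs(0x000) == 1, "RC=00 -> round to nearest");
static_assert(roundingFromFPCWForDocs(0x400) == 3, "RC=01 -> round to -inf");
static_assert(roundingFromFPCWForDocs(0x800) == 2, "RC=10 -> round to +inf");
static_assert(roundingFromFPCWForDocs(0xC00) == 0, "RC=11 -> round to zero");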
28682
28683SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
28684 SelectionDAG &DAG) const {
28685 MachineFunction &MF = DAG.getMachineFunction();
28686 SDLoc DL(Op);
28687 SDValue Chain = Op.getNode()->getOperand(0);
28688
28689 // FP control word may be set only from data in memory. So we need to allocate
28690 // stack space to save/load FP control word.
28691 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
28692 SDValue StackSlot =
28693 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
28694 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
28695 MachineMemOperand *MMO =
28696 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
28697
28698 // Store FP control word into memory.
28699 SDValue Ops[] = {Chain, StackSlot};
28700 Chain = DAG.getMemIntrinsicNode(
28701 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
28702
28703 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
28704 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
28705 Chain = CWD.getValue(1);
28706 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
28707 DAG.getConstant(0xf3ff, DL, MVT::i16));
28708
28709 // Calculate new rounding mode.
28710 SDValue NewRM = Op.getNode()->getOperand(1);
28711 SDValue RMBits;
28712 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
28713 uint64_t RM = CVal->getZExtValue();
28714 int FieldVal = X86::getRoundingModeX86(RM);
28715
28716 if (FieldVal == X86::rmInvalid) {
28717 LLVMContext &C = MF.getFunction().getContext();
28718 C.diagnose(DiagnosticInfoUnsupported(
28719 MF.getFunction(), "rounding mode is not supported by X86 hardware",
28720 DiagnosticLocation(DL.getDebugLoc()), DS_Error));
28721 return {};
28722 }
28723 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28724 } else {
28725 // Need to convert argument into bits of control word:
28726 // 0 Round to 0 -> 11
28727 // 1 Round to nearest -> 00
28728 // 2 Round to +inf -> 10
28729 // 3 Round to -inf -> 01
28730 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
28731 // To perform the conversion, pack these 2-bit values into the constant 0xc9 and shift
28732 // it left depending on the rounding mode:
28733 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28734 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
28735 // ...
28736 // (0xc9 << (2 * NewRM + 4)) & 0xc00
28737 SDValue ShiftValue =
28738 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28739 DAG.getNode(ISD::ADD, DL, MVT::i32,
28740 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28741 DAG.getConstant(1, DL, MVT::i8)),
28742 DAG.getConstant(4, DL, MVT::i32)));
28743 SDValue Shifted =
28744 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28745 ShiftValue);
28746 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28747 DAG.getConstant(0xc00, DL, MVT::i16));
28748 }
28749
28750 // Update rounding mode bits and store the new FP Control Word into stack.
28751 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28752 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
28753
28754 // Load FP control word from the slot.
28755 SDValue OpsLD[] = {Chain, StackSlot};
28756 MachineMemOperand *MMOL =
28757 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
28758 Chain = DAG.getMemIntrinsicNode(
28759 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28760
28761 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28762 // same way but in bits 14:13.
28763 if (Subtarget.hasSSE1()) {
28764 // Store MXCSR into memory.
28765 Chain = DAG.getNode(
28766 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28767 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28768 StackSlot);
28769
28770 // Load MXCSR from stack slot and clear RM field (bits 14:13).
28771 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28772 Chain = CWD.getValue(1);
28773 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28774 DAG.getConstant(0xffff9fff, DL, MVT::i32));
28775
28776 // Shift X87 RM bits from 11:10 to 14:13.
28777 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28778 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28779 DAG.getConstant(3, DL, MVT::i8));
28780
28781 // Update rounding mode bits and store the new FP Control Word into stack.
28782 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28783 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
28784
28785 // Load MXCSR from the slot.
28786 Chain = DAG.getNode(
28787 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28788 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28789 StackSlot);
28790 }
28791
28792 return Chain;
28793}
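// A minimal compile-time sketch of the conversions used above: the 0xc9 trick
// maps a GET_ROUNDING value back to the x87 RM field (bits 11:10), and MXCSR
// uses the same encoding shifted up to bits 14:13. The helper name is
// illustrative and not used by the lowering.
static constexpr unsigned fpcwRMFieldForDocs(unsigned NewRM) {
  return (0xc9u << (2 * NewRM + 4)) & 0xc00;
}
static_assert(fpcwRMFieldForDocs(0) == 0xC00, "round to zero    -> RC=11");
static_assert(fpcwRMFieldForDocs(1) == 0x000, "round to nearest -> RC=00");
static_assert(fpcwRMFieldForDocs(2) == 0x800, "round to +inf    -> RC=10");
static_assert(fpcwRMFieldForDocs(3) == 0x400, "round to -inf    -> RC=01");
static_assert((0xC00 << 3) == 0x6000 && (0x400 << 3) == 0x2000,
              "x87 RM bits 11:10 shifted left by 3 land in MXCSR bits 14:13");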
28794
28795const unsigned X87StateSize = 28;
28796const unsigned FPStateSize = 32;
28797[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
28798
28799SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
28800 SelectionDAG &DAG) const {
28801 MachineFunction &MF = DAG.getMachineFunction();
28802 SDLoc DL(Op);
28803 SDValue Chain = Op->getOperand(0);
28804 SDValue Ptr = Op->getOperand(1);
28805 auto *Node = cast<FPStateAccessSDNode>(Op);
28806 EVT MemVT = Node->getMemoryVT();
28808 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28809
28810 // Get the x87 state, if present.
28811 if (Subtarget.hasX87()) {
28812 Chain =
28813 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
28814 {Chain, Ptr}, MemVT, MMO);
28815
28816 // FNSTENV changes the exception mask, so load back the stored environment.
28817 MachineMemOperand::Flags NewFlags =
28820 MMO = MF.getMachineMemOperand(MMO, NewFlags);
28821 Chain =
28822 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28823 {Chain, Ptr}, MemVT, MMO);
28824 }
28825
28826 // If target supports SSE, get MXCSR as well.
28827 if (Subtarget.hasSSE1()) {
28828 // Get pointer to the MXCSR location in memory.
28830 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28831 DAG.getConstant(X87StateSize, DL, PtrVT));
28832 // Store MXCSR into memory.
28833 Chain = DAG.getNode(
28834 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28835 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28836 MXCSRAddr);
28837 }
28838
28839 return Chain;
28840}
28841
28842 static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL,
28843 EVT MemVT, MachineMemOperand *MMO,
28844 SelectionDAG &DAG,
28845 const X86Subtarget &Subtarget) {
28846 // Set the x87 state, if present.
28847 if (Subtarget.hasX87())
28848 Chain =
28849 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28850 {Chain, Ptr}, MemVT, MMO);
28851 // If target supports SSE, set MXCSR as well.
28852 if (Subtarget.hasSSE1()) {
28853 // Get pointer to the MXCSR location in memory.
28855 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28856 DAG.getConstant(X87StateSize, DL, PtrVT));
28857 // Load MXCSR from memory.
28858 Chain = DAG.getNode(
28859 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28860 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28861 MXCSRAddr);
28862 }
28863 return Chain;
28864}
28865
28866SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
28867 SelectionDAG &DAG) const {
28868 SDLoc DL(Op);
28869 SDValue Chain = Op->getOperand(0);
28870 SDValue Ptr = Op->getOperand(1);
28871 auto *Node = cast<FPStateAccessSDNode>(Op);
28872 EVT MemVT = Node->getMemoryVT();
28874 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28875 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
28876}
28877
28878SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
28879 SelectionDAG &DAG) const {
28880 MachineFunction &MF = DAG.getMachineFunction();
28881 SDLoc DL(Op);
28882 SDValue Chain = Op.getNode()->getOperand(0);
28883
28884 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
28885 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
28887
28888 // x87 FPU Control Word: masks all floating-point exceptions, sets rounding to
28889 // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
28890 // for compatibility with glibc.
28891 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
28892 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
28893 Constant *Zero = ConstantInt::get(ItemTy, 0);
28894 for (unsigned I = 0; I < 6; ++I)
28895 FPEnvVals.push_back(Zero);
28896
28897 // MXCSR: masks all floating-point exceptions, sets rounding to nearest, clears
28898 // all exception flags, and sets DAZ and FTZ to 0.
28899 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
28900 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
28901 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28902 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
28903 MachinePointerInfo MPI =
28905 MachineMemOperand *MMO = MF.getMachineMemOperand(
28907
28908 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
28909}
28910
28911// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28912uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
28913 assert((Amt < 8) && "Shift/Rotation amount out of range");
28914 switch (Opcode) {
28915 case ISD::BITREVERSE:
28916 return 0x8040201008040201ULL;
28917 case ISD::SHL:
28918 return ((0x0102040810204080ULL >> (Amt)) &
28919 (0x0101010101010101ULL * (0xFF >> (Amt))));
28920 case ISD::SRL:
28921 return ((0x0102040810204080ULL << (Amt)) &
28922 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
28923 case ISD::SRA:
28924 return (getGFNICtrlImm(ISD::SRL, Amt) |
28925 (0x8080808080808080ULL >> (64 - (8 * Amt))));
28926 case ISD::ROTL:
28927 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
28928 case ISD::ROTR:
28929 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
28930 }
28931 llvm_unreachable("Unsupported GFNI opcode");
28932}
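// Compile-time worked examples of the shift immediates produced above
// (illustrative checks of the same arithmetic; they are not used by the
// lowering): shifting the identity pattern and masking out the bits that
// would cross a byte boundary yields the control mask for a 1-bit shift.
static_assert(((0x0102040810204080ULL >> 1) &
               (0x0101010101010101ULL * (0xFF >> 1))) == 0x0001020408102040ULL,
              "getGFNICtrlImm(ISD::SHL, 1)");
static_assert(((0x0102040810204080ULL << 1) &
               (0x0101010101010101ULL * ((0xFF << 1) & 0xFF))) ==
                  0x0204081020408000ULL,
              "getGFNICtrlImm(ISD::SRL, 1)");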
28933
28934// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28935SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL,
28936 MVT VT, unsigned Amt = 0) {
28937 assert(VT.getVectorElementType() == MVT::i8 &&
28938 (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
28939 uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
28940 SmallVector<SDValue> MaskBits;
28941 for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
28942 uint64_t Bits = (Imm >> (I % 64)) & 255;
28943 MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
28944 }
28945 return DAG.getBuildVector(VT, DL, MaskBits);
28946}
28947
28948 /// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
28949 //
28950 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
28951 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
28952 // split the vector, perform the operation on its Lo and Hi parts and
28953 // concatenate the results.
28954 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
28955 const X86Subtarget &Subtarget) {
28956 assert(Op.getOpcode() == ISD::CTLZ);
28957 SDLoc dl(Op);
28958 MVT VT = Op.getSimpleValueType();
28959 MVT EltVT = VT.getVectorElementType();
28960 unsigned NumElems = VT.getVectorNumElements();
28961
28962 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28963 "Unsupported element type");
28964
28965 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
28966 if (NumElems > 16 ||
28967 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28968 return splitVectorIntUnary(Op, DAG, dl);
28969
28970 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28971 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28972 "Unsupported value type for operation");
28973
28974 // Use the natively supported vector instruction vplzcntd.
28975 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28976 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28977 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
28978 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28979
28980 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28981}
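// A scalar model of the widening trick above (illustrative only; the helper
// name is not used by the lowering): the CTLZ of an i8/i16 element equals the
// i32 CTLZ of its zero-extension minus the number of widening bits.
static constexpr unsigned ctlz32ForDocs(uint32_t X) {
  unsigned N = 0;
  for (uint32_t Bit = 1u << 31; Bit && !(X & Bit); Bit >>= 1)
    ++N;
  return N;
}
static_assert(ctlz32ForDocs(0x01u) - (32 - 8) == 7, "i8: ctlz(0x01) == 7");
static_assert(ctlz32ForDocs(0x80u) - (32 - 8) == 0, "i8: ctlz(0x80) == 0");
static_assert(ctlz32ForDocs(0x0100u) - (32 - 16) == 7, "i16: ctlz(0x0100) == 7");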
28982
28983// Lower CTLZ using a PSHUFB lookup table implementation.
28984 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
28985 const X86Subtarget &Subtarget,
28986 SelectionDAG &DAG) {
28987 MVT VT = Op.getSimpleValueType();
28988 int NumElts = VT.getVectorNumElements();
28989 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28990 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
28991
28992 // Per-nibble leading zero PSHUFB lookup table.
28993 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
28994 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
28995 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
28996 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
28997
28999 for (int i = 0; i < NumBytes; ++i)
29000 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29001 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
29002
29003 // Begin by bitcasting the input to byte vector, then split those bytes
29004 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
29005 // If the hi input nibble is zero then we add both results together, otherwise
29006 // we just take the hi result (by masking the lo result to zero before the
29007 // add).
29008 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
29009 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
29010
29011 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29012 SDValue Lo = Op0;
29013 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29014 SDValue HiZ;
29015 if (CurrVT.is512BitVector()) {
29016 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29017 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29018 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29019 } else {
29020 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29021 }
29022
29023 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29024 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29025 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29026 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29027
29028 // Merge the result from vXi8 back to VT, working on the lo/hi halves
29029 // of the current vector width in the same way we did for the nibbles.
29030 // If the upper half of the input element is zero then add the halves'
29031 // leading zero counts together, otherwise just use the upper half's.
29032 // Double the width of the result until we are at target width.
29033 while (CurrVT != VT) {
29034 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29035 int CurrNumElts = CurrVT.getVectorNumElements();
29036 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29037 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29038 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29039
29040 // Check if the upper half of the input element is zero.
29041 if (CurrVT.is512BitVector()) {
29042 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29043 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29044 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29045 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29046 } else {
29047 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29048 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29049 }
29050 HiZ = DAG.getBitcast(NextVT, HiZ);
29051
29052 // Move the upper/lower halves to the lower bits as we'll be extending to
29053 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29054 // together.
29055 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29056 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29057 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29058 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29059 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29060 CurrVT = NextVT;
29061 }
29062
29063 return Res;
29064}
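// A scalar model of the PSHUFB LUT algorithm above (illustrative only; the
// helper name is not used by the lowering): look up the leading-zero count of
// the high nibble, and only when the high nibble is zero add the low-nibble
// count on top of its 4 leading zeros.
static constexpr unsigned ctlz8ViaNibbleLUTForDocs(unsigned Byte) {
  constexpr unsigned LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
  unsigned Hi = (Byte >> 4) & 0xF, Lo = Byte & 0xF;
  return Hi ? LUT[Hi] : LUT[Hi] + LUT[Lo];
}
static_assert(ctlz8ViaNibbleLUTForDocs(0x00) == 8, "all-zero byte");
static_assert(ctlz8ViaNibbleLUTForDocs(0x01) == 7, "lowest bit set");
static_assert(ctlz8ViaNibbleLUTForDocs(0x1F) == 3, "high nibble 0x1");
static_assert(ctlz8ViaNibbleLUTForDocs(0x80) == 0, "top bit set");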
29065
29066 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
29067 const X86Subtarget &Subtarget,
29068 SelectionDAG &DAG) {
29069 MVT VT = Op.getSimpleValueType();
29070
29071 if (Subtarget.hasCDI() &&
29072 // vXi8 vectors need to be promoted to 512-bits for vXi32.
29073 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29074 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29075
29076 // Decompose 256-bit ops into smaller 128-bit ops.
29077 if (VT.is256BitVector() && !Subtarget.hasInt256())
29078 return splitVectorIntUnary(Op, DAG, DL);
29079
29080 // Decompose 512-bit ops into smaller 256-bit ops.
29081 if (VT.is512BitVector() && !Subtarget.hasBWI())
29082 return splitVectorIntUnary(Op, DAG, DL);
29083
29084 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29085 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29086}
29087
29088 static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL,
29089 SelectionDAG &DAG,
29090 const X86Subtarget &Subtarget) {
29091 MVT VT = Op.getSimpleValueType();
29092 SDValue Input = Op.getOperand(0);
29093
29094 assert(VT.isVector() && VT.getVectorElementType() == MVT::i8 &&
29095 "Expected vXi8 input for GFNI-based CTLZ lowering");
29096
29097 SDValue Reversed = DAG.getNode(ISD::BITREVERSE, DL, VT, Input);
29098
29099 SDValue Neg = DAG.getNegative(Reversed, DL, VT);
29100 SDValue Filtered = DAG.getNode(ISD::AND, DL, VT, Reversed, Neg);
29101
29102 MVT VT64 = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
29103 SDValue CTTZConst = DAG.getConstant(0xAACCF0FF00000000ULL, DL, VT64);
29104 SDValue CTTZMatrix = DAG.getBitcast(VT, CTTZConst);
29105
29106 SDValue LZCNT =
29107 DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, Filtered, CTTZMatrix,
29108 DAG.getTargetConstant(8, DL, MVT::i8));
29109 return LZCNT;
29110}
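// A scalar model of the GFNI approach above (illustrative only; helper names
// are not used by the lowering): reversing the bits turns the leading-zero
// count into a trailing-zero count, and x & -x then isolates a single set bit
// for the affine lookup.
static constexpr unsigned bitrev8ForDocs(unsigned X) {
  unsigned R = 0;
  for (unsigned I = 0; I != 8; ++I)
    R |= ((X >> I) & 1u) << (7 - I);
  return R;
}
static constexpr unsigned cttz8ForDocs(unsigned X) {
  unsigned N = 0;
  for (unsigned Bit = 1; Bit != 0x100 && !(X & Bit); Bit <<= 1)
    ++N;
  return N;
}
static_assert(cttz8ForDocs(bitrev8ForDocs(0x01)) == 7, "ctlz_i8(0x01) == 7");
static_assert(cttz8ForDocs(bitrev8ForDocs(0x40)) == 1, "ctlz_i8(0x40) == 1");
static_assert(cttz8ForDocs(bitrev8ForDocs(0x00)) == 8, "ctlz_i8(0x00) == 8");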
29111
29112static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29113 SelectionDAG &DAG) {
29114 MVT VT = Op.getSimpleValueType();
29115 MVT OpVT = VT;
29116 unsigned NumBits = VT.getSizeInBits();
29117 SDLoc dl(Op);
29118 unsigned Opc = Op.getOpcode();
29119
29120 if (VT.isVector() && VT.getScalarType() == MVT::i8 && Subtarget.hasGFNI())
29121 return LowerVectorCTLZ_GFNI(Op, dl, DAG, Subtarget);
29122
29123 if (VT.isVector())
29124 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29125
29126 Op = Op.getOperand(0);
29127 if (VT == MVT::i8) {
29128 // Zero extend to i32 since there is not an i8 bsr.
29129 OpVT = MVT::i32;
29130 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29131 }
29132
29133 // Check if we can safely pass a result through BSR for zero sources.
29134 SDValue PassThru = DAG.getUNDEF(OpVT);
29135 if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
29136 !DAG.isKnownNeverZero(Op))
29137 PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
29138
29139 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29140 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29141 Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);
29142
29143 // Skip CMOV if we're using a pass through value.
29144 if (Opc == ISD::CTLZ && PassThru.isUndef()) {
29145 // If src is zero (i.e. bsr sets ZF), returns NumBits.
29146 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29147 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29148 Op.getValue(1)};
29149 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29150 }
29151
29152 // Finally xor with NumBits-1.
29153 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
29154 DAG.getConstant(NumBits - 1, dl, OpVT));
29155
29156 if (VT == MVT::i8)
29157 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
29158 return Op;
29159}
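// A compile-time sketch of the BSR expansion above (illustrative only): BSR
// returns the index of the highest set bit, so XOR with NumBits-1 converts it
// into a leading-zero count, and the zero-input value of 2*NumBits-1 (via the
// pass-through or the CMOV) XORs down to exactly NumBits.
static_assert((0u ^ 31u) == 31 && (31u ^ 31u) == 0,
              "i32: bsr(1) == 0 -> ctlz 31, bsr(0x80000000) == 31 -> ctlz 0");
static_assert(((32u + 32u - 1u) ^ (32u - 1u)) == 32u,
              "i32: zero source yields ctlz == 32");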
29160
29161static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
29162 SelectionDAG &DAG) {
29163 MVT VT = Op.getSimpleValueType();
29164 unsigned NumBits = VT.getScalarSizeInBits();
29165 SDValue N0 = Op.getOperand(0);
29166 SDLoc dl(Op);
29167 bool NonZeroSrc = DAG.isKnownNeverZero(N0);
29168
29169 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
29170 "Only scalar CTTZ requires custom lowering");
29171
29172 // Check if we can safely pass a result through BSF for zero sources.
29173 SDValue PassThru = DAG.getUNDEF(VT);
29174 if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
29175 PassThru = DAG.getConstant(NumBits, dl, VT);
29176
29177 // Issue a bsf (scan bits forward) which also sets EFLAGS.
29178 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29179 Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);
29180
29181 // Skip CMOV if src is never zero or we're using a pass through value.
29182 if (NonZeroSrc || !PassThru.isUndef())
29183 return Op;
29184
29185 // If src is zero (i.e. bsf sets ZF), returns NumBits.
29186 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
29187 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29188 Op.getValue(1)};
29189 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
29190}
29191
29193 const X86Subtarget &Subtarget) {
29194 MVT VT = Op.getSimpleValueType();
29195 SDLoc DL(Op);
29196
29197 if (VT == MVT::i16 || VT == MVT::i32)
29198 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
29199
29200 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29201 return splitVectorIntBinary(Op, DAG, DL);
29202
29203 assert(Op.getSimpleValueType().is256BitVector() &&
29204 Op.getSimpleValueType().isInteger() &&
29205 "Only handle AVX 256-bit vector integer operation");
29206 return splitVectorIntBinary(Op, DAG, DL);
29207}
29208
29210 const X86Subtarget &Subtarget) {
29211 MVT VT = Op.getSimpleValueType();
29212 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
29213 unsigned Opcode = Op.getOpcode();
29214 SDLoc DL(Op);
29215
29216 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
29217 (VT.is256BitVector() && !Subtarget.hasInt256())) {
29218 assert(Op.getSimpleValueType().isInteger() &&
29219 "Only handle AVX vector integer operation");
29220 return splitVectorIntBinary(Op, DAG, DL);
29221 }
29222
29223 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
29224 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29225 EVT SetCCResultType =
29226 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29227
29228 unsigned BitWidth = VT.getScalarSizeInBits();
29229 if (Opcode == ISD::USUBSAT) {
29230 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
29231 // Handle a special-case with a bit-hack instead of cmp+select:
29232 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
29233 // If the target can use VPTERNLOG, DAGToDAG will match this as
29234 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
29235 // "broadcast" constant load.
29237 if (C && C->getAPIntValue().isSignMask()) {
29238 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
29239 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
29240 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
29241 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
29242 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
29243 }
29244 }
29245 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
29246 // usubsat X, Y --> (X >u Y) ? X - Y : 0
29247 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
29248 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
29249 // TODO: Move this to DAGCombiner?
29250 if (SetCCResultType == VT &&
29251 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
29252 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
29253 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
29254 }
29255 }
29256
29257 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
29258 (!VT.isVector() || VT == MVT::v2i64)) {
29259 APInt MinVal = APInt::getSignedMinValue(BitWidth);
29260 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
29261 SDValue Zero = DAG.getConstant(0, DL, VT);
29262 SDValue Result =
29263 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
29264 DAG.getVTList(VT, SetCCResultType), X, Y);
29265 SDValue SumDiff = Result.getValue(0);
29266 SDValue Overflow = Result.getValue(1);
29267 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
29268 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
29269 SDValue SumNeg =
29270 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
29271 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
29272 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
29273 }
29274
29275 // Use default expansion.
29276 return SDValue();
29277}
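// A compile-time sketch of the usubsat bit-hack above for 8-bit lanes
// (illustrative only; the helper name is not used by the lowering): with
// Y == SMIN (0x80), X ^ 0x80 equals X - 0x80 whenever X >= 0x80, and the
// arithmetic-shift mask zeroes the result when X < 0x80.
static constexpr unsigned usubsatSMin8ForDocs(unsigned X) {
  unsigned SraMask = (X & 0x80) ? 0xFFu : 0x00u; // X s>> 7 as an i8 mask
  return (X ^ 0x80) & SraMask;
}
static_assert(usubsatSMin8ForDocs(0x90) == 0x10, "0x90 - 0x80 == 0x10");
static_assert(usubsatSMin8ForDocs(0x7F) == 0x00, "saturates to zero");
static_assert(usubsatSMin8ForDocs(0xFF) == 0x7F, "0xFF - 0x80 == 0x7F");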
29278
29279static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
29280 SelectionDAG &DAG) {
29281 MVT VT = Op.getSimpleValueType();
29282 SDLoc DL(Op);
29283
29284 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
29285 // Since X86 does not have CMOV for 8-bit integer, we don't convert
29286 // 8-bit integer abs to NEG and CMOV.
29287 SDValue N0 = Op.getOperand(0);
29288 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
29289 DAG.getConstant(0, DL, VT), N0);
29290 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
29291 SDValue(Neg.getNode(), 1)};
29292 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
29293 }
29294
29295 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
29296 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
29297 SDValue Src = Op.getOperand(0);
29298 SDValue Neg = DAG.getNegative(Src, DL, VT);
29299 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
29300 }
29301
29302 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
29303 assert(VT.isInteger() &&
29304 "Only handle AVX 256-bit vector integer operation");
29305 return splitVectorIntUnary(Op, DAG, DL);
29306 }
29307
29308 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29309 return splitVectorIntUnary(Op, DAG, DL);
29310
29311 // Default to expand.
29312 return SDValue();
29313}
29314
29315static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
29316 SelectionDAG &DAG) {
29317 MVT VT = Op.getSimpleValueType();
29318 SDLoc DL(Op);
29319
29320 // For AVX1 cases, split to use legal ops.
29321 if (VT.is256BitVector() && !Subtarget.hasInt256())
29322 return splitVectorIntBinary(Op, DAG, DL);
29323
29324 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29325 return splitVectorIntBinary(Op, DAG, DL);
29326
29327 // Default to expand.
29328 return SDValue();
29329}
29330
29331static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
29332 SelectionDAG &DAG) {
29333 MVT VT = Op.getSimpleValueType();
29334 SDLoc DL(Op);
29335
29336 // For AVX1 cases, split to use legal ops.
29337 if (VT.is256BitVector() && !Subtarget.hasInt256())
29338 return splitVectorIntBinary(Op, DAG, DL);
29339
29340 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29341 return splitVectorIntBinary(Op, DAG, DL);
29342
29343 // Default to expand.
29344 return SDValue();
29345}
29346
29348 SelectionDAG &DAG) {
29349 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29350 EVT VT = Op.getValueType();
29351 SDValue X = Op.getOperand(0);
29352 SDValue Y = Op.getOperand(1);
29353 SDLoc DL(Op);
29354 bool IsMaxOp =
29355 Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29356 bool IsNum =
29357 Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29358 if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
29359 unsigned Opc = 0;
29360 if (VT.isVector())
29362 else if (VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64)
29364
29365 if (Opc) {
29366 SDValue Imm =
29367 DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
29368 return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
29369 }
29370 }
29371
29372 uint64_t SizeInBits = VT.getScalarSizeInBits();
29373 APInt PreferredZero = APInt::getZero(SizeInBits);
29374 APInt OppositeZero = PreferredZero;
29375 EVT IVT = VT.changeTypeToInteger();
29376 X86ISD::NodeType MinMaxOp;
29377 if (IsMaxOp) {
29378 MinMaxOp = X86ISD::FMAX;
29379 OppositeZero.setSignBit();
29380 } else {
29381 PreferredZero.setSignBit();
29382 MinMaxOp = X86ISD::FMIN;
29383 }
29384 EVT SetCCType =
29385 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29386
29387 // The tables below show the expected result of Max in cases of NaN and
29388 // signed zeros.
29389 //
29390 //                  Y                                 Y
29391 //              Num    xNaN                      +0      -0
29392 //           -----------------               -----------------
29393 //      Num  |  Max  |   Y   |          +0  |  +0   |  +0   |
29394 //   X       -----------------       X       -----------------
29395 //     xNaN  |   X   |  X/Y  |          -0  |  +0   |  -0   |
29396 //           -----------------               -----------------
29397 //
29398 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
29399 // reordering.
29400 //
29401 // We check if any of the operands is NaN and return NaN. Then we check if any
29402 // of the operands is zero or negative zero (for fmaximum and fminimum
29403 // respectively) to ensure the correct zero is returned.
29404 auto MatchesZero = [](SDValue Op, APInt Zero) {
29406 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
29407 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
29408 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
29409 return CstOp->getAPIntValue() == Zero;
29410 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
29411 Op->getOpcode() == ISD::SPLAT_VECTOR) {
29412 for (const SDValue &OpVal : Op->op_values()) {
29413 if (OpVal.isUndef())
29414 continue;
29415 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
29416 if (!CstOp)
29417 return false;
29418 if (!CstOp->getValueAPF().isZero())
29419 continue;
29420 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
29421 return false;
29422 }
29423 return true;
29424 }
29425 return false;
29426 };
29427
29428 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
29429 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
29430 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
29431 Op->getFlags().hasNoSignedZeros() ||
29432 DAG.isKnownNeverZeroFloat(X) ||
29434 SDValue NewX, NewY;
29435 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
29436 MatchesZero(X, OppositeZero)) {
29437 // Operands are already in right order or order does not matter.
29438 NewX = X;
29439 NewY = Y;
29440 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
29441 NewX = Y;
29442 NewY = X;
29443 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
29444 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
29445 if (IsXNeverNaN)
29446 std::swap(X, Y);
29447 // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
29448 // to an xmm register.
29449 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
29451 // Bits of classes:
29452 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
29453 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
29454 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
29455 DL, MVT::i32);
29456 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
29457 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
29458 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
29459 DAG.getVectorIdxConstant(0, DL));
29460 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
29461 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
29462 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
29463 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29464 } else {
29465 SDValue IsXSigned;
29466 if (Subtarget.is64Bit() || VT != MVT::f64) {
29467 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
29468 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
29469 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
29470 } else {
29471 assert(VT == MVT::f64);
29472 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
29473 DAG.getConstantFP(0, DL, MVT::v2f64), X,
29474 DAG.getVectorIdxConstant(0, DL));
29475 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
29476 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
29477 DAG.getVectorIdxConstant(1, DL));
29478 Hi = DAG.getBitcast(MVT::i32, Hi);
29479 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
29480 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
29481 *DAG.getContext(), MVT::i32);
29482 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
29483 }
29484 if (MinMaxOp == X86ISD::FMAX) {
29485 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29486 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29487 } else {
29488 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29489 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29490 }
29491 }
29492
29493 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
29494 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
29495
29496 // If we did not reorder the operands for signed-zero handling, we still need
29497 // to handle NaN, and we know that one of the operands is not NaN, then:
29498 // - For minimum/maximum, put it in the first operand,
29499 // - For minimumnum/maximumnum, put it in the second operand,
29500 // and we will not need to post handle NaN after max/min.
29501 if (IgnoreSignedZero && !IgnoreNaN &&
29502 DAG.isKnownNeverNaN(IsNum ? NewX : NewY))
29503 std::swap(NewX, NewY);
29504
29505 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29506
29507 if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
29508 return MinMax;
29509
29510 if (DAG.isKnownNeverNaN(NewX))
29511 NewX = NewY;
29512
29513 SDValue IsNaN =
29514 DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
29515
29516 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
29517}
29518
29519static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
29520 SelectionDAG &DAG) {
29521 MVT VT = Op.getSimpleValueType();
29522 SDLoc dl(Op);
29523
29524 // For AVX1 cases, split to use legal ops.
29525 if (VT.is256BitVector() && !Subtarget.hasInt256())
29526 return splitVectorIntBinary(Op, DAG, dl);
29527
29528 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
29529 return splitVectorIntBinary(Op, DAG, dl);
29530
29531 bool IsSigned = Op.getOpcode() == ISD::ABDS;
29532 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29533
29534 if (Subtarget.canUseCMOV() && VT.isScalarInteger()) {
29535 X86::CondCode CC = IsSigned ? X86::COND_L : X86::COND_B;
29536 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29537
29538 // abds(lhs, rhs) -> select(slt(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29539 // abdu(lhs, rhs) -> select(ult(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29540 if (VT.bitsGE(MVT::i32)) {
29541 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29542 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
29543 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
29544 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, VTs, LHS, RHS);
29545 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, VTs, RHS, LHS);
29546 return DAG.getNode(X86ISD::CMOV, dl, VT, Diff1, Diff0,
29547 DAG.getTargetConstant(CC, dl, MVT::i8),
29548 Diff1.getValue(1));
29549 }
29550
29551 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
29552 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
29553 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
29554 MVT WideVT = MVT::getIntegerVT(WideBits);
29555 if (TLI.isTypeLegal(WideVT)) {
29556 SDVTList WideVTs = DAG.getVTList(WideVT, MVT::i32);
29557 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
29558 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
29559 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, WideVTs, LHS, RHS);
29560 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, WideVTs, RHS, LHS);
29561 SDValue AbsDiff = DAG.getNode(X86ISD::CMOV, dl, WideVT, Diff1, Diff0,
29562 DAG.getTargetConstant(CC, dl, MVT::i8),
29563 Diff1.getValue(1));
29564 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
29565 }
29566 }
29567
29568 // Default to expand.
29569 return SDValue();
29570}
29571
29572static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
29573 SelectionDAG &DAG) {
29574 SDLoc dl(Op);
29575 MVT VT = Op.getSimpleValueType();
29576
29577 // Decompose 256-bit ops into 128-bit ops.
29578 if (VT.is256BitVector() && !Subtarget.hasInt256())
29579 return splitVectorIntBinary(Op, DAG, dl);
29580
29581 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29582 return splitVectorIntBinary(Op, DAG, dl);
29583
29584 SDValue A = Op.getOperand(0);
29585 SDValue B = Op.getOperand(1);
29586
29587 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29588 // vector pairs, multiply and truncate.
29589 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
29590 unsigned NumElts = VT.getVectorNumElements();
29591 unsigned NumLanes = VT.getSizeInBits() / 128;
29592 unsigned NumEltsPerLane = NumElts / NumLanes;
29593
29594 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29595 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29596 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29597 return DAG.getNode(
29598 ISD::TRUNCATE, dl, VT,
29599 DAG.getNode(ISD::MUL, dl, ExVT,
29600 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
29601 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
29602 }
29603
29604 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29605
29606 // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
29607 // Don't do this if we only need to unpack one half.
29608 if (Subtarget.hasSSSE3()) {
29609 bool BIsBuildVector = isa<BuildVectorSDNode>(B);
29610 bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
29611 bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
29612 if (BIsBuildVector) {
29613 for (auto [Idx, Val] : enumerate(B->ops())) {
29614 if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
29615 IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29616 else
29617 IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29618 }
29619 }
29620 if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
29621 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
29622 SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
29623 SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
29624 SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
29625 SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
29626 RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
29627 RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
29628 DAG.getTargetConstant(8, dl, MVT::i8));
29629 return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
29630 }
29631 }
29632
29633 // Extract the lo/hi parts to any extend to i16.
29634 // We're going to keep only the low byte of each result element of the
29635 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29636 // element.
29637 SDValue Undef = DAG.getUNDEF(VT);
29638 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
29639 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
29640
29641 SDValue BLo, BHi;
29642 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29643 // If the RHS is a constant, manually unpackl/unpackh.
29644 SmallVector<SDValue, 16> LoOps, HiOps;
29645 for (unsigned i = 0; i != NumElts; i += 16) {
29646 for (unsigned j = 0; j != 8; ++j) {
29647 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
29648 MVT::i16));
29649 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
29650 MVT::i16));
29651 }
29652 }
29653
29654 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29655 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29656 } else {
29657 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
29658 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
29659 }
29660
29661 // Multiply, mask the lower 8bits of the lo/hi results and pack.
29662 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
29663 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
29664 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29665 }
29666
29667 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
29668 if (VT == MVT::v4i32) {
29669 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
29670 "Should not custom lower when pmulld is available!");
29671
29672 // Extract the odd parts.
29673 static const int UnpackMask[] = {1, 1, 3, 3};
29674 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
29675 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
29676
29677 // Multiply the even parts.
29678 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29679 DAG.getBitcast(MVT::v2i64, A),
29680 DAG.getBitcast(MVT::v2i64, B));
29681 // Now multiply odd parts.
29682 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29683 DAG.getBitcast(MVT::v2i64, Aodds),
29684 DAG.getBitcast(MVT::v2i64, Bodds));
29685
29686 Evens = DAG.getBitcast(VT, Evens);
29687 Odds = DAG.getBitcast(VT, Odds);
29688
29689 // Merge the two vectors back together with a shuffle. This expands into 2
29690 // shuffles.
29691 static const int ShufMask[] = { 0, 4, 2, 6 };
29692 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
29693 }
29694
29695 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
29696 "Only know how to lower V2I64/V4I64/V8I64 multiply");
29697 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
29698
29699 // Ahi = psrlqi(a, 32);
29700 // Bhi = psrlqi(b, 32);
29701 //
29702 // AloBlo = pmuludq(a, b);
29703 // AloBhi = pmuludq(a, Bhi);
29704 // AhiBlo = pmuludq(Ahi, b);
29705 //
29706 // Hi = psllqi(AloBhi + AhiBlo, 32);
29707 // return AloBlo + Hi;
29708 KnownBits AKnown = DAG.computeKnownBits(A);
29709 KnownBits BKnown = DAG.computeKnownBits(B);
29710
29711 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
29712 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
29713 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
29714
29715 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
29716 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
29717 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
29718
29719 SDValue Zero = DAG.getConstant(0, dl, VT);
29720
29721 // Only multiply lo/hi halves that aren't known to be zero.
29722 SDValue AloBlo = Zero;
29723 if (!ALoIsZero && !BLoIsZero)
29724 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
29725
29726 SDValue AloBhi = Zero;
29727 if (!ALoIsZero && !BHiIsZero) {
29728 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
29729 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
29730 }
29731
29732 SDValue AhiBlo = Zero;
29733 if (!AHiIsZero && !BLoIsZero) {
29734 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
29735 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
29736 }
29737
29738 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
29739 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
29740
29741 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
29742}
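// A compile-time sketch of the pmuludq decomposition above (illustrative
// only; the helper name is not used by the lowering): with A = AHi*2^32 + ALo
// and B = BHi*2^32 + BLo, the low 64 bits of A*B are
// ALo*BLo + ((ALo*BHi + AHi*BLo) << 32), since the AHi*BHi term is shifted
// entirely out of the result.
static constexpr uint64_t mul64ViaPMULUDQForDocs(uint64_t A, uint64_t B) {
  uint64_t ALo = A & 0xffffffffULL, AHi = A >> 32;
  uint64_t BLo = B & 0xffffffffULL, BHi = B >> 32;
  return ALo * BLo + ((ALo * BHi + AHi * BLo) << 32);
}
static_assert(mul64ViaPMULUDQForDocs(0x123456789abcdef0ULL,
                                     0x0fedcba987654321ULL) ==
                  0x123456789abcdef0ULL * 0x0fedcba987654321ULL,
              "low 64 bits of the product match");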
29743
29744 static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
29745 MVT VT, bool IsSigned,
29746 const X86Subtarget &Subtarget,
29747 SelectionDAG &DAG,
29748 SDValue *Low = nullptr) {
29749 unsigned NumElts = VT.getVectorNumElements();
29750
29751 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
29752 // to a vXi16 type. Do the multiplies, shift the results and pack the half
29753 // lane results back together.
29754
29755 // We'll take different approaches for signed and unsigned.
29756 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to words
29757 // and use pmullw to calculate the full 16-bit product.
29758 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
29759 // shift them left into the upper byte of each word. This allows us to use
29760 // pmulhw to calculate the full 16-bit product. This trick means we don't
29761 // need to sign extend the bytes to use pmullw.
29762
29763 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29764 SDValue Zero = DAG.getConstant(0, dl, VT);
29765
29766 SDValue ALo, AHi;
29767 if (IsSigned) {
29768 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29769 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29770 } else {
29771 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29772 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29773 }
29774
29775 SDValue BLo, BHi;
29776 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29777 // If the RHS is a constant, manually unpackl/unpackh and extend.
29778 SmallVector<SDValue, 16> LoOps, HiOps;
29779 for (unsigned i = 0; i != NumElts; i += 16) {
29780 for (unsigned j = 0; j != 8; ++j) {
29781 SDValue LoOp = B.getOperand(i + j);
29782 SDValue HiOp = B.getOperand(i + j + 8);
29783
29784 if (IsSigned) {
29785 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
29786 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
29787 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
29788 DAG.getConstant(8, dl, MVT::i16));
29789 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
29790 DAG.getConstant(8, dl, MVT::i16));
29791 } else {
29792 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
29793 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
29794 }
29795
29796 LoOps.push_back(LoOp);
29797 HiOps.push_back(HiOp);
29798 }
29799 }
29800
29801 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29802 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29803 } else if (IsSigned) {
29804 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29805 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
29806 } else {
29807 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29808 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
29809 }
29810
29811 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
29812 // pack back to vXi8.
29813 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
29814 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
29815 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
29816
29817 if (Low)
29818 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29819
29820 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
29821}
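// A compile-time sketch of the signed trick above (illustrative only; the
// helper name is not used by the lowering): with both bytes unpacked into the
// upper byte of a 16-bit lane, the lane values are a*256 and b*256, and the
// pmulhw result (a*256 * b*256) >> 16 is exactly the 16-bit signed product
// a*b, so no sign extension is needed.
static constexpr int pmulhwOfShiftedBytesForDocs(int ALane, int BLane) {
  // High 16 bits of the 32-bit signed product; exact division here because
  // the product is a multiple of 2^16.
  return (ALane * BLane) / 65536;
}
static_assert(pmulhwOfShiftedBytesForDocs(-7 * 256, 5 * 256) == -35,
              "signed 8-bit product");
static_assert(pmulhwOfShiftedBytesForDocs(127 * 256, -128 * 256) == -16256,
              "extreme 8-bit product");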
29822
29823static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
29824 SelectionDAG &DAG) {
29825 SDLoc dl(Op);
29826 MVT VT = Op.getSimpleValueType();
29827 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29828 unsigned NumElts = VT.getVectorNumElements();
29829 SDValue A = Op.getOperand(0);
29830 SDValue B = Op.getOperand(1);
29831
29832 // Decompose 256-bit ops into 128-bit ops.
29833 if (VT.is256BitVector() && !Subtarget.hasInt256())
29834 return splitVectorIntBinary(Op, DAG, dl);
29835
29836 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29837 return splitVectorIntBinary(Op, DAG, dl);
29838
29839 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
29840 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
29841 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
29842 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
29843
29844 // PMULxD operations multiply each even value (starting at 0) of LHS with
29845 // the related value of RHS and produce a widened result.
29846 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29847 // => <2 x i64> <ae|cg>
29848 //
29849 // In other words, to have all the results, we need to perform two PMULxD:
29850 // 1. one with the even values.
29851 // 2. one with the odd values.
29852 // To achieve #2, we need to place the odd values at an even position.
29853 //
29854 // Place the odd value at an even position (basically, shift all values 1
29855 // step to the left):
29856 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29857 9, -1, 11, -1, 13, -1, 15, -1};
29858 // <a|b|c|d> => <b|undef|d|undef>
29859 SDValue Odd0 =
29860 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
29861 // <e|f|g|h> => <f|undef|h|undef>
29862 SDValue Odd1 =
29863 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
29864
29865 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
29866 // ints.
29867 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
29868 unsigned Opcode =
29869 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
29870 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29871 // => <2 x i64> <ae|cg>
29872 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29873 DAG.getBitcast(MulVT, A),
29874 DAG.getBitcast(MulVT, B)));
29875 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
29876 // => <2 x i64> <bf|dh>
29877 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29878 DAG.getBitcast(MulVT, Odd0),
29879 DAG.getBitcast(MulVT, Odd1)));
29880
29881 // Shuffle it back into the right order.
29882 SmallVector<int, 16> ShufMask(NumElts);
29883 for (int i = 0; i != (int)NumElts; ++i)
29884 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
29885
29886 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
29887
29888 // If we have a signed multiply but no PMULDQ fix up the result of an
29889 // unsigned multiply.
29890 if (IsSigned && !Subtarget.hasSSE41()) {
29891 SDValue Zero = DAG.getConstant(0, dl, VT);
29892 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
29893 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
29894 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
29895 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
29896
29897 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
29898 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
29899 }
29900
29901 return Res;
29902 }
29903
29904 // Only i8 vectors should need custom lowering after this.
29905 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29906 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29907 "Unsupported vector type");
29908
29909 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
29910 // logical shift down the upper half and pack back to i8.
29911
29912 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29913 // and then ashr/lshr the upper bits down to the lower bits before multiply.
29914
29915 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29916 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29917 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29918 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29919 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29920 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29921 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29922 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29923 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29924 }
29925
29926 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
29927}
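// A compile-time sketch of the PMULDQ-less sign fixup above for 32-bit lanes
// (illustrative only; the helper name is not used by the lowering): an
// unsigned high half is corrected with the standard identity
// mulhs(A, B) == mulhu(A, B) - (A < 0 ? B : 0) - (B < 0 ? A : 0),
// with all arithmetic taken modulo 2^32.
static constexpr uint32_t mulhsViaMulhuForDocs(int32_t A, int32_t B) {
  uint32_t UA = (uint32_t)A, UB = (uint32_t)B;
  uint32_t HU = (uint32_t)(((uint64_t)UA * UB) >> 32);
  uint32_t Fixup = (A < 0 ? UB : 0u) + (B < 0 ? UA : 0u);
  return HU - Fixup;
}
static_assert(mulhsViaMulhuForDocs(-5, 7) == 0xffffffffu,
              "high half of -35 is -1");
static_assert(mulhsViaMulhuForDocs(-100000, -300000) == 6u,
              "high half of 30000000000 is 6");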
29928
29929// Custom lowering for SMULO/UMULO.
29930static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
29931 SelectionDAG &DAG) {
29932 MVT VT = Op.getSimpleValueType();
29933
29934 // Scalars defer to LowerXALUO.
29935 if (!VT.isVector())
29936 return LowerXALUO(Op, DAG);
29937
29938 SDLoc dl(Op);
29939 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29940 SDValue A = Op.getOperand(0);
29941 SDValue B = Op.getOperand(1);
29942 EVT OvfVT = Op->getValueType(1);
29943
29944 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
29945 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
29946 // Extract the LHS Lo/Hi vectors
29947 SDValue LHSLo, LHSHi;
29948 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
29949
29950 // Extract the RHS Lo/Hi vectors
29951 SDValue RHSLo, RHSHi;
29952 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
29953
29954 EVT LoOvfVT, HiOvfVT;
29955 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
29956 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
29957 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
29958
29959 // Issue the split operations.
29960 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
29961 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
29962
29963 // Join the separate data results and the overflow results.
29964 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29965 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
29966 Hi.getValue(1));
29967
29968 return DAG.getMergeValues({Res, Ovf}, dl);
29969 }
29970
29971 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29972 EVT SetccVT =
29973 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29974
29975 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29976 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29977 unsigned NumElts = VT.getVectorNumElements();
29978 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29979 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29980 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29981 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29982 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29983
29984 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29985
29986 SDValue Ovf;
29987 if (IsSigned) {
29988 SDValue High, LowSign;
29989 if (OvfVT.getVectorElementType() == MVT::i1 &&
29990 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29991 // Rather than truncating, try to do the compare on vXi16 or vXi32.
29992 // Shift the high down filling with sign bits.
29993 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29994 // Fill all 16 bits with the sign bit from the low.
29995 LowSign =
29996 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29997 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29998 15, DAG);
29999 SetccVT = OvfVT;
30000 if (!Subtarget.hasBWI()) {
30001 // We can't do a vXi16 compare so sign extend to v16i32.
30002 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
30003 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
30004 }
30005 } else {
30006 // Otherwise do the compare at vXi8.
30007 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30008 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30009 LowSign =
30010 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30011 }
30012
30013 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30014 } else {
30015 SDValue High =
30016 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30017 if (OvfVT.getVectorElementType() == MVT::i1 &&
30018 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30019 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30020 SetccVT = OvfVT;
30021 if (!Subtarget.hasBWI()) {
30022 // We can't do a vXi16 compare so sign extend to v16i32.
30023 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30024 }
30025 } else {
30026 // Otherwise do the compare at vXi8.
30027 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30028 }
30029
30030 Ovf =
30031 DAG.getSetCC(dl, SetccVT, High,
30032 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30033 }
30034
30035 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30036
30037 return DAG.getMergeValues({Low, Ovf}, dl);
30038 }
30039
30040 SDValue Low;
30041 SDValue High =
30042 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30043
30044 SDValue Ovf;
30045 if (IsSigned) {
30046 // SMULO overflows if the high bits don't match the sign of the low.
30047 SDValue LowSign =
30048 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30049 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30050 } else {
30051 // UMULO overflows if the high bits are non-zero.
30052 Ovf =
30053 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30054 }
30055
30056 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30057
30058 return DAG.getMergeValues({Low, Ovf}, dl);
30059}
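// A scalar sketch of the overflow tests above for i8 lanes (illustrative
// only; helper names are not used by the lowering): SMULO overflows when the
// high half is not the sign-extension of the low half, i.e. when the full
// product does not fit in i8; UMULO overflows when the high half is non-zero.
static constexpr bool smulo8ForDocs(int A, int B) {
  return A * B < -128 || A * B > 127;
}
static constexpr bool umulo8ForDocs(unsigned A, unsigned B) {
  return ((A * B) >> 8) != 0;
}
static_assert(!smulo8ForDocs(-8, 16) && smulo8ForDocs(-9, 16),
              "-128 fits in i8, -144 does not");
static_assert(!umulo8ForDocs(15, 17) && umulo8ForDocs(16, 16),
              "255 fits in u8, 256 does not");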
30060
30061SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30062 assert(Subtarget.isTargetWin64() && "Unexpected target");
30063 EVT VT = Op.getValueType();
30064 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30065 "Unexpected return type for lowering");
30066
30067 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30068 SmallVector<SDValue> Result;
30069 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30070 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30071 }
30072
30073 RTLIB::Libcall LC;
30074 bool isSigned;
30075 switch (Op->getOpcode()) {
30076 // clang-format off
30077 default: llvm_unreachable("Unexpected request for libcall!");
30078 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30079 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30080 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30081 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30082 // clang-format on
30083 }
30084
30085 SDLoc dl(Op);
30086 SDValue InChain = DAG.getEntryNode();
30087
30088 TargetLowering::ArgListTy Args;
30089 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30090 EVT ArgVT = Op->getOperand(i).getValueType();
30091 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30092 "Unexpected argument type for lowering");
30093 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30094 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30095 MachinePointerInfo MPI =
30096 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30097 InChain =
30098 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30099 Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0));
30100 }
30101
30102 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30103 getPointerTy(DAG.getDataLayout()));
30104
30105 TargetLowering::CallLoweringInfo CLI(DAG);
30106 CLI.setDebugLoc(dl)
30107 .setChain(InChain)
30108 .setLibCallee(
30109 getLibcallCallingConv(LC),
30110 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30111 std::move(Args))
30112 .setInRegister()
30113 .setSExtResult(isSigned)
30114 .setZExtResult(!isSigned);
30115
30116 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30117 return DAG.getBitcast(VT, CallInfo.first);
30118}
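// [Annotation added for this listing, not part of the upstream source] Sketch
// of the effect of LowerWin64_i128OP for a non-constant divisor: each i128
// operand is spilled to a 16-byte aligned stack slot and passed by pointer,
// and the chosen RTLIB routine (e.g. RTLIB::SDIV_I128, conventionally
// __divti3) returns the i128 result as a v2i64 in XMM0, which is then bitcast
// back to i128.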
30119
30120SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30121 SelectionDAG &DAG,
30122 SDValue &Chain) const {
30123 assert(Subtarget.isTargetWin64() && "Unexpected target");
30124 EVT VT = Op.getValueType();
30125 bool IsStrict = Op->isStrictFPOpcode();
30126
30127 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30128 EVT ArgVT = Arg.getValueType();
30129
30130 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30131 "Unexpected return type for lowering");
30132
30133 RTLIB::Libcall LC;
30134 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30135 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30136 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30137 else
30138 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30139 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30140
30141 SDLoc dl(Op);
30142 MakeLibCallOptions CallOptions;
30143 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30144
30146 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
30147 // expected VT (i128).
30148 std::tie(Result, Chain) =
30149 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30150 Result = DAG.getBitcast(VT, Result);
30151 return Result;
30152}
30153
30154SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30155 SelectionDAG &DAG) const {
30156 assert(Subtarget.isTargetWin64() && "Unexpected target");
30157 EVT VT = Op.getValueType();
30158 bool IsStrict = Op->isStrictFPOpcode();
30159
30160 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30161 EVT ArgVT = Arg.getValueType();
30162
30163 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30164 "Unexpected argument type for lowering");
30165
30166 RTLIB::Libcall LC;
30167 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30168 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30169 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30170 else
30171 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30172 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30173
30174 SDLoc dl(Op);
30175 MakeLibCallOptions CallOptions;
30176 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30177
30178 // Pass the i128 argument as an indirect argument on the stack.
30179 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30180 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30181 MachinePointerInfo MPI =
30182 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30183 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30184
30185 SDValue Result;
30186 std::tie(Result, Chain) =
30187 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30188 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30189}
30190
30191// Return true if the required (according to Opcode) shift-imm form is natively
30192// supported by the Subtarget
30193static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
30194 unsigned Opcode) {
30195 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30196 "Unexpected shift opcode");
30197
30198 if (!VT.isSimple())
30199 return false;
30200
30201 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30202 return false;
30203
30204 if (VT.getScalarSizeInBits() < 16)
30205 return false;
30206
30207 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30208 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30209 return true;
30210
30211 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30212 (VT.is256BitVector() && Subtarget.hasInt256());
30213
30214 bool AShift = LShift && (Subtarget.hasAVX512() ||
30215 (VT != MVT::v2i64 && VT != MVT::v4i64));
30216 return (Opcode == ISD::SRA) ? AShift : LShift;
30217}
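// [Annotation added for this listing, not part of the upstream source] Example
// evaluations of the predicate above: (v8i16, SHL) is supported with plain
// SSE2; any vXi8 type is rejected because the scalar size is below 16 bits;
// and (v2i64, SRA) is rejected unless AVX512 is available, since SSE/AVX
// provide no 64-bit arithmetic shift-by-immediate.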
30218
30219// The shift amount is a variable, but it is the same for all vector lanes.
30220// These instructions are defined together with shift-immediate.
30221static
30222 bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
30223 unsigned Opcode) {
30224 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30225}
30226
30227// Return true if the required (according to Opcode) variable-shift form is
30228// natively supported by the Subtarget
30229static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
30230 unsigned Opcode) {
30231 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30232 "Unexpected shift opcode");
30233
30234 if (!VT.isSimple())
30235 return false;
30236
30237 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30238 return false;
30239
30240 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30241 return false;
30242
30243 // vXi16 supported only on AVX-512, BWI
30244 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30245 return false;
30246
30247 if (Subtarget.hasAVX512() &&
30248 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30249 return true;
30250
30251 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30252 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30253 return (Opcode == ISD::SRA) ? AShift : LShift;
30254}
30255
30256 static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30257 const X86Subtarget &Subtarget) {
30258 MVT VT = Op.getSimpleValueType();
30259 SDLoc dl(Op);
30260 SDValue R = Op.getOperand(0);
30261 SDValue Amt = Op.getOperand(1);
30262 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30263 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30264
30265 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30266 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30267 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30268 SDValue Ex = DAG.getBitcast(ExVT, R);
30269
30270 // ashr(R, 63) === cmp_slt(R, 0)
30271 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30272 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30273 "Unsupported PCMPGT op");
30274 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30275 }
30276
30277 if (ShiftAmt >= 32) {
30278 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
30279 SDValue Upper =
30280 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30281 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30282 ShiftAmt - 32, DAG);
30283 if (VT == MVT::v2i64)
30284 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30285 if (VT == MVT::v4i64)
30286 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30287 {9, 1, 11, 3, 13, 5, 15, 7});
30288 } else {
30289 // SRA upper i32, SRL whole i64 and select lower i32.
30290 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30291 ShiftAmt, DAG);
30292 SDValue Lower =
30293 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30294 Lower = DAG.getBitcast(ExVT, Lower);
30295 if (VT == MVT::v2i64)
30296 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30297 if (VT == MVT::v4i64)
30298 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30299 {8, 1, 10, 3, 12, 5, 14, 7});
30300 }
30301 return DAG.getBitcast(VT, Ex);
30302 };
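// [Annotation added for this listing, not part of the upstream source] Example
// of the lambda above for v2i64 ashr: a shift by 63 becomes PCMPGT(0, R) on
// SSE4.2+, while a shift by e.g. 40 is assembled from two v4i32 VSRAI nodes
// (one by 31 to splat the sign into the upper halves, one by 40-32=8 for the
// lower halves) that are blended back together with a shuffle.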
30303
30304 // Optimize shl/srl/sra with constant shift amount.
30305 APInt APIntShiftAmt;
30306 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30307 return SDValue();
30308
30309 // If the shift amount is out of range, return undef.
30310 if (APIntShiftAmt.uge(EltSizeInBits))
30311 return DAG.getUNDEF(VT);
30312
30313 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30314
30315 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
30316 // Hardware support for vector shifts is sparse, which makes us scalarize the
30317 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
30318 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
30319 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30320 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30321 // must be 0). (add undef, undef) however can be any value. To make this
30322 // safe, we must freeze R to ensure that register allocation uses the same
30323 // register for an undefined value. This ensures that the result will
30324 // still be even and preserves the original semantics.
30325 R = DAG.getFreeze(R);
30326 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30327 }
30328
30329 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30330 }
30331
30332 // i64 SRA needs to be performed as partial shifts.
30333 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30334 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30335 Op.getOpcode() == ISD::SRA)
30336 return ArithmeticShiftRight64(ShiftAmt);
30337
30338 // If we're logical shifting an all-signbits value then we can just perform
30339 // it as a mask.
30340 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
30341 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
30342 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
30343 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
30344 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
30345 }
30346
30347 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30348 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30349 unsigned NumElts = VT.getVectorNumElements();
30350 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30351
30352 // Simple i8 add case
30353 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30354 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30355 // must be 0). (add undef, undef) however can be any value. To make this
30356 // safe, we must freeze R to ensure that register allocation uses the same
30357 // register for an undefined value. This ensures that the result will
30358 // still be even and preserves the original semantics.
30359 R = DAG.getFreeze(R);
30360 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30361 }
30362
30363 // ashr(R, 7) === cmp_slt(R, 0)
30364 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30365 SDValue Zeros = DAG.getConstant(0, dl, VT);
30366 if (VT.is512BitVector()) {
30367 assert(VT == MVT::v64i8 && "Unexpected element type!");
30368 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30369 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30370 }
30371 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30372 }
30373
30374 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30375 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30376 return SDValue();
30377
30378 if (Subtarget.hasGFNI()) {
30379 SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt);
30380 return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
30381 DAG.getTargetConstant(0, dl, MVT::i8));
30382 }
30383
30384 if (Op.getOpcode() == ISD::SHL) {
30385 // Make a large shift.
30386 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30387 ShiftAmt, DAG);
30388 SHL = DAG.getBitcast(VT, SHL);
30389 // Zero out the rightmost bits.
30390 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30391 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30392 }
30393 if (Op.getOpcode() == ISD::SRL) {
30394 // Make a large shift.
30395 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30396 ShiftAmt, DAG);
30397 SRL = DAG.getBitcast(VT, SRL);
30398 // Zero out the leftmost bits.
30399 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30400 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30401 }
30402 if (Op.getOpcode() == ISD::SRA) {
30403 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
30404 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30405
30406 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30407 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30408 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30409 return Res;
30410 }
30411 llvm_unreachable("Unknown shift opcode.");
30412 }
30413
30414 return SDValue();
30415}
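// [Annotation added for this listing, not part of the upstream source] Example
// of the vXi8 shift-by-immediate path above: (shl v16i8 R, 3) is emitted as a
// v8i16 VSHLI by 3 followed by an AND with a splat of 0xF8, and
// (sra v16i8 R, 3) becomes ((srl R, 3) ^ 0x10) - 0x10, where 0x10 = 128 >> 3
// re-extends the sign bit.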
30416
30417 static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
30418 const X86Subtarget &Subtarget) {
30419 MVT VT = Op.getSimpleValueType();
30420 SDLoc dl(Op);
30421 SDValue R = Op.getOperand(0);
30422 SDValue Amt = Op.getOperand(1);
30423 unsigned Opcode = Op.getOpcode();
30424 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
30425
30426 int BaseShAmtIdx = -1;
30427 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
30428 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
30429 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
30430 Subtarget, DAG);
30431
30432 // vXi8 shifts - shift as v8i16 + mask result.
30433 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30434 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30435 VT == MVT::v64i8) &&
30436 !Subtarget.hasXOP()) {
30437 unsigned NumElts = VT.getVectorNumElements();
30438 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30439 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
30440 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30441 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30442
30443 // Create the mask using vXi16 shifts. For shift-rights we need to move
30444 // the upper byte down before splatting the vXi8 mask.
30445 SDValue BitMask = DAG.getAllOnesConstant(dl, ExtVT);
30446 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30447 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30448 if (Opcode != ISD::SHL)
30449 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30450 8, DAG);
30451 BitMask = DAG.getBitcast(VT, BitMask);
30452 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30453 SmallVector<int, 64>(NumElts, 0));
30454
30455 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30456 DAG.getBitcast(ExtVT, R), BaseShAmt,
30457 BaseShAmtIdx, Subtarget, DAG);
30458 Res = DAG.getBitcast(VT, Res);
30459 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30460
30461 if (Opcode == ISD::SRA) {
30462 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30463 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30464 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30465 SignMask =
30466 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30467 BaseShAmtIdx, Subtarget, DAG);
30468 SignMask = DAG.getBitcast(VT, SignMask);
30469 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30470 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30471 }
30472 return Res;
30473 }
30474 }
30475 }
30476
30477 return SDValue();
30478}
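// [Annotation added for this listing, not part of the upstream source] Example
// of the splat-amount vXi8 path above: a v16i8 srl by a uniform variable
// amount is performed as a v8i16 shift, then ANDed with a mask built by
// shifting an all-ones vector by the same amount (moving the upper byte down
// first) and splatting its low byte. For SRA, the additional xor/sub with the
// shifted 0x8080 sign mask re-extends the sign, mirroring the constant-amount
// case.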
30479
30480// Convert a shift/rotate left amount to a multiplication scale factor.
30481 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
30482 const X86Subtarget &Subtarget,
30483 SelectionDAG &DAG) {
30484 MVT VT = Amt.getSimpleValueType();
30485 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30486 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30487 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30488 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30489 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30490 (Subtarget.hasBWI() && VT == MVT::v64i8)))
30491 return SDValue();
30492
30493 MVT SVT = VT.getVectorElementType();
30494 unsigned SVTBits = SVT.getSizeInBits();
30495 unsigned NumElems = VT.getVectorNumElements();
30496
30497 APInt UndefElts;
30498 SmallVector<APInt> EltBits;
30499 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30500 APInt One(SVTBits, 1);
30501 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30502 for (unsigned I = 0; I != NumElems; ++I) {
30503 if (UndefElts[I] || EltBits[I].uge(SVTBits))
30504 continue;
30505 uint64_t ShAmt = EltBits[I].getZExtValue();
30506 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30507 }
30508 return DAG.getBuildVector(VT, dl, Elts);
30509 }
30510
30511 // If the target doesn't support variable shifts, use either FP conversion
30512 // or integer multiplication to avoid shifting each element individually.
30513 if (VT == MVT::v4i32) {
30514 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30515 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30516 DAG.getConstant(0x3f800000U, dl, VT));
30517 Amt = DAG.getBitcast(MVT::v4f32, Amt);
30518 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30519 }
30520
30521 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
30522 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30523 SDValue Z = DAG.getConstant(0, dl, VT);
30524 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30525 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30526 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30527 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30528 if (Subtarget.hasSSE41())
30529 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30530 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30531 }
30532
30533 return SDValue();
30534}
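// [Annotation added for this listing, not part of the upstream source] Worked
// example of the v4i32 path above: for a shift amount of 5 per lane,
// (5 << 23) + 0x3f800000 = 0x42000000, which is the IEEE-754 encoding of
// 32.0f, so the FP_TO_SINT produces 32 = 1 << 5, i.e. the desired
// multiplication scale factor.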
30535
30536static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30537 SelectionDAG &DAG) {
30538 MVT VT = Op.getSimpleValueType();
30539 SDLoc dl(Op);
30540 SDValue R = Op.getOperand(0);
30541 SDValue Amt = Op.getOperand(1);
30542 unsigned NumElts = VT.getVectorNumElements();
30543 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30544 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30545
30546 unsigned Opc = Op.getOpcode();
30547 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30548 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30549
30550 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30551 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30552
30553 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30554 return V;
30555
30556 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30557 return V;
30558
30559 if (supportedVectorVarShift(VT, Subtarget, Opc))
30560 return Op;
30561
30562 // i64 vector arithmetic shift can be emulated with the transform:
30563 // M = lshr(SIGN_MASK, Amt)
30564 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
30565 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30566 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30567 Opc == ISD::SRA) {
30568 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30569 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30570 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30571 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30572 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30573 return R;
30574 }
30575
30576 // XOP has 128-bit variable logical/arithmetic shifts.
30577 // +ve/-ve Amt = shift left/right.
30578 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30579 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30580 if (Opc == ISD::SRL || Opc == ISD::SRA)
30581 Amt = DAG.getNegative(Amt, dl, VT);
30582 if (Opc == ISD::SHL || Opc == ISD::SRL)
30583 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30584 if (Opc == ISD::SRA)
30585 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30586 }
30587
30588 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
30589 // shifts per-lane and then shuffle the partial results back together.
30590 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30591 // Splat the shift amounts so the scalar shifts above will catch it.
30592 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30593 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30594 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30595 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30596 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30597 }
30598
30599 // Build a map of in-range constant amounts to the element mask of where they occur.
30600 SmallDenseMap<unsigned, APInt, 4> UniqueCstAmt;
30601 if (ConstantAmt) {
30602 for (unsigned I = 0; I != NumElts; ++I) {
30603 SDValue A = Amt.getOperand(I);
30604 if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
30605 continue;
30606 unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
30607 auto [It, Inserted] = UniqueCstAmt.try_emplace(CstAmt);
30608 if (!Inserted) {
30609 It->second.setBit(I);
30610 continue;
30611 }
30612 It->second = APInt::getOneBitSet(NumElts, I);
30613 }
30614 assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
30615 }
30616
30617 // If possible, lower this shift as a sequence of two shifts by
30618 // constant plus a BLENDing shuffle instead of scalarizing it.
30619 // Example:
30620 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30621 //
30622 // Could be rewritten as:
30623 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30624 //
30625 // The advantage is that the two shifts from the example would be
30626 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30627 if (UniqueCstAmt.size() == 2 &&
30628 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30629 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30630 unsigned AmtA = UniqueCstAmt.begin()->first;
30631 unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
30632 const APInt &MaskA = UniqueCstAmt.begin()->second;
30633 const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
30634 SmallVector<int, 8> ShuffleMask(NumElts, SM_SentinelUndef);
30635 for (unsigned I = 0; I != NumElts; ++I) {
30636 if (MaskA[I])
30637 ShuffleMask[I] = I;
30638 if (MaskB[I])
30639 ShuffleMask[I] = I + NumElts;
30640 }
30641
30642 // Only perform this blend if we can perform it without loading a mask.
30643 if ((VT != MVT::v16i16 ||
30644 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30645 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30646 canWidenShuffleElements(ShuffleMask))) {
30647 SDValue Shift1 =
30648 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtA, dl, VT));
30649 SDValue Shift2 =
30650 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtB, dl, VT));
30651 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30652 }
30653 }
30654
30655 // Constant ISD::SRA/SRL/SHL can be performed efficiently on vXiN vectors by
30656 // using vYiM vector operations where X*N == Y*M and M > N.
30657 if (ConstantAmt &&
30658 (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30659 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16) &&
30660 !Subtarget.hasXOP()) {
30661 MVT NarrowScalarVT = VT.getScalarType();
30662 // We can do this extra fast if each pair of narrow elements is shifted by
30663 // the same amount, SWAR style: use a shift to move the valid bits to the
30664 // right position, then mask out any bits which crossed from one element to
30665 // the other.
30666 // This optimized lowering is only valid if the elements in a pair can
30667 // be treated identically.
30668 SmallVector<SDValue, 32> AmtWideElts(Amt->ops());
30669 SmallVector<SDValue, 32> TmpAmtWideElts;
30670 int WideEltSizeInBits = EltSizeInBits;
30671 while (WideEltSizeInBits < 32) {
30672 // AVX1 does not have psrlvd, etc. which makes interesting 32-bit shifts
30673 // unprofitable.
30674 if (WideEltSizeInBits >= 16 && !Subtarget.hasAVX2()) {
30675 break;
30676 }
30677 TmpAmtWideElts.resize(AmtWideElts.size() / 2);
30678 bool SameShifts = true;
30679 for (unsigned SrcI = 0, E = AmtWideElts.size(); SrcI != E; SrcI += 2) {
30680 unsigned DstI = SrcI / 2;
30681 // Both elements are undef? Make a note and keep going.
30682 if (AmtWideElts[SrcI].isUndef() && AmtWideElts[SrcI + 1].isUndef()) {
30683 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30684 continue;
30685 }
30686 // Even element is undef? We will shift it by the same shift amount as
30687 // the odd element.
30688 if (AmtWideElts[SrcI].isUndef()) {
30689 TmpAmtWideElts[DstI] = AmtWideElts[SrcI + 1];
30690 continue;
30691 }
30692 // Odd element is undef? We will shift it by the same shift amount as
30693 // the even element.
30694 if (AmtWideElts[SrcI + 1].isUndef()) {
30695 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30696 continue;
30697 }
30698 // Both elements are equal.
30699 if (AmtWideElts[SrcI].getNode()->getAsAPIntVal() ==
30700 AmtWideElts[SrcI + 1].getNode()->getAsAPIntVal()) {
30701 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30702 continue;
30703 }
30704 // One of the provisional wide elements will not have the same shift
30705 // amount. Let's bail.
30706 SameShifts = false;
30707 break;
30708 }
30709 if (!SameShifts) {
30710 break;
30711 }
30712 WideEltSizeInBits *= 2;
30713 std::swap(TmpAmtWideElts, AmtWideElts);
30714 }
30715 APInt APIntShiftAmt;
30716 bool IsConstantSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30717 bool Profitable = WidenShift;
30718 // AVX512BW brings support for vpsllvw.
30719 if (WideEltSizeInBits * AmtWideElts.size() >= 512 &&
30720 WideEltSizeInBits < 32 && !Subtarget.hasBWI()) {
30721 Profitable = false;
30722 }
30723 // Leave AVX512 uniform arithmetic shifts alone, they can be implemented
30724 // fairly cheaply in other ways.
30725 if (WideEltSizeInBits * AmtWideElts.size() >= 512 && IsConstantSplat) {
30726 Profitable = false;
30727 }
30728 // Leave it up to GFNI if we have it around.
30729 // TODO: gf2p8affine is usually higher latency and more port restricted. It
30730 // is probably a win to use other strategies in some cases.
30731 if (EltSizeInBits == 8 && Subtarget.hasGFNI()) {
30732 Profitable = false;
30733 }
30734
30735 // AVX1 does not have vpand which makes our masking impractical. It does
30736 // have vandps but that is an FP instruction and crossing FP<->int typically
30737 // has some cost.
30738 if (WideEltSizeInBits * AmtWideElts.size() >= 256 &&
30739 (WideEltSizeInBits < 32 || IsConstantSplat) && !Subtarget.hasAVX2()) {
30740 Profitable = false;
30741 }
30742 unsigned WideNumElts = AmtWideElts.size();
30743 // We are only dealing with identical pairs.
30744 if (Profitable && WideNumElts != NumElts) {
30745 MVT WideScalarVT = MVT::getIntegerVT(WideEltSizeInBits);
30746 MVT WideVT = MVT::getVectorVT(WideScalarVT, WideNumElts);
30747 // Cast the operand to vXiM.
30748 SDValue RWide = DAG.getBitcast(WideVT, R);
30749 // Create our new vector of shift amounts.
30750 SDValue AmtWide = DAG.getBuildVector(
30751 MVT::getVectorVT(NarrowScalarVT, WideNumElts), dl, AmtWideElts);
30752 AmtWide = DAG.getZExtOrTrunc(AmtWide, dl, WideVT);
30753 // Perform the actual shift.
30754 unsigned LogicalOpc = Opc == ISD::SRA ? (unsigned)ISD::SRL : Opc;
30755 SDValue ShiftedR = DAG.getNode(LogicalOpc, dl, WideVT, RWide, AmtWide);
30756 // Now we need to construct a mask which will "drop" bits that get
30757 // shifted past the LSB/MSB. For a logical shift left, it will look
30758 // like:
30759 // FullMask = (1 << EltSizeInBits) - 1
30760 // Mask = FullMask << Amt
30761 //
30762 // This masking ensures that bits cannot migrate from one narrow lane to
30763 // another. The construction of this mask will be constant folded.
30764 // The mask for a logical right shift is nearly identical, the only
30765 // difference is that the all ones mask is shifted right instead of left.
30766 SDValue SplatFullMask = DAG.getAllOnesConstant(dl, VT);
30767 SDValue Mask = DAG.getNode(LogicalOpc, dl, VT, SplatFullMask, Amt);
30768 Mask = DAG.getBitcast(WideVT, Mask);
30769 // Finally, we mask the shifted vector with the SWAR mask.
30770 SDValue Masked = DAG.getNode(ISD::AND, dl, WideVT, ShiftedR, Mask);
30771 Masked = DAG.getBitcast(VT, Masked);
30772 if (Opc != ISD::SRA) {
30773 // Logical shifts are complete at this point.
30774 return Masked;
30775 }
30776 // At this point, we have done a *logical* shift right. We now need to
30777 // sign extend the result so that we get behavior equivalent to an
30778 // arithmetic shift right. Post-shifting by AmtWide, our narrow elements
30779 // are `EltSizeInBits-AmtWide` bits wide.
30780 //
30781 // To convert our `EltSizeInBits-AmtWide` bit unsigned numbers to signed
30782 // numbers as wide as `EltSizeInBits`, we need to replicate the bit at
30783 // position `EltSizeInBits-AmtWide` into the MSBs of each narrow lane. We
30784 // can use the following trick to accomplish this:
30785 // SignBitMask = 1 << (EltSizeInBits-AmtWide-1)
30786 // (Masked ^ SignBitMask) - SignBitMask
30787 //
30788 // When the sign bit is already clear, this will compute:
30789 // Masked + SignBitMask - SignBitMask
30790 //
30791 // This is equal to Masked which is what we want: the sign bit was clear
30792 // so sign extending should be a no-op.
30793 //
30794 // When the sign bit is set, this will compute:
30795 // Masked - SignBitmask - SignBitMask
30796 //
30797 // This is equal to Masked - 2*SignBitMask which will correctly sign
30798 // extend our result.
30799 SDValue SplatHighBit =
30800 DAG.getConstant(APInt::getSignMask(EltSizeInBits), dl, VT);
30801 // This does not induce recursion, all operands are constants.
30802 SDValue SignBitMask = DAG.getNode(LogicalOpc, dl, VT, SplatHighBit, Amt);
30803 SDValue FlippedSignBit =
30804 DAG.getNode(ISD::XOR, dl, VT, Masked, SignBitMask);
30805 SDValue Subtraction =
30806 DAG.getNode(ISD::SUB, dl, VT, FlippedSignBit, SignBitMask);
30807 return Subtraction;
30808 }
30809 }
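// [Annotation added for this listing, not part of the upstream source] Worked
// example of the SWAR widening above: a v16i8 srl where both bytes of each
// pair are shifted by 3 is performed as a v8i16 srl by 3, followed by an AND
// with the per-byte mask 0x1F (computed by shifting the all-ones vXi8 vector
// right by the same amounts). For SRA the extra ((x ^ 0x10) - 0x10) step,
// with 0x10 = 0x80 >> 3, replicates the sign bit back into the cleared
// positions.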
30810
30811 // If possible, lower this packed shift into a vector multiply instead of
30812 // expanding it into a sequence of scalar shifts.
30813 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30814 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30815 Subtarget.canExtendTo512BW())))
30816 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30817 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30818
30819 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30820 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
30821 if (Opc == ISD::SRL && ConstantAmt &&
30822 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30823 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30824 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30825 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30826 SDValue Zero = DAG.getConstant(0, dl, VT);
30827 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30828 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30829 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30830 }
30831 }
30832
30833 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30834 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30835 // TODO: Special case handling for shift by 0/1, really we can afford either
30836 // of these cases in pre-SSE41/XOP/AVX512 but not both.
30837 if (Opc == ISD::SRA && ConstantAmt &&
30838 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30839 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30840 !Subtarget.hasAVX512()) ||
30841 DAG.isKnownNeverZero(Amt))) {
30842 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30843 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30844 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30845 SDValue Amt0 =
30846 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30847 SDValue Amt1 =
30848 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30849 SDValue Sra1 =
30850 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30851 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30852 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30853 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30854 }
30855 }
30856
30857 // v4i32 Non Uniform Shifts.
30858 // If the shift amount is constant we can shift each lane using the SSE2
30859 // immediate shifts, else we need to zero-extend each lane to the lower i64
30860 // and shift using the SSE2 variable shifts.
30861 // The separate results can then be blended together.
30862 if (VT == MVT::v4i32) {
30863 SDValue Amt0, Amt1, Amt2, Amt3;
30864 if (ConstantAmt) {
30865 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30866 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30867 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30868 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30869 } else {
30870 // The SSE2 shifts use the lower i64 as the same shift amount for
30871 // all lanes and the upper i64 is ignored. On AVX we're better off
30872 // just zero-extending, but for SSE just duplicating the top 16-bits is
30873 // cheaper and has the same effect for out of range values.
30874 if (Subtarget.hasAVX()) {
30875 SDValue Z = DAG.getConstant(0, dl, VT);
30876 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30877 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30878 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30879 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30880 } else {
30881 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30882 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30883 {4, 5, 6, 7, -1, -1, -1, -1});
30884 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30885 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30886 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30887 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30888 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30889 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30890 }
30891 }
30892
30893 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30894 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30895 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30896 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30897 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30898
30899 // Merge the shifted lane results optimally with/without PBLENDW.
30900 // TODO - ideally shuffle combining would handle this.
30901 if (Subtarget.hasSSE41()) {
30902 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30903 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30904 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30905 }
30906 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30907 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30908 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30909 }
30910
30911 // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30912 // look up the pre-computed shift values.
30913 if ((VT == MVT::v16i8 && Subtarget.hasSSSE3()) ||
30914 (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30915 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30916 unsigned NumLanes = VT.getSizeInBits() / 128u;
30917 unsigned NumEltsPerLane = NumElts / NumLanes;
30918 SmallVector<APInt, 64> LUT;
30919 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
30920 unsigned LoElt = Lane * NumEltsPerLane;
30921 APInt EltMask = APInt::getBitsSet(NumElts, LoElt, LoElt + NumEltsPerLane);
30922 KnownBits KnownLane = DAG.computeKnownBits(R, EltMask);
30923 if (!KnownLane.isConstant())
30924 break;
30925 const APInt &LaneSplat = KnownLane.getConstant();
30926 for (unsigned I = 0; I != 8; ++I) {
30927 if (Opc == ISD::SHL)
30928 LUT.push_back(LaneSplat.shl(I));
30929 else if (Opc == ISD::SRL)
30930 LUT.push_back(LaneSplat.lshr(I));
30931 else if (Opc == ISD::SRA)
30932 LUT.push_back(LaneSplat.ashr(I));
30933 }
30934 LUT.append(8, APInt::getZero(8));
30935 }
30936 if (LUT.size() == NumElts) {
30937 APInt Undefs = APInt::getSplat(NumElts, APInt(16, 0xFF00));
30938 SDValue Mask = getConstVector(LUT, Undefs, VT, DAG, dl);
30939 return DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
30940 }
30941 }
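// [Annotation added for this listing, not part of the upstream source] Example
// of the PSHUFB lookup above: if every byte of R within a 128-bit lane is the
// constant 0x12, the table holds 0x12 << 0..7 = {0x12, 0x24, 0x48, 0x90,
// 0x20, 0x40, 0x80, 0x00} (truncated to 8 bits) followed by zeros, and PSHUFB
// uses each byte of Amt as an index, yielding the per-byte shifted value
// without any actual shift instruction.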
30942
30943 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30944 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30945 // make the existing SSE solution better.
30946 // NOTE: We honor the preferred vector width before promoting to 512-bits.
30947 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30948 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30949 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30950 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30951 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30952 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30953 "Unexpected vector type");
30954 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30955 MVT ExtVT = MVT::getVectorVT(EvtSVT, NumElts);
30956 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30957 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30958 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30959 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30960 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30961 }
30962
30963 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30964 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
30965 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30966 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30967 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30968 !Subtarget.hasXOP()) {
30969 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30970 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30971
30972 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30973 // isn't legal).
30974 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30975 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30976 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30977 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30979 "Constant build vector expected");
30980
30981 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30982 bool IsSigned = Opc == ISD::SRA;
30983 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
30984 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30985 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30986 return DAG.getZExtOrTrunc(R, dl, VT);
30987 }
30988
30989 SmallVector<SDValue, 16> LoAmt, HiAmt;
30990 for (unsigned i = 0; i != NumElts; i += 16) {
30991 for (int j = 0; j != 8; ++j) {
30992 LoAmt.push_back(Amt.getOperand(i + j));
30993 HiAmt.push_back(Amt.getOperand(i + j + 8));
30994 }
30995 }
30996
30997 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30998 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
30999
31000 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
31001 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
31002 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
31003 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
31004 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
31005 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
31006 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
31007 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
31008 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
31009 }
31010
31011 if (VT == MVT::v16i8 ||
31012 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
31013 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
31014 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31015
31016 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31017 if (VT.is512BitVector()) {
31018 // On AVX512BW targets we make use of the fact that VSELECT lowers
31019 // to a masked blend which selects bytes based just on the sign bit
31020 // extracted to a mask.
31021 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31022 V0 = DAG.getBitcast(VT, V0);
31023 V1 = DAG.getBitcast(VT, V1);
31024 Sel = DAG.getBitcast(VT, Sel);
31025 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31026 ISD::SETGT);
31027 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31028 } else if (Subtarget.hasSSE41()) {
31029 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31030 // on the sign bit.
31031 V0 = DAG.getBitcast(VT, V0);
31032 V1 = DAG.getBitcast(VT, V1);
31033 Sel = DAG.getBitcast(VT, Sel);
31034 return DAG.getBitcast(SelVT,
31035 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31036 }
31037 // On pre-SSE41 targets we test for the sign bit by comparing to
31038 // zero - a negative value will set all bits of the lanes to true
31039 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31040 SDValue Z = DAG.getConstant(0, dl, SelVT);
31041 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31042 return DAG.getSelect(dl, SelVT, C, V0, V1);
31043 };
31044
31045 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31046 // We can safely do this using i16 shifts as we're only interested in
31047 // the 3 lower bits of each byte.
31048 Amt = DAG.getBitcast(ExtVT, Amt);
31049 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31050 Amt = DAG.getBitcast(VT, Amt);
31051
31052 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31053 // r = VSELECT(r, shift(r, 4), a);
31054 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31055 R = SignBitSelect(VT, Amt, M, R);
31056
31057 // a += a
31058 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31059
31060 // r = VSELECT(r, shift(r, 2), a);
31061 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31062 R = SignBitSelect(VT, Amt, M, R);
31063
31064 // a += a
31065 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31066
31067 // return VSELECT(r, shift(r, 1), a);
31068 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31069 R = SignBitSelect(VT, Amt, M, R);
31070 return R;
31071 }
31072
31073 if (Opc == ISD::SRA) {
31074 // For SRA we need to unpack each byte to the higher byte of a i16 vector
31075 // so we can correctly sign extend. We don't care what happens to the
31076 // lower byte.
31077 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31078 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31079 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31080 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31081 ALo = DAG.getBitcast(ExtVT, ALo);
31082 AHi = DAG.getBitcast(ExtVT, AHi);
31083 RLo = DAG.getBitcast(ExtVT, RLo);
31084 RHi = DAG.getBitcast(ExtVT, RHi);
31085
31086 // r = VSELECT(r, shift(r, 4), a);
31087 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31088 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31089 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31090 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31091
31092 // a += a
31093 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31094 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31095
31096 // r = VSELECT(r, shift(r, 2), a);
31097 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31098 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31099 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31100 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31101
31102 // a += a
31103 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31104 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31105
31106 // r = VSELECT(r, shift(r, 1), a);
31107 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31108 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31109 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31110 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31111
31112 // Logical shift the result back to the lower byte, leaving a zero upper
31113 // byte meaning that we can safely pack with PACKUSWB.
31114 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31115 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31116 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31117 }
31118 }
31119
31120 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31121 MVT ExtVT = MVT::v8i32;
31122 SDValue Z = DAG.getConstant(0, dl, VT);
31123 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31124 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31125 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31126 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31127 ALo = DAG.getBitcast(ExtVT, ALo);
31128 AHi = DAG.getBitcast(ExtVT, AHi);
31129 RLo = DAG.getBitcast(ExtVT, RLo);
31130 RHi = DAG.getBitcast(ExtVT, RHi);
31131 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31132 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31133 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31134 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31135 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31136 }
31137
31138 if (VT == MVT::v8i16) {
31139 // If we have a constant shift amount, the non-SSE41 path is best as
31140 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
31141 bool UseSSE41 = Subtarget.hasSSE41() &&
31142 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31143
31144 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31145 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31146 // the sign bit.
31147 if (UseSSE41) {
31148 MVT ExtVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
31149 V0 = DAG.getBitcast(ExtVT, V0);
31150 V1 = DAG.getBitcast(ExtVT, V1);
31151 Sel = DAG.getBitcast(ExtVT, Sel);
31152 return DAG.getBitcast(
31153 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31154 }
31155 // On pre-SSE41 targets we splat the sign bit - a negative value will
31156 // set all bits of the lanes to true and VSELECT uses that in
31157 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31158 SDValue C =
31159 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31160 return DAG.getSelect(dl, VT, C, V0, V1);
31161 };
31162
31163 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31164 if (UseSSE41) {
31165 // On SSE41 targets we need to replicate the shift mask in both
31166 // bytes for PBLENDVB.
31167 Amt = DAG.getNode(
31168 ISD::OR, dl, VT,
31169 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31170 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31171 } else {
31172 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31173 }
31174
31175 // r = VSELECT(r, shift(r, 8), a);
31176 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31177 R = SignBitSelect(Amt, M, R);
31178
31179 // a += a
31180 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31181
31182 // r = VSELECT(r, shift(r, 4), a);
31183 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31184 R = SignBitSelect(Amt, M, R);
31185
31186 // a += a
31187 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31188
31189 // r = VSELECT(r, shift(r, 2), a);
31190 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31191 R = SignBitSelect(Amt, M, R);
31192
31193 // a += a
31194 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31195
31196 // return VSELECT(r, shift(r, 1), a);
31197 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31198 R = SignBitSelect(Amt, M, R);
31199 return R;
31200 }
31201
31202 // Decompose 256-bit shifts into 128-bit shifts.
31203 if (VT.is256BitVector())
31204 return splitVectorIntBinary(Op, DAG, dl);
31205
31206 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31207 return splitVectorIntBinary(Op, DAG, dl);
31208
31209 return SDValue();
31210}
31211
31212 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31213 SelectionDAG &DAG) {
31214 MVT VT = Op.getSimpleValueType();
31215 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31216 "Unexpected funnel shift opcode!");
31217
31218 SDLoc DL(Op);
31219 SDValue Op0 = Op.getOperand(0);
31220 SDValue Op1 = Op.getOperand(1);
31221 SDValue Amt = Op.getOperand(2);
31222 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31223 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31224
31225 if (VT.isVector()) {
31226 APInt APIntShiftAmt;
31227 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31228 unsigned NumElts = VT.getVectorNumElements();
31229
31230 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31231 if (IsFSHR)
31232 std::swap(Op0, Op1);
31233
31234 if (IsCstSplat) {
31235 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31236 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31237 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31238 {Op0, Op1, Imm}, DAG, Subtarget);
31239 }
31240 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
31241 {Op0, Op1, Amt}, DAG, Subtarget);
31242 }
31243 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31244 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31245 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31246 "Unexpected funnel shift type!");
31247
31248 // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
31249 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
31250 if (IsCstSplat) {
31251 // TODO: Can't use generic expansion as UNDEF amt elements can be
31252 // converted to other values when folded to shift amounts, losing the
31253 // splat.
31254 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31255 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
31256 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
31257 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
31258 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31259
31260 if (EltSizeInBits == 8 &&
31261 (Subtarget.hasXOP() ||
31262 (useVPTERNLOG(Subtarget, VT) &&
31263 supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
31264 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
31265 // bit-select - lower using vXi16 shifts and then perform the bitmask at
31266 // the original vector width to handle cases where we split.
31267 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
31268 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
31269 SDValue ShX =
31270 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
31271 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
31272 SDValue ShY =
31273 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
31274 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
31275 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
31276 DAG.getConstant(MaskX, DL, VT));
31277 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
31278 DAG.getConstant(MaskY, DL, VT));
31279 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31280 }
31281
31282 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
31283 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
31284 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
31285 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
31286 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31287 }
31288
31289 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31290 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31291 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31292
31293 // Constant vXi16 funnel shifts can be efficiently handled by default.
31294 if (IsCst && EltSizeInBits == 16)
31295 return SDValue();
31296
31297 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31298 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31299 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31300
31301 // Split 256-bit integers on XOP/pre-AVX2 targets.
31302 // Split 512-bit integers on non 512-bit BWI targets.
31303 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31304 !Subtarget.hasAVX2())) ||
31305 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31306 EltSizeInBits < 32)) {
31307 // Pre-mask the amount modulo using the wider vector.
31308 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31309 return splitVectorOp(Op, DAG, DL);
31310 }
31311
31312 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31313 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31314 int ScalarAmtIdx = -1;
31315 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31316 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31317 if (EltSizeInBits == 16)
31318 return SDValue();
31319
31320 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31321 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31322 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31323 ScalarAmtIdx, Subtarget, DAG);
31324 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31325 ScalarAmtIdx, Subtarget, DAG);
31326 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31327 }
31328 }
31329
31330 MVT WideSVT = MVT::getIntegerVT(
31331 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31332 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31333
31334 // If per-element shifts are legal, fallback to generic expansion.
31335 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31336 return SDValue();
31337
31338 // Attempt to fold as:
31339 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31340 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31341 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31342 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31343 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31344 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31345 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31346 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31347 EltSizeInBits, DAG);
31348 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31349 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31350 if (!IsFSHR)
31351 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31352 EltSizeInBits, DAG);
31353 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31354 }
31355
31356 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31357 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31358 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31359 SDValue Z = DAG.getConstant(0, DL, VT);
31360 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31361 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31362 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31363 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31364 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31365 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31366 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31367 }
31368
31369 // Fallback to generic expansion.
31370 return SDValue();
31371 }
31372 assert(
31373 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31374 "Unexpected funnel shift type!");
31375
31376 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31377 bool OptForSize = DAG.shouldOptForSize();
31378 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31379
31380 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31381 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31382 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31383 !isa<ConstantSDNode>(Amt)) {
31384 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31385 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31386 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31387 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31388 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31389 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31390 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31391 if (IsFSHR) {
31392 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31393 } else {
31394 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31395 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31396 }
31397 return DAG.getZExtOrTrunc(Res, DL, VT);
31398 }
31399
31400 if (VT == MVT::i8 || ExpandFunnel)
31401 return SDValue();
31402
31403 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
31404 if (VT == MVT::i16) {
31405 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31406 DAG.getConstant(15, DL, Amt.getValueType()));
31407 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31408 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31409 }
31410
31411 return Op;
31412}
31413
31414static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31415 SelectionDAG &DAG) {
31416 MVT VT = Op.getSimpleValueType();
31417 assert(VT.isVector() && "Custom lowering only for vector rotates!");
31418
31419 SDLoc DL(Op);
31420 SDValue R = Op.getOperand(0);
31421 SDValue Amt = Op.getOperand(1);
31422 unsigned Opcode = Op.getOpcode();
31423 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31424 int NumElts = VT.getVectorNumElements();
31425 bool IsROTL = Opcode == ISD::ROTL;
31426
31427 // Check for constant splat rotation amount.
31428 APInt CstSplatValue;
31429 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31430
31431 // Check for splat rotate by zero.
31432 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31433 return R;
31434
31435 // AVX512 implicitly uses modulo rotation amounts.
31436 if ((Subtarget.hasVLX() || Subtarget.hasAVX512()) && 32 <= EltSizeInBits) {
31437 // Attempt to rotate by immediate.
31438 if (IsCstSplat) {
31439 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31440 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31441 return DAG.getNode(RotOpc, DL, VT, R,
31442 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31443 }
31444
31445 // Else, fall back on VPROLV/VPRORV.
31446 return Op;
31447 }
31448
31449 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31450 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31451 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31452 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31453 }
31454
31455 SDValue Z = DAG.getConstant(0, DL, VT);
31456
31457 if (!IsROTL) {
31458 // If the ISD::ROTR amount is constant, we're always better off converting
31459 // to ISD::ROTL.
31460 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31461 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31462
31463 // XOP targets always prefer ISD::ROTL.
31464 if (Subtarget.hasXOP())
31465 return DAG.getNode(ISD::ROTL, DL, VT, R,
31466 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31467 }
31468
31469 // Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
31470 if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
31471 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
31472 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31473 SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt);
31474 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
31475 DAG.getTargetConstant(0, DL, MVT::i8));
31476 }
31477
31478 // Split 256-bit integers on XOP/pre-AVX2 targets.
31479 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31480 return splitVectorIntBinary(Op, DAG, DL);
31481
31482 // XOP has 128-bit vector variable + immediate rotates.
31483 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31484 // XOP implicitly uses modulo rotation amounts.
31485 if (Subtarget.hasXOP()) {
31486 assert(IsROTL && "Only ROTL expected");
31487 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31488
31489 // Attempt to rotate by immediate.
31490 if (IsCstSplat) {
31491 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31492 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31493 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31494 }
31495
31496 // Use general rotate by variable (per-element).
31497 return Op;
31498 }
31499
31500 // Rotate by a uniform constant - expand back to shifts.
31501 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
31502 // to other values when folded to shift amounts, losing the splat.
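// For example, a v4i32 rotl by a splat amount of 5 expands below to
// or (shl x, 5), (srl x, 27).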
31503 if (IsCstSplat) {
31504 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31505 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
31506 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
31507 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
31508 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
31509 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
31510 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
31511 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
31512 }
31513
31514 // Split 512-bit integers on non 512-bit BWI targets.
31515 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31516 return splitVectorIntBinary(Op, DAG, DL);
31517
31518 assert(
31519 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31520 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31521 Subtarget.hasAVX2()) ||
31522 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31523 "Only vXi32/vXi16/vXi8 vector rotates supported");
31524
31525 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31526 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31527
31528 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31529 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31530
31531 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31532 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31533 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
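// Worked example (illustrative): for vXi8 rotl by a splat amount of 3, each
// unpacked i16 lane holds (x:x); shifting it left by 3 and packing back the
// high bytes yields (x << 3) | (x >> 5) == rotl(x, 3).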
31534 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31535 int BaseRotAmtIdx = -1;
31536 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31537 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31538 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31539 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31540 }
31541 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31542 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31543 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31544 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31545 BaseRotAmtIdx, Subtarget, DAG);
31546 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31547 BaseRotAmtIdx, Subtarget, DAG);
31548 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31549 }
31550 }
31551
31552 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31553 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31554
31555 // Attempt to fold as unpack(x,x) << zext(y):
31556 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31557 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31558 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
31559 if (!(ConstantAmt && EltSizeInBits != 8) &&
31560 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
31561 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
31562 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31563 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31564 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31565 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31566 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31567 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31568 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31569 }
31570
31571 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31572 // the amount bit.
31573 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
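// Sketch of the idea: the three low amount bits select, in turn, whether to
// rotate by 4, 2 and 1, e.g. rotl(x, 6) == rotl(rotl(x, 4), 2).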
31574 if (EltSizeInBits == 8) {
31575 MVT WideVT =
31576 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31577
31578 // Attempt to fold as:
31579 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31580 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
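// Worked example (illustrative): for a vXi8 rotl by amount a, the widened
// lane holds (x:x), and (((x:x) << a) >> 8) truncated back to i8 equals
// (x << a) | (x >> (8 - a)) == rotl(x, a).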
31581 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31582 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31583 // If we're rotating by constant, just use default promotion.
31584 if (ConstantAmt)
31585 return SDValue();
31586 // See if we can perform this by widening to vXi16 or vXi32.
31587 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31588 R = DAG.getNode(
31589 ISD::OR, DL, WideVT, R,
31590 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31591 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31592 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31593 if (IsROTL)
31594 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31595 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31596 }
31597
31598 // We don't need ModuloAmt here as we just peek at individual bits.
31599 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31600 if (Subtarget.hasSSE41()) {
31601 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31602 // on the sign bit.
31603 V0 = DAG.getBitcast(VT, V0);
31604 V1 = DAG.getBitcast(VT, V1);
31605 Sel = DAG.getBitcast(VT, Sel);
31606 return DAG.getBitcast(SelVT,
31607 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31608 }
31609 // On pre-SSE41 targets we test for the sign bit by comparing to
31610 // zero - a negative value will set all bits of the lanes to true
31611 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31612 SDValue Z = DAG.getConstant(0, DL, SelVT);
31613 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31614 return DAG.getSelect(DL, SelVT, C, V0, V1);
31615 };
31616
31617 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31618 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31619 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31620 IsROTL = true;
31621 }
31622
31623 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31624 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31625
31626 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31627 // We can safely do this using i16 shifts as we're only interested in
31628 // the 3 lower bits of each byte.
31629 Amt = DAG.getBitcast(ExtVT, Amt);
31630 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31631 Amt = DAG.getBitcast(VT, Amt);
31632
31633 // r = VSELECT(r, rot(r, 4), a);
31634 SDValue M;
31635 M = DAG.getNode(
31636 ISD::OR, DL, VT,
31637 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31638 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31639 R = SignBitSelect(VT, Amt, M, R);
31640
31641 // a += a
31642 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31643
31644 // r = VSELECT(r, rot(r, 2), a);
31645 M = DAG.getNode(
31646 ISD::OR, DL, VT,
31647 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31648 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31649 R = SignBitSelect(VT, Amt, M, R);
31650
31651 // a += a
31652 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31653
31654 // return VSELECT(r, rot(r, 1), a);
31655 M = DAG.getNode(
31656 ISD::OR, DL, VT,
31657 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31658 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31659 return SignBitSelect(VT, Amt, M, R);
31660 }
31661
31662 bool IsSplatAmt = DAG.isSplatValue(Amt);
31663 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31664 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31665
31666 // Fallback for splats + all supported variable shifts.
31667 // Fallback for non-constant AVX2 vXi16 as well.
31668 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31669 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31670 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31671 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31672 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31673 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31674 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31675 }
31676
31677 // Everything below assumes ISD::ROTL.
31678 if (!IsROTL) {
31679 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31680 IsROTL = true;
31681 }
31682
31683 // ISD::ROT* uses modulo rotate amounts.
31684 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31685
31686 assert(IsROTL && "Only ROTL supported");
31687
31688 // As with shifts, attempt to convert the rotation amount to a multiplication
31689 // factor; otherwise fall back to general expansion.
31690 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31691 if (!Scale)
31692 return SDValue();
31693
31694 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
31695 if (EltSizeInBits == 16) {
31696 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31697 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31698 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31699 }
31700
31701 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31702 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31703 // that can then be OR'd with the lower 32-bits.
31704 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31705 static const int OddMask[] = {1, 1, 3, 3};
31706 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31707 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31708
31709 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31710 DAG.getBitcast(MVT::v2i64, R),
31711 DAG.getBitcast(MVT::v2i64, Scale));
31712 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31713 DAG.getBitcast(MVT::v2i64, R13),
31714 DAG.getBitcast(MVT::v2i64, Scale13));
31715 Res02 = DAG.getBitcast(VT, Res02);
31716 Res13 = DAG.getBitcast(VT, Res13);
31717
31718 return DAG.getNode(ISD::OR, DL, VT,
31719 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31720 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31721}
31722
31723/// Returns true if the operand type is exactly twice the native width, and
31724/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31725/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31726/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31727bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31728 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31729
31730 if (OpWidth == 64)
31731 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31732 if (OpWidth == 128)
31733 return Subtarget.canUseCMPXCHG16B();
31734
31735 return false;
31736}
31737
31738TargetLowering::AtomicExpansionKind
31739X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31740 Type *MemType = SI->getValueOperand()->getType();
31741
31742 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31743 !Subtarget.useSoftFloat()) {
31744 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31745 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31746 return AtomicExpansionKind::None;
31747
31748 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31749 Subtarget.hasAVX())
31750 return AtomicExpansionKind::None;
31751 }
31752
31753 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31754 : AtomicExpansionKind::None;
31755}
31756
31757// Note: this turns large loads into lock cmpxchg8b/16b.
31758TargetLowering::AtomicExpansionKind
31759X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31760 Type *MemType = LI->getType();
31761
31762 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31763 !Subtarget.useSoftFloat()) {
31764 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
31765 // can use movq to do the load. If we have X87 we can load into an 80-bit
31766 // X87 register and store it to a stack temporary.
31767 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31768 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31769 return AtomicExpansionKind::None;
31770
31771 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
31772 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31773 Subtarget.hasAVX())
31774 return AtomicExpansionKind::None;
31775 }
31776
31777 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31778 : AtomicExpansionKind::None;
31779}
31780
31781enum BitTestKind : unsigned {
31782 UndefBit,
31783 ConstantBit,
31784 NotConstantBit,
31785 ShiftBit,
31786 NotShiftBit
31787};
31788
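// Examples of what FindSingleBitChange recognizes (illustrative): a constant
// power of two such as 8 (ConstantBit), its complement ~8 (NotConstantBit),
// a value built as (1 << x) (ShiftBit) or ~(1 << x) (NotShiftBit); a shift
// amount of the form (x & (bitwidth - 1)) is looked through to return x.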
31789static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31790 using namespace llvm::PatternMatch;
31791 BitTestKind BTK = UndefBit;
31792 if (auto *C = dyn_cast<ConstantInt>(V)) {
31793 // Check if V is a power of 2 or NOT power of 2.
31794 if (isPowerOf2_64(C->getZExtValue()))
31795 BTK = ConstantBit;
31796 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31797 BTK = NotConstantBit;
31798 return {V, BTK};
31799 }
31800
31801 // Check if V is some power of 2 pattern known to be non-zero
31802 if (auto *I = dyn_cast<Instruction>(V)) {
31803 bool Not = false;
31804 // Check if we have a NOT
31805 Value *PeekI;
31806 if (match(I, m_Not(m_Value(PeekI))) ||
31807 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31808 Not = true;
31809 I = dyn_cast<Instruction>(PeekI);
31810
31811 // If I is constant, it will fold and we can evaluate later. If it's an
31812 // argument or something of that nature, we can't analyze.
31813 if (I == nullptr)
31814 return {nullptr, UndefBit};
31815 }
31816 // We can only use 1 << X without more sophisticated analysis. C << X where
31817 // C is a power of 2 but not 1 can result in zero which cannot be translated
31818 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
31819 if (I->getOpcode() == Instruction::Shl) {
31820 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31821 // -X` and some other provable power of 2 patterns that we can use CTZ on
31822 // may be profitable.
31823 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31824 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31825 // be provably a non-zero power of 2.
31826 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31827 // transformable to bittest.
31828 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31829 if (!ShiftVal)
31830 return {nullptr, UndefBit};
31831 if (ShiftVal->equalsInt(1))
31832 BTK = Not ? NotShiftBit : ShiftBit;
31833
31834 if (BTK == UndefBit)
31835 return {nullptr, UndefBit};
31836
31837 Value *BitV = I->getOperand(1);
31838
31839 // Read past a shiftmask instruction to find count
31840 Value *AndOp;
31841 uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1;
31842 if (match(BitV, m_c_And(m_Value(AndOp), m_SpecificInt(ShiftMask))))
31843 BitV = AndOp;
31844
31845 return {BitV, BTK};
31846 }
31847 }
31848 return {nullptr, UndefBit};
31849}
31850
31852X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31853 using namespace llvm::PatternMatch;
31854 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31855 // prefix to a normal instruction for these operations.
31856 if (AI->use_empty())
31857 return AtomicExpansionKind::None;
31858
31859 if (AI->getOperation() == AtomicRMWInst::Xor) {
31860 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
31861 // preferable to both `cmpxchg` and `btc`.
31862 if (match(AI->getOperand(1), m_SignMask()))
31864 }
31865
31866 // If the atomicrmw's result is used by a single bit AND, we may use
31867 // bts/btr/btc instruction for these operations.
31868 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31869 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31870 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31871 // detect it.
31872 Instruction *I = AI->user_back();
31873 auto BitChange = FindSingleBitChange(AI->getValOperand());
31874 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31875 I->getOpcode() != Instruction::And ||
31876 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31877 AI->getParent() != I->getParent())
31878 return AtomicExpansionKind::CmpXChg;
31879
31880 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31881
31882 // This is a redundant AND, it should get cleaned up elsewhere.
31883 if (AI == I->getOperand(OtherIdx))
31884 return AtomicExpansionKind::CmpXChg;
31885
31886 // The following instruction must be an AND with a single bit.
31887 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31888 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31889 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31890 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31891 return AtomicExpansionKind::CmpXChg;
31892 }
31893 if (AI->getOperation() == AtomicRMWInst::And) {
31894 return ~C1->getValue() == C2->getValue()
31895 ? AtomicExpansionKind::BitTestIntrinsic
31896 : AtomicExpansionKind::CmpXChg;
31897 }
31898 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31899 : AtomicExpansionKind::CmpXChg;
31900 }
31901
31902 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31903
31904 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31905 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31906 return AtomicExpansionKind::CmpXChg;
31907
31908 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31909
31910 // If shift amounts are not the same we can't use BitTestIntrinsic.
31911 if (BitChange.first != BitTested.first)
31912 return AtomicExpansionKind::CmpXChg;
31913
31914 // For atomic AND, the RMW operand must be clearing exactly one bit (i.e.
31915 // ~(1 << x)) and the AND must be testing that same bit (1 << x).
31916 if (AI->getOperation() == AtomicRMWInst::And)
31917 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31918 ? AtomicExpansionKind::BitTestIntrinsic
31919 : AtomicExpansionKind::CmpXChg;
31920
31921 // For atomic XOR/OR, the RMW must be setting/toggling and testing the same bit.
31922 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31923 ? AtomicExpansionKind::BitTestIntrinsic
31924 : AtomicExpansionKind::CmpXChg;
31925}
31926
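// Sketch of the transform performed below (illustrative IR): a pattern such as
//   %old = atomicrmw or ptr %p, i32 %mask   ; %mask == (1 << %b)
//   %res = and i32 %old, %mask
// is rewritten to a call to the x86_atomic_bts_rm intrinsic on (%p, %b), which
// lowers to a `lock bts` that leaves the original bit value in CF.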
31927void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31928 IRBuilder<> Builder(AI);
31929 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31930 Intrinsic::ID IID_C;
31931 Intrinsic::ID IID_I;
31932 switch (AI->getOperation()) {
31933 default:
31934 llvm_unreachable("Unknown atomic operation");
31935 case AtomicRMWInst::Or:
31936 IID_C = Intrinsic::x86_atomic_bts;
31937 IID_I = Intrinsic::x86_atomic_bts_rm;
31938 break;
31939 case AtomicRMWInst::Xor:
31940 IID_C = Intrinsic::x86_atomic_btc;
31941 IID_I = Intrinsic::x86_atomic_btc_rm;
31942 break;
31943 case AtomicRMWInst::And:
31944 IID_C = Intrinsic::x86_atomic_btr;
31945 IID_I = Intrinsic::x86_atomic_btr_rm;
31946 break;
31947 }
31948 Instruction *I = AI->user_back();
31949 LLVMContext &Ctx = AI->getContext();
31950 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31951 PointerType::getUnqual(Ctx));
31952 Value *Result = nullptr;
31953 auto BitTested = FindSingleBitChange(AI->getValOperand());
31954 assert(BitTested.first != nullptr);
31955
31956 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31957 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31958
31959 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31960 Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
31961 {Addr, Builder.getInt8(Imm)});
31962 } else {
31963 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31964
31965 Value *SI = BitTested.first;
31966 assert(SI != nullptr);
31967
31968 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we
31969 // need to mask it.
31970 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31971 Value *BitPos =
31972 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31973 // Todo(1): In many cases it may be provable that SI is less than
31974 // ShiftBits in which case this mask is unnecessary
31975 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31976 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31977 // favor of just a raw BT{S|R|C}.
31978
31979 Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
31980 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31981
31982 // If the result is only used for zero/non-zero status then we don't need
31983 // to shift the value back. Otherwise do so.
31984 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31985 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31986 if (ICmp->isEquality()) {
31987 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31988 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31989 if (C0 || C1) {
31990 assert(C0 == nullptr || C1 == nullptr);
31991 if ((C0 ? C0 : C1)->isZero())
31992 continue;
31993 }
31994 }
31995 }
31996 Result = Builder.CreateShl(Result, BitPos);
31997 break;
31998 }
31999 }
32000
32001 I->replaceAllUsesWith(Result);
32002 I->eraseFromParent();
32003 AI->eraseFromParent();
32004}
32005
32006static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
32007 using namespace llvm::PatternMatch;
32008 if (!AI->hasOneUse())
32009 return false;
32010
32011 Value *Op = AI->getOperand(1);
32012 CmpPredicate Pred;
32013 Instruction *I = AI->user_back();
32014 AtomicRMWInst::BinOp Opc = AI->getOperation();
32015 if (Opc == AtomicRMWInst::Add) {
32016 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
32017 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32018 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
32019 if (match(I->user_back(),
32021 return true;
32022 if (match(I->user_back(),
32024 return true;
32025 }
32026 return false;
32027 }
32028 if (Opc == AtomicRMWInst::Sub) {
32029 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32030 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32031 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32032 if (match(I->user_back(),
32034 return true;
32035 if (match(I->user_back(),
32037 return true;
32038 }
32039 return false;
32040 }
32041 if ((Opc == AtomicRMWInst::Or &&
32043 (Opc == AtomicRMWInst::And &&
32045 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32046 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32047 Pred == CmpInst::ICMP_SLT;
32048 if (match(I->user_back(),
32050 return true;
32051 return false;
32052 }
32053 if (Opc == AtomicRMWInst::Xor) {
32054 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32055 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32056 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32057 if (match(I->user_back(),
32059 return true;
32060 if (match(I->user_back(),
32062 return true;
32063 }
32064 return false;
32065 }
32066
32067 return false;
32068}
32069
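// Sketch of the transform performed below (illustrative IR): a pattern such as
//   %old = atomicrmw sub ptr %p, i32 %v
//   %cmp = icmp eq i32 %old, %v
// becomes a call to the x86_atomic_sub_cc intrinsic with X86::COND_E, i.e. a
// `lock sub` whose flags feed the comparison directly.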
32070void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32071 AtomicRMWInst *AI) const {
32072 IRBuilder<> Builder(AI);
32073 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32074 Instruction *TempI = nullptr;
32075 LLVMContext &Ctx = AI->getContext();
32076 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32077 if (!ICI) {
32078 TempI = AI->user_back();
32079 assert(TempI->hasOneUse() && "Must have one use");
32080 ICI = cast<ICmpInst>(TempI->user_back());
32081 }
32082 X86::CondCode CC;
32083 ICmpInst::Predicate Pred = ICI->getPredicate();
32084 switch (Pred) {
32085 default:
32086 llvm_unreachable("Not supported Pred");
32087 case CmpInst::ICMP_EQ:
32088 CC = X86::COND_E;
32089 break;
32090 case CmpInst::ICMP_NE:
32091 CC = X86::COND_NE;
32092 break;
32093 case CmpInst::ICMP_SLT:
32094 CC = X86::COND_S;
32095 break;
32096 case CmpInst::ICMP_SGT:
32097 CC = X86::COND_NS;
32098 break;
32099 }
32100 Intrinsic::ID IID;
32101 switch (AI->getOperation()) {
32102 default:
32103 llvm_unreachable("Unknown atomic operation");
32104 case AtomicRMWInst::Add:
32105 IID = Intrinsic::x86_atomic_add_cc;
32106 break;
32107 case AtomicRMWInst::Sub:
32108 IID = Intrinsic::x86_atomic_sub_cc;
32109 break;
32110 case AtomicRMWInst::Or:
32111 IID = Intrinsic::x86_atomic_or_cc;
32112 break;
32113 case AtomicRMWInst::And:
32114 IID = Intrinsic::x86_atomic_and_cc;
32115 break;
32116 case AtomicRMWInst::Xor:
32117 IID = Intrinsic::x86_atomic_xor_cc;
32118 break;
32119 }
32120 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32121 PointerType::getUnqual(Ctx));
32122 Value *Call = Builder.CreateIntrinsic(
32123 IID, AI->getType(),
32124 {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32125 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32126 ICI->replaceAllUsesWith(Result);
32127 ICI->eraseFromParent();
32128 if (TempI)
32129 TempI->eraseFromParent();
32130 AI->eraseFromParent();
32131}
32132
32133TargetLowering::AtomicExpansionKind
32134X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32135 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32136 Type *MemType = AI->getType();
32137
32138 // If the operand is too big, we must see if cmpxchg8/16b is available
32139 // and default to library calls otherwise.
32140 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32141 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32142 : AtomicExpansionKind::None;
32143 }
32144
32145 AtomicRMWInst::BinOp Op = AI->getOperation();
32146 switch (Op) {
32149 case AtomicRMWInst::Add:
32150 case AtomicRMWInst::Sub:
32153 // It's better to use xadd, xsub or xchg for these in other cases.
32155 case AtomicRMWInst::Or:
32156 case AtomicRMWInst::And:
32157 case AtomicRMWInst::Xor:
32160 return shouldExpandLogicAtomicRMWInIR(AI);
32162 case AtomicRMWInst::Max:
32163 case AtomicRMWInst::Min:
32174 default:
32175 // These always require a non-trivial set of data operations on x86. We must
32176 // use a cmpxchg loop.
32177 return AtomicExpansionKind::CmpXChg;
32178 }
32179}
32180
32181LoadInst *
32182X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32183 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32184 Type *MemType = AI->getType();
32185 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32186 // there is no benefit in turning such RMWs into loads, and it is actually
32187 // harmful as it introduces an mfence.
32188 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32189 return nullptr;
32190
32191 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32192 // lowering available in lowerAtomicArith.
32193 // TODO: push more cases through this path.
32194 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32195 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32196 AI->use_empty())
32197 return nullptr;
32198
32199 IRBuilder<> Builder(AI);
32200 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32201 auto SSID = AI->getSyncScopeID();
32202 // We must restrict the ordering to avoid generating loads with Release or
32203 // ReleaseAcquire orderings.
32204 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32205
32206 // Before the load we need a fence. Here is an example lifted from
32207 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32208 // is required:
32209 // Thread 0:
32210 // x.store(1, relaxed);
32211 // r1 = y.fetch_add(0, release);
32212 // Thread 1:
32213 // y.fetch_add(42, acquire);
32214 // r2 = x.load(relaxed);
32215 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32216 // lowered to just a load without a fence. A mfence flushes the store buffer,
32217 // making the optimization clearly correct.
32218 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
32219 // otherwise, we might be able to be more aggressive on relaxed idempotent
32220 // rmw. In practice, they do not look useful, so we don't try to be
32221 // especially clever.
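// For example (illustrative): '%old = atomicrmw or ptr %p, i32 0 seq_cst'
// whose result is used is rewritten to 'fence seq_cst' followed by an atomic
// load of %p with a suitably restricted ordering.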
32222
32223 // Use `fence seq_cst` over `llvm.x64.sse2.mfence` here to get the correct
32224 // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
32225 Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
32226
32227 // Finally we can emit the atomic load.
32228 LoadInst *Loaded = Builder.CreateAlignedLoad(
32229 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32230 Loaded->setAtomic(Order, SSID);
32231 AI->replaceAllUsesWith(Loaded);
32232 AI->eraseFromParent();
32233 return Loaded;
32234}
32235
32236/// Emit a locked operation on a stack location which does not change any
32237/// memory location, but does involve a lock prefix. Location is chosen to be
32238/// a) very likely accessed only by a single thread to minimize cache traffic,
32239/// and b) definitely dereferenceable. Returns the new Chain result.
32240static SDValue emitLockedStackOp(SelectionDAG &DAG,
32241 const X86Subtarget &Subtarget, SDValue Chain,
32242 const SDLoc &DL) {
32243 // Implementation notes:
32244 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32245 // operations issued by the current processor. As such, the location
32246 // referenced is not relevant for the ordering properties of the instruction.
32247 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32248 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32249 // 2) Using an immediate operand appears to be the best encoding choice
32250 // here since it doesn't require an extra register.
32251 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32252 // is small enough it might just be measurement noise.)
32253 // 4) When choosing offsets, there are several contributing factors:
32254 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32255 // line aligned stack object to improve this case.)
32256 // b) To minimize our chances of introducing a false dependence, we prefer
32257 // to offset the stack usage from TOS slightly.
32258 // c) To minimize concerns about cross thread stack usage - in particular,
32259 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32260 // captures state in the TOS frame and accesses it from many threads -
32261 // we want to use an offset such that the offset is in a distinct cache
32262 // line from the TOS frame.
32263 //
32264 // For a general discussion of the tradeoffs and benchmark results, see:
32265 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
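// The operation emitted below is e.g. `lock orl $0, -64(%rsp)` when a red
// zone is available, or `lock orl $0, (%esp)` on 32-bit targets.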
32266
32267 auto &MF = DAG.getMachineFunction();
32268 auto &TFL = *Subtarget.getFrameLowering();
32269 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32270
32271 if (Subtarget.is64Bit()) {
32272 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32273 SDValue Ops[] = {
32274 DAG.getRegister(X86::RSP, MVT::i64), // Base
32275 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32276 DAG.getRegister(0, MVT::i64), // Index
32277 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32278 DAG.getRegister(0, MVT::i16), // Segment.
32279 Zero,
32280 Chain};
32281 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32282 MVT::Other, Ops);
32283 return SDValue(Res, 1);
32284 }
32285
32286 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32287 SDValue Ops[] = {
32288 DAG.getRegister(X86::ESP, MVT::i32), // Base
32289 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32290 DAG.getRegister(0, MVT::i32), // Index
32291 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32292 DAG.getRegister(0, MVT::i16), // Segment.
32293 Zero,
32294 Chain
32295 };
32296 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32297 MVT::Other, Ops);
32298 return SDValue(Res, 1);
32299}
32300
32301static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32302 SelectionDAG &DAG) {
32303 SDLoc dl(Op);
32304 AtomicOrdering FenceOrdering =
32305 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32306 SyncScope::ID FenceSSID =
32307 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32308
32309 // The only fence that needs an instruction is a sequentially-consistent
32310 // cross-thread fence.
32311 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32312 FenceSSID == SyncScope::System) {
32313 if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
32314 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32315
32316 SDValue Chain = Op.getOperand(0);
32317 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32318 }
32319
32320 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32321 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32322}
32323
32324static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32325 SelectionDAG &DAG) {
32326 MVT T = Op.getSimpleValueType();
32327 SDLoc DL(Op);
32328 unsigned Reg = 0;
32329 unsigned size = 0;
32330 switch(T.SimpleTy) {
32331 default: llvm_unreachable("Invalid value type!");
32332 case MVT::i8: Reg = X86::AL; size = 1; break;
32333 case MVT::i16: Reg = X86::AX; size = 2; break;
32334 case MVT::i32: Reg = X86::EAX; size = 4; break;
32335 case MVT::i64:
32336 assert(Subtarget.is64Bit() && "Node not type legal!");
32337 Reg = X86::RAX; size = 8;
32338 break;
32339 }
32340 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32341 Op.getOperand(2), SDValue());
32342 SDValue Ops[] = { cpIn.getValue(0),
32343 Op.getOperand(1),
32344 Op.getOperand(3),
32345 DAG.getTargetConstant(size, DL, MVT::i8),
32346 cpIn.getValue(1) };
32347 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32348 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32349 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32350 Ops, T, MMO);
32351
32352 SDValue cpOut =
32353 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32354 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32355 MVT::i32, cpOut.getValue(2));
32356 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32357
32358 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32359 cpOut, Success, EFLAGS.getValue(1));
32360}
32361
32362// Create MOVMSKB, taking into account whether we need to split for AVX1.
32363static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32364 const X86Subtarget &Subtarget) {
32365 MVT InVT = V.getSimpleValueType();
32366
32367 if (InVT == MVT::v64i8) {
32368 SDValue Lo, Hi;
32369 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32370 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32371 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32372 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32373 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32374 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32375 DAG.getConstant(32, DL, MVT::i8));
32376 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32377 }
32378 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32379 SDValue Lo, Hi;
32380 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32381 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32382 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32383 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32384 DAG.getConstant(16, DL, MVT::i8));
32385 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32386 }
32387
32388 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32389}
32390
32391static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32392 SelectionDAG &DAG) {
32393 SDValue Src = Op.getOperand(0);
32394 MVT SrcVT = Src.getSimpleValueType();
32395 MVT DstVT = Op.getSimpleValueType();
32396
32397 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32398 // half to v32i1 and concatenating the result.
32399 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32400 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32401 assert(Subtarget.hasBWI() && "Expected BWI target");
32402 SDLoc dl(Op);
32403 SDValue Lo, Hi;
32404 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32405 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32406 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32407 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32408 }
32409
32410 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32411 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32412 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32413 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32414 SDLoc DL(Op);
32415 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32416 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32417 return DAG.getZExtOrTrunc(V, DL, DstVT);
32418 }
32419
32420 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32421 SrcVT == MVT::i64) && "Unexpected VT!");
32422
32423 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32424 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32425 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32426 // This conversion needs to be expanded.
32427 return SDValue();
32428
32429 SDLoc dl(Op);
32430 if (SrcVT.isVector()) {
32431 // Widen the input vector in the case of MVT::v2i32.
32432 // Example: from MVT::v2i32 to MVT::v4i32.
32433 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32434 SrcVT.getVectorNumElements() * 2);
32435 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32436 DAG.getUNDEF(SrcVT));
32437 } else {
32438 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32439 "Unexpected source type in LowerBITCAST");
32440 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32441 }
32442
32443 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32444 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32445
32446 if (DstVT == MVT::x86mmx)
32447 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32448
32449 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32450 DAG.getVectorIdxConstant(0, dl));
32451}
32452
32453/// Compute the horizontal sum of bytes in V for the elements of VT.
32454///
32455/// Requires V to be a byte vector and VT to be an integer vector type with
32456/// wider elements than V's type. The width of the elements of VT determines
32457/// how many bytes of V are summed horizontally to produce each element of the
32458/// result.
32459static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32460 const X86Subtarget &Subtarget,
32461 SelectionDAG &DAG) {
32462 SDLoc DL(V);
32463 MVT ByteVecVT = V.getSimpleValueType();
32464 MVT EltVT = VT.getVectorElementType();
32465 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32466 "Expected value to have byte element type.");
32467 assert(EltVT != MVT::i8 &&
32468 "Horizontal byte sum only makes sense for wider elements!");
32469 unsigned VecSize = VT.getSizeInBits();
32470 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32471
32472 // The PSADBW instruction horizontally adds all bytes and leaves the result
32473 // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
32474 if (EltVT == MVT::i64) {
32475 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32476 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32477 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32478 return DAG.getBitcast(VT, V);
32479 }
32480
32481 if (EltVT == MVT::i32) {
32482 // We unpack the low half and high half into i32s interleaved with zeros so
32483 // that we can use PSADBW to horizontally sum them. The most useful part of
32484 // this is that it lines up the results of two PSADBW instructions to be
32485 // two v2i64 vectors which concatenated are the 4 population counts. We can
32486 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32487 SDValue Zeros = DAG.getConstant(0, DL, VT);
32488 SDValue V32 = DAG.getBitcast(VT, V);
32489 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32490 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32491
32492 // Do the horizontal sums into two v2i64s.
32493 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32494 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32495 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32496 DAG.getBitcast(ByteVecVT, Low), Zeros);
32497 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32498 DAG.getBitcast(ByteVecVT, High), Zeros);
32499
32500 // Merge them together.
32501 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32502 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32503 DAG.getBitcast(ShortVecVT, Low),
32504 DAG.getBitcast(ShortVecVT, High));
32505
32506 return DAG.getBitcast(VT, V);
32507 }
32508
32509 // The only element type left is i16.
32510 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32511
32512 // To obtain pop count for each i16 element starting from the pop count for
32513 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
32514 // right by 8. It is important to shift as i16s as i8 vector shift isn't
32515 // directly supported.
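// Worked example (illustrative): if an i16 element's low and high bytes hold
// byte counts a and b, the shl by 8 places a in the high byte, the byte-wise
// add turns that high byte into a + b, and the final srl by 8 leaves the i16
// pop count a + b.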
32516 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32517 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32518 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32519 DAG.getBitcast(ByteVecVT, V));
32520 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32521}
32522
32523static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32524 const X86Subtarget &Subtarget,
32525 SelectionDAG &DAG) {
32526 MVT VT = Op.getSimpleValueType();
32527 MVT EltVT = VT.getVectorElementType();
32528 int NumElts = VT.getVectorNumElements();
32529 (void)EltVT;
32530 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32531
32532 // Implement a lookup table in register by using an algorithm based on:
32533 // http://wm.ite.pl/articles/sse-popcount.html
32534 //
32535 // The general idea is that every lower byte nibble in the input vector is an
32536 // index into an in-register pre-computed pop count table. We then split up
32537 // the input vector into two new ones: (1) a vector with only the shifted-right
32538 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32539 // masked out higher ones) for each byte. PSHUFB is used separately with both
32540 // to index the in-register table. Next, both are added and the result is an
32541 // i8 vector where each element contains the pop count for its input byte.
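// Worked example (illustrative): for the byte 0xB5, the high nibble 0xB maps
// to LUT[0xB] == 3, the low nibble 0x5 maps to LUT[0x5] == 2, and their sum
// is popcount(0xB5) == 5.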
32542 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32543 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32544 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32545 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32546
32547 SmallVector<SDValue, 64> LUTVec;
32548 for (int i = 0; i < NumElts; ++i)
32549 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32550 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32551 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32552
32553 // High nibbles
32554 SDValue FourV = DAG.getConstant(4, DL, VT);
32555 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32556
32557 // Low nibbles
32558 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32559
32560 // The input vector is used as the shuffle mask that indexes elements into the
32561 // LUT. After counting low and high nibbles, add the vector to obtain the
32562 // final pop count per i8 element.
32563 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32564 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32565 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32566}
32567
32568// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32569// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32570static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
32571 const X86Subtarget &Subtarget,
32572 SelectionDAG &DAG) {
32573 MVT VT = Op.getSimpleValueType();
32574 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32575 "Unknown CTPOP type to handle");
32576 SDValue Op0 = Op.getOperand(0);
32577
32578 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32579 if (Subtarget.hasVPOPCNTDQ()) {
32580 unsigned NumElems = VT.getVectorNumElements();
32581 assert((VT.getVectorElementType() == MVT::i8 ||
32582 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32583 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32584 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32585 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32586 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32587 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32588 }
32589 }
32590
32591 // Decompose 256-bit ops into smaller 128-bit ops.
32592 if (VT.is256BitVector() && !Subtarget.hasInt256())
32593 return splitVectorIntUnary(Op, DAG, DL);
32594
32595 // Decompose 512-bit ops into smaller 256-bit ops.
32596 if (VT.is512BitVector() && !Subtarget.hasBWI())
32597 return splitVectorIntUnary(Op, DAG, DL);
32598
32599 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32600 if (VT.getScalarType() != MVT::i8) {
32601 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32602 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32603 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32604 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32605 }
32606
32607 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32608 if (!Subtarget.hasSSSE3())
32609 return SDValue();
32610
32611 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32612}
32613
32614static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
32615 SelectionDAG &DAG) {
32616 MVT VT = N.getSimpleValueType();
32617 SDValue Op = N.getOperand(0);
32618 SDLoc DL(N);
32619
32620 if (VT.isScalarInteger()) {
32621 // Compute the lower/upper bounds of the active bits of the value,
32622 // allowing us to shift the active bits down if necessary to fit into the
32623 // special cases below.
32624 KnownBits Known = DAG.computeKnownBits(Op);
32625 if (Known.isConstant())
32626 return DAG.getConstant(Known.getConstant().popcount(), DL, VT);
32627 unsigned LZ = Known.countMinLeadingZeros();
32628 unsigned TZ = Known.countMinTrailingZeros();
32629 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
32630 unsigned ActiveBits = Known.getBitWidth() - LZ;
32631 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
32632
32633 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
32634 if (ShiftedActiveBits <= 2) {
32635 if (ActiveBits > 2)
32636 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32637 DAG.getShiftAmountConstant(TZ, VT, DL));
32638 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32639 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
32640 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32641 DAG.getShiftAmountConstant(1, VT, DL)));
32642 return DAG.getZExtOrTrunc(Op, DL, VT);
32643 }
32644
32645 // i3 CTPOP - perform LUT into i32 integer.
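// The 16-bit constant below packs the 2-bit pop counts of the values 0..7,
// storing popcount(i) at bit 2*i; the shl by 1 turns the input into that bit
// index before the LUT is shifted down and masked.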
32646 if (ShiftedActiveBits <= 3) {
32647 if (ActiveBits > 3)
32648 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32649 DAG.getShiftAmountConstant(TZ, VT, DL));
32650 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32651 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
32652 DAG.getShiftAmountConstant(1, VT, DL));
32653 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
32654 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
32655 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
32656 DAG.getConstant(0x3, DL, MVT::i32));
32657 return DAG.getZExtOrTrunc(Op, DL, VT);
32658 }
32659
32660 // i4 CTPOP - perform LUT into i64 integer.
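// The 64-bit constant below stores popcount(i) in nibble i for i in 0..15;
// the multiply by 4 turns the input into the bit index of its nibble, e.g.
// popcount(0xB) == 3 is read from nibble 11 of the constant.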
32661 if (ShiftedActiveBits <= 4 &&
32662 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
32663 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
32664 if (ActiveBits > 4)
32665 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32666 DAG.getShiftAmountConstant(TZ, VT, DL));
32667 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32668 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32669 DAG.getConstant(4, DL, MVT::i32));
32670 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
32671 DAG.getShiftAmountOperand(MVT::i64, Op));
32672 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
32673 DAG.getConstant(0x7, DL, MVT::i64));
32674 return DAG.getZExtOrTrunc(Op, DL, VT);
32675 }
32676
32677 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
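// Sketch of the trick: (x * 0x08040201) >> 3 spreads the eight bits of x into
// separate nibbles, the AND with 0x11111111 keeps one bit per nibble, and the
// second multiply sums those nibbles into the top nibble, which the final
// shift by 28 extracts (the sum is at most 8, so it cannot overflow a nibble).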
32678 if (ShiftedActiveBits <= 8) {
32679 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
32680 if (ActiveBits > 8)
32681 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32682 DAG.getShiftAmountConstant(TZ, VT, DL));
32683 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32684 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32685 DAG.getConstant(0x08040201U, DL, MVT::i32));
32686 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32687 DAG.getShiftAmountConstant(3, MVT::i32, DL));
32688 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
32689 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
32690 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32691 DAG.getShiftAmountConstant(28, MVT::i32, DL));
32692 return DAG.getZExtOrTrunc(Op, DL, VT);
32693 }
32694
32695 return SDValue(); // fallback to generic expansion.
32696 }
32697
32698 assert(VT.isVector() &&
32699 "We only do custom lowering for vector population count.");
32700 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
32701}
32702
32703static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32704 MVT VT = Op.getSimpleValueType();
32705 SDValue In = Op.getOperand(0);
32706 SDLoc DL(Op);
32707
32708 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32709 // perform the BITREVERSE.
32710 if (!VT.isVector()) {
32711 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32712 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32713 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32714 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32715 DAG.getVectorIdxConstant(0, DL));
32716 }
32717
32718 int NumElts = VT.getVectorNumElements();
32719 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32720
32721 // Decompose 256-bit ops into smaller 128-bit ops.
32722 if (VT.is256BitVector())
32723 return splitVectorIntUnary(Op, DAG, DL);
32724
32725 assert(VT.is128BitVector() &&
32726 "Only 128-bit vector bitreverse lowering supported.");
32727
32728 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32729 // perform the BSWAP in the shuffle.
32730 // It's best to shuffle using the second operand as this will implicitly allow
32731 // memory folding for multiple vectors.
32732 SmallVector<SDValue, 16> MaskElts;
32733 for (int i = 0; i != NumElts; ++i) {
32734 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32735 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32736 int PermuteByte = SourceByte | (2 << 5);
32737 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32738 }
32739 }
32740
32741 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32742 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32743 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32744 Res, Mask);
32745 return DAG.getBitcast(VT, Res);
32746}
32747
32748static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
32749 SelectionDAG &DAG) {
32750 MVT VT = Op.getSimpleValueType();
32751
32752 if (Subtarget.hasXOP() && !VT.is512BitVector())
32753 return LowerBITREVERSE_XOP(Op, DAG);
32754
32755 assert((Subtarget.hasSSSE3() || Subtarget.hasGFNI()) &&
32756 "SSSE3 or GFNI required for BITREVERSE");
32757
32758 SDValue In = Op.getOperand(0);
32759 SDLoc DL(Op);
32760
32761 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
32762 if (VT.is512BitVector() && !Subtarget.hasBWI())
32763 return splitVectorIntUnary(Op, DAG, DL);
32764
32765 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32766 if (VT.is256BitVector() && !Subtarget.hasInt256())
32767 return splitVectorIntUnary(Op, DAG, DL);
32768
32769 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
32770 if (!VT.isVector()) {
32771 assert(
32772 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
32773 "Only tested for i8/i16/i32/i64");
32774 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32775 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32776 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
32777 DAG.getBitcast(MVT::v16i8, Res));
32778 Res =
32779 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(VecVT, Res),
32780 DAG.getVectorIdxConstant(0, DL));
32781 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
32782 }
32783
32784 assert(VT.isVector() && VT.getSizeInBits() >= 128);
32785
32786 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
32787 if (VT.getScalarType() != MVT::i8) {
32788 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32789 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
32790 Res = DAG.getBitcast(ByteVT, Res);
32791 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
32792 return DAG.getBitcast(VT, Res);
32793 }
32794 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
32795 "Only byte vector BITREVERSE supported");
32796
32797 unsigned NumElts = VT.getVectorNumElements();
32798
32799 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32800 if (Subtarget.hasGFNI()) {
32801 SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT);
32802 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32803 DAG.getTargetConstant(0, DL, MVT::i8));
32804 }
32805
32806 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into its two
32807 // nibbles, and a PSHUFB lookup finds the bit-reverse of each 0-15 value
32808 // (moved to the other nibble).
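// Worked example (illustrative): for the byte 0x2D, LoLUT[0xD] == 0xB0 and
// HiLUT[0x2] == 0x04, so the OR gives 0xB4 == bitreverse(0x2D).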
32809 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32810 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32811 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32812
32813 const int LoLUT[16] = {
32814 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32815 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32816 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32817 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32818 const int HiLUT[16] = {
32819 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32820 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32821 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32822 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
32823
32824 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32825 for (unsigned i = 0; i < NumElts; ++i) {
32826 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32827 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32828 }
32829
32830 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32831 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32832 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32833 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32834 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32835}
32836
32837static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32838 SelectionDAG &DAG) {
32839 SDLoc DL(Op);
32840 SDValue X = Op.getOperand(0);
32841 MVT VT = Op.getSimpleValueType();
32842
32843 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32844 if (VT == MVT::i8 ||
32845 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
32846 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32847 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32848 DAG.getConstant(0, DL, MVT::i8));
32849 // Copy the inverse of the parity flag into a register with setcc.
32850 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32851 // Extend to the original type.
32852 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32853 }
32854
32855 // If we have POPCNT, use the default expansion.
32856 if (Subtarget.hasPOPCNT())
32857 return SDValue();
32858
32859 if (VT == MVT::i64) {
32860 // Xor the high and low 32-bits together using a 32-bit operation.
32861 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32862 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32863 DAG.getConstant(32, DL, MVT::i8)));
32864 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32865 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32866 }
32867
32868 if (VT != MVT::i16) {
32869 // Xor the high and low 16-bits together using a 32-bit operation.
32870 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32871 DAG.getConstant(16, DL, MVT::i8));
32872 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32873 } else {
32874 // If the input is 16-bits, we need to extend to use an i32 shift below.
32875 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32876 }
32877
32878 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
32879 // This should allow an h-reg to be used to save a shift.
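  // (The x86 parity flag reflects only the low 8 bits of a result, so once the
  // value has been folded down to a byte, SETNP yields 1 exactly when an odd
  // number of bits are set.)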
32880 SDValue Hi = DAG.getNode(
32881 ISD::TRUNCATE, DL, MVT::i8,
32882 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32883 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32884 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32885 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32886
32887 // Copy the inverse of the parity flag into a register with setcc.
32888 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32889 // Extend to the original type.
32890 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32891}
32892
32893static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
32894 const X86Subtarget &Subtarget) {
32895 unsigned NewOpc = 0;
32896 switch (N->getOpcode()) {
32897 case ISD::ATOMIC_LOAD_ADD:
32898 NewOpc = X86ISD::LADD;
32899 break;
32900 case ISD::ATOMIC_LOAD_SUB:
32901 NewOpc = X86ISD::LSUB;
32902 break;
32903 case ISD::ATOMIC_LOAD_OR:
32904 NewOpc = X86ISD::LOR;
32905 break;
32906 case ISD::ATOMIC_LOAD_XOR:
32907 NewOpc = X86ISD::LXOR;
32908 break;
32909 case ISD::ATOMIC_LOAD_AND:
32910 NewOpc = X86ISD::LAND;
32911 break;
32912 default:
32913 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32914 }
32915
32916 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32917
32918 return DAG.getMemIntrinsicNode(
32919 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32920 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32921 /*MemVT=*/N->getSimpleValueType(0), MMO);
32922}
32923
32924/// Lower atomic_load_ops into LOCK-prefixed operations.
32925static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
32926 const X86Subtarget &Subtarget) {
32927 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32928 SDValue Chain = N->getOperand(0);
32929 SDValue LHS = N->getOperand(1);
32930 SDValue RHS = N->getOperand(2);
32931 unsigned Opc = N->getOpcode();
32932 MVT VT = N->getSimpleValueType(0);
32933 SDLoc DL(N);
32934
32935 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32936 // can only be lowered when the result is unused. They should have already
32937 // been transformed into a cmpxchg loop in AtomicExpand.
32938 if (N->hasAnyUseOfValue(0)) {
32939 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32940 // select LXADD if LOCK_SUB can't be selected.
32941 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
32942 // can use LXADD as opposed to cmpxchg.
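  // (Adding the minimum signed value is equivalent to XOR-ing it: the carry
  // out of the sign bit is discarded, and that constant is its own negation,
  // so the getNegative below is harmless for the XOR case.)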
32943 if (Opc == ISD::ATOMIC_LOAD_SUB ||
32944 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
32945 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32946 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
32947
32948 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
32949 "Used AtomicRMW ops other than Add should have been expanded!");
32950 return N;
32951 }
32952
32953 // Specialized lowering for the canonical form of an idempotent atomicrmw.
32954 // The core idea here is that since the memory location isn't actually
32955 // changing, all we need is a lowering for the *ordering* impacts of the
32956 // atomicrmw. As such, we can choose a different operation and memory
32957 // location to minimize impact on other code.
32958 // The above holds unless the node is marked volatile in which
32959 // case it needs to be preserved according to the langref.
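  // For example, "atomicrmw or ptr %p, i32 0 seq_cst" leaves memory unchanged;
  // only its ordering effect needs to be honored.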
32960 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
32961 // On X86, the only ordering which actually requires an instruction is
32962 // seq_cst which isn't SingleThread, everything just needs to be preserved
32963 // during codegen and then dropped. Note that we expect (but don't assume),
32964 // that orderings other than seq_cst and acq_rel have been canonicalized to
32965 // a store or load.
32966 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
32967 AN->getSyncScopeID() == SyncScope::System) {
32968 // Prefer a locked operation against a stack location to minimize cache
32969 // traffic. This assumes that stack locations are very likely to be
32970 // accessed only by the owning thread.
32971 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32972 assert(!N->hasAnyUseOfValue(0));
32973 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32974 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32975 DAG.getUNDEF(VT), NewChain);
32976 }
32977 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32978 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32979 assert(!N->hasAnyUseOfValue(0));
32980 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32981 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32982 DAG.getUNDEF(VT), NewChain);
32983 }
32984
32985 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32986 // RAUW the chain, but don't worry about the result, as it's unused.
32987 assert(!N->hasAnyUseOfValue(0));
32988 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32989 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32990 DAG.getUNDEF(VT), LockOp.getValue(1));
32991}
32992
32993static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
32994 const X86Subtarget &Subtarget) {
32995 auto *Node = cast<AtomicSDNode>(Op.getNode());
32996 SDLoc dl(Node);
32997 EVT VT = Node->getMemoryVT();
32998
32999 bool IsSeqCst =
33000 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
33001 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
33002
33003 // If this store is not sequentially consistent and the type is legal
33004 // we can just keep it.
33005 if (!IsSeqCst && IsTypeLegal)
33006 return Op;
33007
33008 if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
33009 !DAG.getMachineFunction().getFunction().hasFnAttribute(
33010 Attribute::NoImplicitFloat)) {
33011 SDValue Chain;
33012 // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
33013 // vector store.
33014 if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
33015 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
33016 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
33017 Node->getMemOperand());
33018 }
33019
33020 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
33021 // is enabled.
33022 if (VT == MVT::i64) {
33023 if (Subtarget.hasSSE1()) {
33024 SDValue SclToVec =
33025 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
33026 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33027 SclToVec = DAG.getBitcast(StVT, SclToVec);
33028 SDVTList Tys = DAG.getVTList(MVT::Other);
33029 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33030 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33031 MVT::i64, Node->getMemOperand());
33032 } else if (Subtarget.hasX87()) {
33033 // First load this into an 80-bit X87 register using a stack temporary.
33034 // This will put the whole integer into the significand.
33035 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33036 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33037 MachinePointerInfo MPI =
33038 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33039 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
33040 MPI);
33041 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33042 SDValue LdOps[] = {Chain, StackPtr};
33043 SDValue Value = DAG.getMemIntrinsicNode(
33044 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33045 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33046 Chain = Value.getValue(1);
33047
33048 // Now use an FIST to do the atomic store.
33049 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33050 Chain =
33051 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33052 StoreOps, MVT::i64, Node->getMemOperand());
33053 }
33054 }
33055
33056 if (Chain) {
33057 // If this is a sequentially consistent store, also emit an appropriate
33058 // barrier.
33059 if (IsSeqCst)
33060 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33061
33062 return Chain;
33063 }
33064 }
33065
33066 // Convert seq_cst store -> xchg
33067 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33068 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
33069 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
33070 Node->getOperand(0), Node->getOperand(2),
33071 Node->getOperand(1), Node->getMemOperand());
33072 return Swap.getValue(1);
33073}
33074
33075static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
33076 SDNode *N = Op.getNode();
33077 MVT VT = N->getSimpleValueType(0);
33078 unsigned Opc = Op.getOpcode();
33079
33080 // Let legalize expand this if it isn't a legal type yet.
33081 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33082 return SDValue();
33083
33084 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33085 SDLoc DL(N);
33086
33087 // Set the carry flag.
33088 SDValue Carry = Op.getOperand(2);
33089 EVT CarryVT = Carry.getValueType();
33090 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33091 Carry, DAG.getAllOnesConstant(DL, CarryVT));
33092
33093 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
33094 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33095 Op.getOperand(0), Op.getOperand(1),
33096 Carry.getValue(1));
33097
33098 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33099 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33100 Sum.getValue(1), DL, DAG);
33101 if (N->getValueType(1) == MVT::i1)
33102 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33103
33104 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33105}
33106
33107static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33108 SelectionDAG &DAG) {
33109 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33110
33111 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33112 // which returns the values as { float, float } (in XMM0) or
33113 // { double, double } (which is returned in XMM0, XMM1).
33114 SDLoc dl(Op);
33115 SDValue Arg = Op.getOperand(0);
33116 EVT ArgVT = Arg.getValueType();
33117 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33118
33119 TargetLowering::ArgListTy Args;
33120 Args.emplace_back(Arg, ArgTy);
33121
33122 bool isF64 = ArgVT == MVT::f64;
33123 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33124 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33125 // the results are returned via SRet in memory.
33126 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33127 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33128 const char *LibcallName = TLI.getLibcallName(LC);
33129 SDValue Callee =
33130 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33131
33132 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33133 : (Type *)FixedVectorType::get(ArgTy, 4);
33134
33135 TargetLowering::CallLoweringInfo CLI(DAG);
33136 CLI.setDebugLoc(dl)
33137 .setChain(DAG.getEntryNode())
33138 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33139
33140 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33141
33142 if (isF64)
33143 // Returned in xmm0 and xmm1.
33144 return CallResult.first;
33145
33146 // Returned in bits 0:31 and 32:64 xmm0.
33147 SDValue SinVal =
33148 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33149 DAG.getVectorIdxConstant(0, dl));
33150 SDValue CosVal =
33151 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33152 DAG.getVectorIdxConstant(1, dl));
33153 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33154 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33155}
33156
33157/// Widen a vector input to a vector of NVT. The
33158/// input vector must have the same element type as NVT.
33159static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
33160 bool FillWithZeroes = false) {
33161 // Check if InOp already has the right width.
33162 MVT InVT = InOp.getSimpleValueType();
33163 if (InVT == NVT)
33164 return InOp;
33165
33166 if (InOp.isUndef())
33167 return DAG.getUNDEF(NVT);
33168
33170 "input and widen element type must match");
33171
33172 unsigned InNumElts = InVT.getVectorNumElements();
33173 unsigned WidenNumElts = NVT.getVectorNumElements();
33174 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33175 "Unexpected request for vector widening");
33176
33177 SDLoc dl(InOp);
33178 if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
33179 SDValue N1 = InOp.getOperand(1);
33180 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33181 N1.isUndef()) {
33182 InOp = InOp.getOperand(0);
33183 InVT = InOp.getSimpleValueType();
33184 InNumElts = InVT.getVectorNumElements();
33185 }
33186 }
33187 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
33188 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
33189 EVT EltVT = InOp.getOperand(0).getValueType();
33190 SDValue FillVal =
33191 FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT);
33192 SmallVector<SDValue, 16> Ops(InOp->op_begin(), InOp->op_end());
33193 Ops.append(WidenNumElts - InNumElts, FillVal);
33194 return DAG.getBuildVector(NVT, dl, Ops);
33195 }
33196 SDValue FillVal =
33197 FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT);
33198 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp,
33199 DAG.getVectorIdxConstant(0, dl));
33200}
33201
33202static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
33203 SelectionDAG &DAG) {
33204 assert(Subtarget.hasAVX512() &&
33205 "MGATHER/MSCATTER are supported on AVX-512 arch only");
33206
33207 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
33208 SDValue Src = N->getValue();
33209 MVT VT = Src.getSimpleValueType();
33210 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33211 SDLoc dl(Op);
33212
33213 SDValue Scale = N->getScale();
33214 SDValue Index = N->getIndex();
33215 SDValue Mask = N->getMask();
33216 SDValue Chain = N->getChain();
33217 SDValue BasePtr = N->getBasePtr();
33218
33219 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33220 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33221 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33222 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33223 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33224 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33225 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33226 SDVTList VTs = DAG.getVTList(MVT::Other);
33227 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33228 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33229 N->getMemoryVT(), N->getMemOperand());
33230 }
33231 return SDValue();
33232 }
33233
33234 MVT IndexVT = Index.getSimpleValueType();
33235
33236 // If the index is v2i32, we're being called by type legalization and we
33237 // should just let the default handling take care of it.
33238 if (IndexVT == MVT::v2i32)
33239 return SDValue();
33240
33241 // If we don't have VLX and neither the data nor the index is 512 bits, we
33242 // need to widen until one is.
33243 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33244 !Index.getSimpleValueType().is512BitVector()) {
33245 // Determine how much we need to widen by to get a 512-bit type.
33246 unsigned Factor = std::min(512/VT.getSizeInBits(),
33247 512/IndexVT.getSizeInBits());
33248 unsigned NumElts = VT.getVectorNumElements() * Factor;
33249
33250 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33251 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33252 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33253
33254 Src = ExtendToType(Src, VT, DAG);
33255 Index = ExtendToType(Index, IndexVT, DAG);
33256 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33257 }
33258
33259 SDVTList VTs = DAG.getVTList(MVT::Other);
33260 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33261 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33262 N->getMemoryVT(), N->getMemOperand());
33263}
33264
33265static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33266 SelectionDAG &DAG) {
33267
33268 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33269 MVT VT = Op.getSimpleValueType();
33270 MVT ScalarVT = VT.getScalarType();
33271 SDValue Mask = N->getMask();
33272 MVT MaskVT = Mask.getSimpleValueType();
33273 SDValue PassThru = N->getPassThru();
33274 SDLoc dl(Op);
33275
33276 // Handle AVX masked loads which don't support passthru other than 0.
33277 if (MaskVT.getVectorElementType() != MVT::i1) {
33278 // We also allow undef in the isel pattern.
33279 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33280 return Op;
33281
33282 SDValue NewLoad = DAG.getMaskedLoad(
33283 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33284 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33285 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33286 N->isExpandingLoad());
33287 // Emit a blend.
33288 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33289 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33290 }
33291
33292 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33293 "Expanding masked load is supported on AVX-512 target only!");
33294
33295 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33296 "Expanding masked load is supported for 32 and 64-bit types only!");
33297
33298 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33299 "Cannot lower masked load op.");
33300
33301 assert((ScalarVT.getSizeInBits() >= 32 ||
33302 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33303 ScalarVT == MVT::f16))) &&
33304 "Unsupported masked load op.");
33305
33306 // This operation is legal for targets with VLX, but without
33307 // VLX the vector should be widened to 512 bits.
33308 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33309 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33310 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33311
33312 // Mask element has to be i1.
33313 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33314 "Unexpected mask type");
33315
33316 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33317
33318 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33319 SDValue NewLoad = DAG.getMaskedLoad(
33320 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33321 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33322 N->getExtensionType(), N->isExpandingLoad());
33323
33324 SDValue Extract =
33325 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33326 DAG.getVectorIdxConstant(0, dl));
33327 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33328 return DAG.getMergeValues(RetOps, dl);
33329}
33330
33331static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33332 SelectionDAG &DAG) {
33333 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33334 SDValue DataToStore = N->getValue();
33335 MVT VT = DataToStore.getSimpleValueType();
33336 MVT ScalarVT = VT.getScalarType();
33337 SDValue Mask = N->getMask();
33338 SDLoc dl(Op);
33339
33340 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33341 "Expanding masked load is supported on AVX-512 target only!");
33342
33343 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33344 "Expanding masked load is supported for 32 and 64-bit types only!");
33345
33346 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33347 "Cannot lower masked store op.");
33348
33349 assert((ScalarVT.getSizeInBits() >= 32 ||
33350 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33351 ScalarVT == MVT::f16))) &&
33352 "Unsupported masked store op.");
33353
33354 // This operation is legal for targets with VLX, but without
33355 // VLX the vector should be widened to 512 bits.
33356 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33357 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33358
33359 // Mask element has to be i1.
33360 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33361 "Unexpected mask type");
33362
33363 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33364
33365 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33366 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33367 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33368 N->getOffset(), Mask, N->getMemoryVT(),
33369 N->getMemOperand(), N->getAddressingMode(),
33370 N->isTruncatingStore(), N->isCompressingStore());
33371}
33372
33373static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33374 SelectionDAG &DAG) {
33375 assert(Subtarget.hasAVX2() &&
33376 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33377
33378 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33379 SDLoc dl(Op);
33380 MVT VT = Op.getSimpleValueType();
33381 SDValue Index = N->getIndex();
33382 SDValue Mask = N->getMask();
33383 SDValue PassThru = N->getPassThru();
33384 MVT IndexVT = Index.getSimpleValueType();
33385
33386 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33387
33388 // If the index is v2i32, we're being called by type legalization.
33389 if (IndexVT == MVT::v2i32)
33390 return SDValue();
33391
33392 // If we don't have VLX and neither the passthru nor the index is 512 bits, we
33393 // need to widen until one is.
33394 MVT OrigVT = VT;
33395 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33396 !IndexVT.is512BitVector()) {
33397 // Determine how much we need to widen by to get a 512-bit type.
33398 unsigned Factor = std::min(512/VT.getSizeInBits(),
33399 512/IndexVT.getSizeInBits());
33400
33401 unsigned NumElts = VT.getVectorNumElements() * Factor;
33402
33403 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33404 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33405 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33406
33407 PassThru = ExtendToType(PassThru, VT, DAG);
33408 Index = ExtendToType(Index, IndexVT, DAG);
33409 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33410 }
33411
33412 // Break dependency on the data register.
33413 if (PassThru.isUndef())
33414 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33415
33416 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33417 N->getScale() };
33418 SDValue NewGather = DAG.getMemIntrinsicNode(
33419 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33420 N->getMemOperand());
33421 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather,
33422 DAG.getVectorIdxConstant(0, dl));
33423 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33424}
33425
33426static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33427 SDLoc dl(Op);
33428 SDValue Src = Op.getOperand(0);
33429 MVT DstVT = Op.getSimpleValueType();
33430
33431 const AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33432 unsigned SrcAS = N->getSrcAddressSpace();
33433
33434 assert(SrcAS != N->getDestAddressSpace() &&
33435 "addrspacecast must be between different address spaces");
33436
33437 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33438 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33439 } else if (DstVT == MVT::i64) {
33440 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33441 } else if (DstVT == MVT::i32) {
33442 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33443 } else {
33444 report_fatal_error("Bad address space in addrspacecast");
33445 }
33446 return Op;
33447}
33448
33449SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33450 SelectionDAG &DAG) const {
33451 // TODO: Eventually, the lowering of these nodes should be informed by or
33452 // deferred to the GC strategy for the function in which they appear. For
33453 // now, however, they must be lowered to something. Since they are logically
33454 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33455 // require special handling for these nodes), lower them as literal NOOPs for
33456 // the time being.
33457 SmallVector<SDValue, 2> Ops;
33458 Ops.push_back(Op.getOperand(0));
33459 if (Op->getGluedNode())
33460 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33461
33462 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33463 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33464}
33465
33466// Custom split CVTPS2PH with wide types.
33467static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33468 SDLoc dl(Op);
33469 EVT VT = Op.getValueType();
33470 SDValue Lo, Hi;
33471 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33472 EVT LoVT, HiVT;
33473 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33474 SDValue RC = Op.getOperand(1);
33475 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33476 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33477 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33478}
33479
33480static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
33481 SelectionDAG &DAG) {
33482 unsigned IsData = Op.getConstantOperandVal(4);
33483
33484 // We don't support non-data prefetch without PREFETCHI.
33485 // Just preserve the chain.
33486 if (!IsData && !Subtarget.hasPREFETCHI())
33487 return Op.getOperand(0);
33488
33489 return Op;
33490}
33491
33492static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG) {
33493 SDNode *N = Op.getNode();
33494 SDValue Operand = N->getOperand(0);
33495 EVT VT = Operand.getValueType();
33496 SDLoc dl(N);
33497
33498 SDValue One = DAG.getConstantFP(1.0, dl, VT);
33499
33500 // TODO: Fix Crash for bf16 when generating strict_fmul as it
33501 // leads to a error : SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
33502 // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
33503 // promote this operator's result!
33504 SDValue Chain = DAG.getEntryNode();
33505 SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
33506 {Chain, Operand, One});
33507 return StrictFmul;
33508}
33509
33510static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33511 unsigned OpNo) {
33512 const APInt Operand(32, OpNo);
33513 std::string OpNoStr = llvm::toString(Operand, 10, false);
33514 std::string Str(" $");
33515
33516 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33517 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33518
33519 auto I = StringRef::npos;
33520 for (auto &AsmStr : AsmStrs) {
33521 // Match the OpNo string. We should match exactly to avoid matching a
33522 // sub-string, e.g. "$12" contains "$1".
33523 if (AsmStr.ends_with(OpNoStr1))
33524 I = AsmStr.size() - OpNoStr1.size();
33525
33526 // Get the index of operand in AsmStr.
33527 if (I == StringRef::npos)
33528 I = AsmStr.find(OpNoStr1 + ",");
33529 if (I == StringRef::npos)
33530 I = AsmStr.find(OpNoStr2);
33531
33532 if (I == StringRef::npos)
33533 continue;
33534
33535 assert(I > 0 && "Unexpected inline asm string!");
33536 // Remove the operand string and label (if it exists).
33537 // For example:
33538 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33539 // ==>
33540 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33541 // ==>
33542 // "call dword ptr "
33543 auto TmpStr = AsmStr.substr(0, I);
33544 I = TmpStr.rfind(':');
33545 if (I != StringRef::npos)
33546 TmpStr = TmpStr.substr(I + 1);
33547 return TmpStr.take_while(llvm::isAlpha);
33548 }
33549
33550 return StringRef();
33551}
33552
33553bool X86TargetLowering::isInlineAsmTargetBranch(
33554 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33555 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
33556 // changed from indirect TargetLowering::C_Memory to direct
33557 // TargetLowering::C_Address.
33558 // We don't need to special case LOOP* and Jcc, which cannot target a memory
33559 // location.
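  // e.g. for "call dword ptr ${0:P}", operand 0 names a call target and should
  // be constrained as an address rather than as a memory operand to be loaded.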
33560 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
33561 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
33562}
33563
33564static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL,
33565 SDValue Mask) {
33566 EVT Ty = MVT::i8;
33567 auto V = DAG.getBitcast(MVT::i1, Mask);
33568 auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
33569 auto Zero = DAG.getConstant(0, DL, Ty);
33570 SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
33571 auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
33572 return SDValue(CmpZero.getNode(), 1);
33573}
33574
33575SDValue X86TargetLowering::visitMaskedLoad(
33576 SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
33577 SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
33578 // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
33579 // ->
33580 // _, flags = SUB 0, mask
33581 // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
33582 // bit_cast_to_vector<res>
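  // (The v1i1 mask is zero-extended to an i8 0/1 value; SUB 0, mask sets the
  // flags, and COND_NE means the conditional load is only performed when the
  // single mask bit is set.)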
33583 EVT VTy = PassThru.getValueType();
33584 EVT Ty = VTy.getVectorElementType();
33585 SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
33586 auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
33587 : DAG.getBitcast(Ty, PassThru);
33588 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33589 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33590 SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
33591 NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
33592 return DAG.getBitcast(VTy, NewLoad);
33593}
33594
33595SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
33596 SDValue Chain,
33597 MachineMemOperand *MMO, SDValue Ptr,
33598 SDValue Val, SDValue Mask) const {
33599 // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
33600 // ->
33601 // _, flags = SUB 0, mask
33602 // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
33603 EVT Ty = Val.getValueType().getVectorElementType();
33604 SDVTList Tys = DAG.getVTList(MVT::Other);
33605 auto ScalarVal = DAG.getBitcast(Ty, Val);
33606 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33607 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33608 SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
33609 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
33610}
33611
33612/// Provide custom lowering hooks for some operations.
33613SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
33614 switch (Op.getOpcode()) {
33615 // clang-format off
33616 default: llvm_unreachable("Should not custom lower this!");
33617 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33618 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
33619 return LowerCMP_SWAP(Op, Subtarget, DAG);
33620 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33621 case ISD::ATOMIC_LOAD_ADD:
33622 case ISD::ATOMIC_LOAD_SUB:
33623 case ISD::ATOMIC_LOAD_OR:
33624 case ISD::ATOMIC_LOAD_XOR:
33625 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33626 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33627 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33628 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33629 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33630 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33631 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33632 case ISD::VECTOR_COMPRESS: return lowerVECTOR_COMPRESS(Op, Subtarget, DAG);
33633 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33634 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33635 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33636 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33637 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33638 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33639 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33640 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33641 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33642 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33643 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33644 case ISD::SHL_PARTS:
33645 case ISD::SRA_PARTS:
33646 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33647 case ISD::FSHL:
33648 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33649 case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG);
33650 case ISD::STRICT_SINT_TO_FP:
33651 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33652 case ISD::STRICT_UINT_TO_FP:
33653 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33654 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33655 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33656 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33657 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33658 case ISD::ZERO_EXTEND_VECTOR_INREG:
33659 case ISD::SIGN_EXTEND_VECTOR_INREG:
33660 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33661 case ISD::FP_TO_SINT:
33662 case ISD::STRICT_FP_TO_SINT:
33663 case ISD::FP_TO_UINT:
33664 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33665 case ISD::FP_TO_SINT_SAT:
33666 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33667 case ISD::FP_EXTEND:
33668 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33669 case ISD::FP_ROUND:
33670 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33671 case ISD::FP16_TO_FP:
33672 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33673 case ISD::FP_TO_FP16:
33674 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33675 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33676 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33677 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33678 case ISD::FADD:
33679 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33680 case ISD::FROUND: return LowerFROUND(Op, DAG);
33681 case ISD::FABS:
33682 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33683 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33684 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33685 case ISD::LRINT:
33686 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33687 case ISD::SETCC:
33688 case ISD::STRICT_FSETCC:
33689 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33690 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33691 case ISD::SELECT: return LowerSELECT(Op, DAG);
33692 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33693 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33694 case ISD::VASTART: return LowerVASTART(Op, DAG);
33695 case ISD::VAARG: return LowerVAARG(Op, DAG);
33696 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33697 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33698 case ISD::INTRINSIC_VOID:
33699 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33700 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33701 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33702 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33703 case ISD::FRAME_TO_ARGS_OFFSET:
33704 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33705 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33706 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33707 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33708 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33709 case ISD::EH_SJLJ_SETUP_DISPATCH:
33710 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33711 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33712 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33713 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33714 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33715 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
33716 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
33717 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
33718 case ISD::CTLZ:
33719 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33720 case ISD::CTTZ:
33721 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33722 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33723 case ISD::MULHS:
33724 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33725 case ISD::ROTL:
33726 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33727 case ISD::SRA:
33728 case ISD::SRL:
33729 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33730 case ISD::SADDO:
33731 case ISD::UADDO:
33732 case ISD::SSUBO:
33733 case ISD::USUBO: return LowerXALUO(Op, DAG);
33734 case ISD::SMULO:
33735 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33736 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33737 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33738 case ISD::SADDO_CARRY:
33739 case ISD::SSUBO_CARRY:
33740 case ISD::UADDO_CARRY:
33741 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
33742 case ISD::ADD:
33743 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33744 case ISD::UADDSAT:
33745 case ISD::SADDSAT:
33746 case ISD::USUBSAT:
33747 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33748 case ISD::SMAX:
33749 case ISD::SMIN:
33750 case ISD::UMAX:
33751 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33752 case ISD::FMINIMUM:
33753 case ISD::FMAXIMUM:
33754 case ISD::FMINIMUMNUM:
33755 case ISD::FMAXIMUMNUM:
33756 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
33757 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33758 case ISD::ABDS:
33759 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33760 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33761 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33762 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33763 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33764 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33765 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33766 case ISD::GC_TRANSITION_START:
33767 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33768 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33769 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33770 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
33771 // clang-format on
33772 }
33773}
33774
33775/// Replace a node with an illegal result type with a new node built out of
33776/// custom code.
33777void X86TargetLowering::ReplaceNodeResults(SDNode *N,
33778 SmallVectorImpl<SDValue> &Results,
33779 SelectionDAG &DAG) const {
33780 SDLoc dl(N);
33781 unsigned Opc = N->getOpcode();
33782 switch (Opc) {
33783 default:
33784#ifndef NDEBUG
33785 dbgs() << "ReplaceNodeResults: ";
33786 N->dump(&DAG);
33787#endif
33788 llvm_unreachable("Do not know how to custom type legalize this operation!");
33789 case X86ISD::CVTPH2PS: {
33790 EVT VT = N->getValueType(0);
33791 SDValue Lo, Hi;
33792 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33793 EVT LoVT, HiVT;
33794 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33795 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33796 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33797 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33798 Results.push_back(Res);
33799 return;
33800 }
33801 case X86ISD::STRICT_CVTPH2PS: {
33802 EVT VT = N->getValueType(0);
33803 SDValue Lo, Hi;
33804 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33805 EVT LoVT, HiVT;
33806 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33807 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33808 {N->getOperand(0), Lo});
33809 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33810 {N->getOperand(0), Hi});
33811 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33812 Lo.getValue(1), Hi.getValue(1));
33813 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33814 Results.push_back(Res);
33815 Results.push_back(Chain);
33816 return;
33817 }
33818 case X86ISD::CVTPS2PH:
33819 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33820 return;
33821 case ISD::CTPOP: {
33822 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33823 // If we have at most 32 active bits, then perform as i32 CTPOP.
33824 // TODO: Perform this in generic legalizer?
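  // e.g. a ctpop of an i64 whose top 32 bits are known zero can be performed
  // as an i32 ctpop of the (trailing-zero-shifted) active bits.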
33825 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
33826 unsigned LZ = Known.countMinLeadingZeros();
33827 unsigned TZ = Known.countMinTrailingZeros();
33828 if ((LZ + TZ) >= 32) {
33829 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
33830 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
33831 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
33832 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
33833 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
33834 Results.push_back(Op);
33835 return;
33836 }
33837 // Use a v2i64 if possible.
33838 bool NoImplicitFloatOps =
33839 DAG.getMachineFunction().getFunction().hasFnAttribute(
33840 Attribute::NoImplicitFloat);
33841 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33842 SDValue Wide =
33843 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33844 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33845 // Bit count should fit in 32-bits, extract it as that and then zero
33846 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33847 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33848 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33849 DAG.getVectorIdxConstant(0, dl));
33850 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33851 Results.push_back(Wide);
33852 }
33853 return;
33854 }
33855 case ISD::MUL: {
33856 EVT VT = N->getValueType(0);
33857 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33858 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33859 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33860 // elements are needed.
33861 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33862 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33863 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33864 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33865 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33866 unsigned NumConcats = 16 / VT.getVectorNumElements();
33867 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33868 ConcatOps[0] = Res;
33869 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33870 Results.push_back(Res);
33871 return;
33872 }
33873 case ISD::SMULO:
33874 case ISD::UMULO: {
33875 EVT VT = N->getValueType(0);
33876 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33877 VT == MVT::v2i32 && "Unexpected VT!");
33878 bool IsSigned = Opc == ISD::SMULO;
33879 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
33880 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33881 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33882 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33883 // Extract the high 32 bits from each result using PSHUFD.
33884 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
33885 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33886 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33887 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33888 DAG.getVectorIdxConstant(0, dl));
33889
33890 // Truncate the low bits of the result. This will become PSHUFD.
33891 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33892
33893 SDValue HiCmp;
33894 if (IsSigned) {
33895 // SMULO overflows if the high bits don't match the sign of the low.
33896 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33897 } else {
33898 // UMULO overflows if the high bits are non-zero.
33899 HiCmp = DAG.getConstant(0, dl, VT);
33900 }
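  // i.e. the multiply overflowed iff the high half of the widened product does
  // not match HiCmp (the sign-extension of the low half for SMULO, zero for
  // UMULO).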
33901 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33902
33903 // Widen the result by padding with undef.
33904 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33905 DAG.getUNDEF(VT));
33906 Results.push_back(Res);
33907 Results.push_back(Ovf);
33908 return;
33909 }
33910 case X86ISD::VPMADDWD: {
33911 // Legalize types for X86ISD::VPMADDWD by widening.
33912 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33913
33914 EVT VT = N->getValueType(0);
33915 EVT InVT = N->getOperand(0).getValueType();
33916 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33917 "Expected a VT that divides into 128 bits.");
33919 "Unexpected type action!");
33920 unsigned NumConcat = 128 / InVT.getSizeInBits();
33921
33922 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33923 InVT.getVectorElementType(),
33924 NumConcat * InVT.getVectorNumElements());
33925 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33926 VT.getVectorElementType(),
33927 NumConcat * VT.getVectorNumElements());
33928
33929 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33930 Ops[0] = N->getOperand(0);
33931 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33932 Ops[0] = N->getOperand(1);
33933 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33934
33935 SDValue Res = DAG.getNode(Opc, dl, WideVT, InVec0, InVec1);
33936 Results.push_back(Res);
33937 return;
33938 }
33939 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33940 case X86ISD::FMINC:
33941 case X86ISD::FMIN:
33942 case X86ISD::FMAXC:
33943 case X86ISD::FMAX:
33944 case X86ISD::STRICT_FMIN:
33945 case X86ISD::STRICT_FMAX: {
33946 EVT VT = N->getValueType(0);
33947 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33948 bool IsStrict = Opc == X86ISD::STRICT_FMIN || Opc == X86ISD::STRICT_FMAX;
33949 SDValue UNDEF = DAG.getUNDEF(VT);
33950 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33951 N->getOperand(IsStrict ? 1 : 0), UNDEF);
33952 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33953 N->getOperand(IsStrict ? 2 : 1), UNDEF);
33954 SDValue Res;
33955 if (IsStrict)
33956 Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33957 {N->getOperand(0), LHS, RHS});
33958 else
33959 Res = DAG.getNode(Opc, dl, MVT::v4f32, LHS, RHS);
33960 Results.push_back(Res);
33961 if (IsStrict)
33962 Results.push_back(Res.getValue(1));
33963 return;
33964 }
33965 case ISD::SDIV:
33966 case ISD::UDIV:
33967 case ISD::SREM:
33968 case ISD::UREM: {
33969 EVT VT = N->getValueType(0);
33970 if (VT.isVector()) {
33972 "Unexpected type action!");
33973 // If this RHS is a constant splat vector we can widen this and let
33974 // division/remainder by constant optimize it.
33975 // TODO: Can we do something for non-splat?
33976 APInt SplatVal;
33977 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33978 unsigned NumConcats = 128 / VT.getSizeInBits();
33979 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33980 Ops0[0] = N->getOperand(0);
33981 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33982 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33983 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33984 SDValue Res = DAG.getNode(Opc, dl, ResVT, N0, N1);
33985 Results.push_back(Res);
33986 }
33987 return;
33988 }
33989
33990 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33991 Results.push_back(V);
33992 return;
33993 }
33994 case ISD::TRUNCATE: {
33995 MVT VT = N->getSimpleValueType(0);
33996 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33997 return;
33998
33999 // The generic legalizer will try to widen the input type to the same
34000 // number of elements as the widened result type. But this isn't always
34001 // the best thing so do some custom legalization to avoid some cases.
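  // Strategies tried below, in order: PACKUS/PACKSS when enough bits are
  // known, a single shuffle for 128-bit-or-smaller inputs, AVX512 VTRUNC, and
  // finally widening the input so LowerTRUNCATE can handle it.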
34002 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
34003 SDValue In = N->getOperand(0);
34004 EVT InVT = In.getValueType();
34005 EVT InEltVT = InVT.getVectorElementType();
34006 EVT EltVT = VT.getVectorElementType();
34007 unsigned MinElts = VT.getVectorNumElements();
34008 unsigned WidenNumElts = WidenVT.getVectorNumElements();
34009 unsigned InBits = InVT.getSizeInBits();
34010
34011 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
34012 unsigned PackOpcode;
34013 if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG,
34014 Subtarget, N->getFlags())) {
34015 if (SDValue Res =
34016 truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) {
34017 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
34018 Results.push_back(Res);
34019 return;
34020 }
34021 }
34022
34023 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
34024 // 128 bit and smaller inputs should avoid truncate altogether and
34025 // use a shuffle.
34026 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
34027 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
34028 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
34029 for (unsigned I = 0; I < MinElts; ++I)
34030 TruncMask[I] = Scale * I;
34031 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
34032 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
34033 "Illegal vector type in truncation");
34034 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
34035 Results.push_back(
34036 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
34037 return;
34038 }
34039 }
34040
34041 // With AVX512 there are some cases that can use a target specific
34042 // truncate node to go from 256/512 to less than 128 with zeros in the
34043 // upper elements of the 128 bit result.
34044 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34045 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
34046 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34047 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34048 return;
34049 }
34050 // There's one case we can widen to 512 bits and use VTRUNC.
34051 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34052 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34053 DAG.getUNDEF(MVT::v4i64));
34054 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34055 return;
34056 }
34057 }
34058 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34059 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34060 isTypeLegal(MVT::v4i64)) {
34061 // Input needs to be split and output needs to widened. Let's use two
34062 // VTRUNCs, and shuffle their results together into the wider type.
34063 SDValue Lo, Hi;
34064 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34065
34066 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34067 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34068 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34069 { 0, 1, 2, 3, 16, 17, 18, 19,
34070 -1, -1, -1, -1, -1, -1, -1, -1 });
34071 Results.push_back(Res);
34072 return;
34073 }
34074
34075 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
34076 // this via type legalization.
34077 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
34078 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
34079 (!Subtarget.hasSSSE3() ||
34080 (!isTypeLegal(InVT) &&
34081 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
34082 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
34083 InEltVT.getSizeInBits() * WidenNumElts);
34084 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
34085 return;
34086 }
34087
34088 return;
34089 }
34090 case ISD::ANY_EXTEND:
34091 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34092 // It's intended to custom handle the input type.
34093 assert(N->getValueType(0) == MVT::v8i8 &&
34094 "Do not know how to legalize this Node");
34095 return;
34096 case ISD::SIGN_EXTEND:
34097 case ISD::ZERO_EXTEND: {
34098 EVT VT = N->getValueType(0);
34099 SDValue In = N->getOperand(0);
34100 EVT InVT = In.getValueType();
34101 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34102 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34104 "Unexpected type action!");
34105 assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
34106 // Custom split this so we can extend i8/i16->i32 invec. This is better
34107 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
34108 // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
34109 // we allow the sra from the extend to i32 to be shared by the split.
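  // i.e. extend to v4i32 once, materialize each element's sign bits with a
  // compare against zero, then interleave value/sign pairs to form the two
  // v2i64 halves.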
34110 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34111
34112 // Fill a vector with sign bits for each element.
34113 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34114 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34115
34116 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34117 // to v2i64.
34118 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34119 {0, 4, 1, 5});
34120 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34121 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34122 {2, 6, 3, 7});
34123 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34124
34125 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34126 Results.push_back(Res);
34127 return;
34128 }
34129
34130 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34131 if (!InVT.is128BitVector()) {
34132 // Not a 128 bit vector, but maybe type legalization will promote
34133 // it to 128 bits.
34134 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34135 return;
34136 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34137 if (!InVT.is128BitVector())
34138 return;
34139
34140 // Promote the input to 128 bits. Type legalization will turn this into
34141 // zext_inreg/sext_inreg.
34142 In = DAG.getNode(Opc, dl, InVT, In);
34143 }
34144
34145 // Perform custom splitting instead of the two stage extend we would get
34146 // by default.
34147 EVT LoVT, HiVT;
34148 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34149 assert(isTypeLegal(LoVT) && "Split VT not legal?");
34150
34151 SDValue Lo = getEXTEND_VECTOR_INREG(Opc, dl, LoVT, In, DAG);
34152
34153 // We need to shift the input over by half the number of elements.
34154 unsigned NumElts = InVT.getVectorNumElements();
34155 unsigned HalfNumElts = NumElts / 2;
34156 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34157 for (unsigned i = 0; i != HalfNumElts; ++i)
34158 ShufMask[i] = i + HalfNumElts;
34159
34160 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34161 Hi = getEXTEND_VECTOR_INREG(Opc, dl, HiVT, Hi, DAG);
34162
34163 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34164 Results.push_back(Res);
34165 }
34166 return;
34167 }
34168 case ISD::FP_TO_SINT_SAT:
34169 case ISD::FP_TO_UINT_SAT: {
34170 if (!Subtarget.hasAVX10_2())
34171 return;
34172
34173 bool IsSigned = Opc == ISD::FP_TO_SINT_SAT;
34174 EVT VT = N->getValueType(0);
34175 SDValue Op = N->getOperand(0);
34176 EVT OpVT = Op.getValueType();
34177 SDValue Res;
34178
34179 if (VT == MVT::v2i32 && OpVT == MVT::v2f64) {
34180 if (IsSigned)
34181 Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op);
34182 else
34183 Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op);
34184 Results.push_back(Res);
34185 }
34186 return;
34187 }
34188 case ISD::FP_TO_SINT:
34189 case ISD::STRICT_FP_TO_SINT:
34190 case ISD::FP_TO_UINT:
34191 case ISD::STRICT_FP_TO_UINT: {
34192 bool IsStrict = N->isStrictFPOpcode();
34193 bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
34194 EVT VT = N->getValueType(0);
34195 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34196 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34197 EVT SrcVT = Src.getValueType();
34198
34199 SDValue Res;
34200 if (isSoftF16(SrcVT, Subtarget)) {
34201 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34202 if (IsStrict) {
34203 Res =
34204 DAG.getNode(Opc, dl, {VT, MVT::Other},
34205 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34206 {NVT, MVT::Other}, {Chain, Src})});
34207 Chain = Res.getValue(1);
34208 } else {
34209 Res =
34210 DAG.getNode(Opc, dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34211 }
34212 Results.push_back(Res);
34213 if (IsStrict)
34214 Results.push_back(Chain);
34215
34216 return;
34217 }
34218
34219 if (VT.isVector() && Subtarget.hasFP16() && Subtarget.hasVLX() &&
34220 SrcVT.getVectorElementType() == MVT::f16) {
34221 EVT EleVT = VT.getVectorElementType();
34222 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34223
34224 if (SrcVT != MVT::v8f16) {
34225 SDValue Tmp =
34226 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34227 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34228 Ops[0] = Src;
34229 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34230 }
34231
34232 if (IsStrict) {
34233 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34234 Res =
34235 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34236 Chain = Res.getValue(1);
34237 } else {
34238 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34239 Res = DAG.getNode(Opc, dl, ResVT, Src);
34240 }
34241
34242 // TODO: Need to add exception check code for strict FP.
34243 if (EleVT.getSizeInBits() < 16) {
34244 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34245 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34246
34247 // Now widen to 128 bits.
34248 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34249 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34250 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34251 ConcatOps[0] = Res;
34252 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34253 }
34254
34255 Results.push_back(Res);
34256 if (IsStrict)
34257 Results.push_back(Chain);
34258
34259 return;
34260 }
34261
34262 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34263 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34264 "Unexpected type action!");
34265
34266 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34267 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34268 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34269 VT.getVectorNumElements());
34270 SDValue Res;
34271 SDValue Chain;
34272 if (IsStrict) {
34273 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34274 {N->getOperand(0), Src});
34275 Chain = Res.getValue(1);
34276 } else
34277 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34278
34279 // Preserve what we know about the size of the original result. If the
34280 // result is v2i32, we have to manually widen the assert.
34281 if (PromoteVT == MVT::v2i32)
34282 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34283 DAG.getUNDEF(MVT::v2i32));
34284
34285 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34286 Res.getValueType(), Res,
34287 DAG.getValueType(VT.getVectorElementType()));
34288
34289 if (PromoteVT == MVT::v2i32)
34290 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34291 DAG.getVectorIdxConstant(0, dl));
34292
34293 // Truncate back to the original width.
34294 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34295
34296 // Now widen to 128 bits.
34297 unsigned NumConcats = 128 / VT.getSizeInBits();
34298 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
34299 VT.getVectorNumElements() * NumConcats);
34300 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34301 ConcatOps[0] = Res;
34302 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34303 Results.push_back(Res);
34304 if (IsStrict)
34305 Results.push_back(Chain);
34306 return;
34307 }
34308
34309
34310 if (VT == MVT::v2i32) {
34311 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34312 "Strict unsigned conversion requires AVX512");
34313 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34314 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34315 "Unexpected type action!");
34316 if (Src.getValueType() == MVT::v2f64) {
34317 if (!IsSigned && !Subtarget.hasAVX512()) {
34318 SDValue Res =
34319 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34320 Results.push_back(Res);
34321 return;
34322 }
34323
34324 if (IsStrict)
34325 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34326 else
34327 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34328
34329 // If we have VLX we can emit a target specific FP_TO_UINT node.
34330 if (!IsSigned && !Subtarget.hasVLX()) {
34331 // Otherwise we can defer to the generic legalizer which will widen
34332 // the input as well. This will be further widened during op
34333 // legalization to v8i32<-v8f64.
34334 // For strict nodes we'll need to widen ourselves.
34335 // FIXME: Fix the type legalizer to safely widen strict nodes?
34336 if (!IsStrict)
34337 return;
34338 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34339 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34340 Opc = N->getOpcode();
34341 }
34342 SDValue Res;
34343 SDValue Chain;
34344 if (IsStrict) {
34345 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34346 {N->getOperand(0), Src});
34347 Chain = Res.getValue(1);
34348 } else {
34349 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34350 }
34351 Results.push_back(Res);
34352 if (IsStrict)
34353 Results.push_back(Chain);
34354 return;
34355 }
34356
34357 // Custom widen strict v2f32->v2i32 by padding with zeros.
34358 // FIXME: Should generic type legalizer do this?
34359 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34360 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34361 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34362 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34363 {N->getOperand(0), Src});
34364 Results.push_back(Res);
34365 Results.push_back(Res.getValue(1));
34366 return;
34367 }
34368
34369 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34370 // so early out here.
34371 return;
34372 }
34373
34374 assert(!VT.isVector() && "Vectors should have been handled above!");
34375
34376 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34377 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34378 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34379 assert(!Subtarget.is64Bit() && "i64 should be legal");
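// Sketch of the strategy below (given AVX512DQ or FP16): the scalar source
// is inserted into lane 0 of a zero vector, converted with a packed
// vector-to-vXi64 node (legal with these features even though scalar
// fp-to-i64 is not legal in 32-bit mode), and the low i64 lane is extracted
// as the result.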
34380 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34381 // If we use a 128-bit result we might need to use a target specific node.
34382 unsigned SrcElts =
34383 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34384 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34385 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34386 if (NumElts != SrcElts) {
34387 if (IsStrict)
34388 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34389 else
34390 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34391 }
34392
34393 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
34394 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34395 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34396 ZeroIdx);
34397 SDValue Chain;
34398 if (IsStrict) {
34399 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34400 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34401 Chain = Res.getValue(1);
34402 } else
34403 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34404 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34405 Results.push_back(Res);
34406 if (IsStrict)
34407 Results.push_back(Chain);
34408 return;
34409 }
34410
34411 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34412 SDValue Chain;
34413 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34414 Results.push_back(V);
34415 if (IsStrict)
34416 Results.push_back(Chain);
34417 return;
34418 }
34419
34420 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34421 Results.push_back(V);
34422 if (IsStrict)
34423 Results.push_back(Chain);
34424 }
34425 return;
34426 }
34427 case ISD::LRINT:
34428 if (N->getValueType(0) == MVT::v2i32) {
34429 SDValue Src = N->getOperand(0);
34430 if (Subtarget.hasFP16() && Src.getValueType() == MVT::v2f16) {
34431 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src,
34432 DAG.getUNDEF(MVT::v2f16));
34433 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Src,
34434 DAG.getUNDEF(MVT::v4f16));
34435 } else if (Src.getValueType() != MVT::v2f64) {
34436 return;
34437 }
34438 Results.push_back(DAG.getNode(X86ISD::CVTP2SI, dl, MVT::v4i32, Src));
34439 return;
34440 }
34441 [[fallthrough]];
34442 case ISD::LLRINT: {
34443 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34444 Results.push_back(V);
34445 return;
34446 }
34447
34448 case ISD::SINT_TO_FP:
34449 case ISD::STRICT_SINT_TO_FP:
34450 case ISD::UINT_TO_FP:
34451 case ISD::STRICT_UINT_TO_FP: {
34452 bool IsStrict = N->isStrictFPOpcode();
34453 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
34454 EVT VT = N->getValueType(0);
34455 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34456 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34457 Subtarget.hasVLX()) {
34458 if (Src.getValueType().getVectorElementType() == MVT::i16)
34459 return;
34460
34461 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34462 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34463 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34464 : DAG.getUNDEF(MVT::v2i32));
34465 if (IsStrict) {
34466 unsigned Opc =
34467 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34468 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34469 {N->getOperand(0), Src});
34470 Results.push_back(Res);
34471 Results.push_back(Res.getValue(1));
34472 } else {
34473 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34474 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34475 }
34476 return;
34477 }
34478 if (VT != MVT::v2f32)
34479 return;
34480 EVT SrcVT = Src.getValueType();
34481 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34482 if (IsStrict) {
34483 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34484 : X86ISD::STRICT_CVTUI2P;
34485 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34486 {N->getOperand(0), Src});
34487 Results.push_back(Res);
34488 Results.push_back(Res.getValue(1));
34489 } else {
34490 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34491 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34492 }
34493 return;
34494 }
34495 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34496 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
34497 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34498 SDValue One = DAG.getConstant(1, dl, SrcVT);
34499 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34500 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34501 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
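// For lanes with the sign bit set, Sign holds Src logically shifted right by
// one with the dropped low bit OR'ed back in (so rounding stays correct);
// converting that value and then adding the result to itself (the FADD
// below) reconstructs the unsigned magnitude. Non-negative lanes simply use
// the direct signed conversion.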
34502 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34503 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34504 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34505 for (int i = 0; i != 2; ++i) {
34506 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34507 SignSrc, DAG.getVectorIdxConstant(i, dl));
34508 if (IsStrict)
34509 SignCvts[i] =
34510 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34511 {N->getOperand(0), Elt});
34512 else
34513 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34514 }
34515 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34516 SDValue Slow, Chain;
34517 if (IsStrict) {
34518 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34519 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34520 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34521 {Chain, SignCvt, SignCvt});
34522 Chain = Slow.getValue(1);
34523 } else {
34524 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34525 }
34526 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34527 IsNeg =
34528 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34529 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34530 Results.push_back(Cvt);
34531 if (IsStrict)
34532 Results.push_back(Chain);
34533 return;
34534 }
34535
34536 if (SrcVT != MVT::v2i32)
34537 return;
34538
34539 if (IsSigned || Subtarget.hasAVX512()) {
34540 if (!IsStrict)
34541 return;
34542
34543 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34544 // FIXME: Should generic type legalizer do this?
34545 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34546 DAG.getConstant(0, dl, MVT::v2i32));
34547 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34548 {N->getOperand(0), Src});
34549 Results.push_back(Res);
34550 Results.push_back(Res.getValue(1));
34551 return;
34552 }
34553
34554 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
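// Standard double-precision bias trick: 0x4330000000000000 is the f64 value
// 2^52, so OR'ing a zero-extended 32-bit integer into its low mantissa bits
// produces exactly 2^52 + x; subtracting VBias (2^52) then yields x as an
// f64, which is rounded down to f32 below. This avoids needing any
// unsigned-int-to-fp instruction, which SSE2 does not provide.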
34555 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34556 SDValue VBias = DAG.getConstantFP(
34557 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34558 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34559 DAG.getBitcast(MVT::v2i64, VBias));
34560 Or = DAG.getBitcast(MVT::v2f64, Or);
34561 if (IsStrict) {
34562 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34563 {N->getOperand(0), Or, VBias});
34564 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34565 {MVT::v4f32, MVT::Other},
34566 {Sub.getValue(1), Sub});
34567 Results.push_back(Res);
34568 Results.push_back(Res.getValue(1));
34569 } else {
34570 // TODO: Are there any fast-math-flags to propagate here?
34571 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34572 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34573 }
34574 return;
34575 }
34576 case ISD::STRICT_FP_ROUND:
34577 case ISD::FP_ROUND: {
34578 bool IsStrict = N->isStrictFPOpcode();
34579 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34580 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34581 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34582 EVT SrcVT = Src.getValueType();
34583 EVT VT = N->getValueType(0);
34584 SDValue V;
34585 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34586 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34587 : DAG.getUNDEF(MVT::v2f32);
34588 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34589 }
34590 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34591 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34592 if (SrcVT.getVectorElementType() != MVT::f32)
34593 return;
34594
34595 if (IsStrict)
34596 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34597 {Chain, Src, Rnd});
34598 else
34599 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34600
34601 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34602 if (IsStrict)
34603 Results.push_back(V.getValue(1));
34604 return;
34605 }
34606 if (!isTypeLegal(Src.getValueType()))
34607 return;
34608 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34609 if (IsStrict)
34610 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34611 {Chain, Src});
34612 else
34613 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34614 Results.push_back(V);
34615 if (IsStrict)
34616 Results.push_back(V.getValue(1));
34617 return;
34618 }
34619 case ISD::FP_EXTEND:
34620 case ISD::STRICT_FP_EXTEND: {
34621 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34622 // No other ValueType for FP_EXTEND should reach this point.
34623 assert(N->getValueType(0) == MVT::v2f32 &&
34624 "Do not know how to legalize this Node");
34625 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34626 return;
34627 bool IsStrict = N->isStrictFPOpcode();
34628 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34629 if (Src.getValueType().getVectorElementType() != MVT::f16)
34630 return;
34631 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34632 : DAG.getUNDEF(MVT::v2f16);
34633 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34634 if (IsStrict)
34635 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34636 {N->getOperand(0), V});
34637 else
34638 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34639 Results.push_back(V);
34640 if (IsStrict)
34641 Results.push_back(V.getValue(1));
34642 return;
34643 }
34644 case ISD::INTRINSIC_W_CHAIN: {
34645 unsigned IntNo = N->getConstantOperandVal(1);
34646 switch (IntNo) {
34647 default : llvm_unreachable("Do not know how to custom type "
34648 "legalize this intrinsic operation!");
34649 case Intrinsic::x86_rdtsc:
34650 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34651 Results);
34652 case Intrinsic::x86_rdtscp:
34653 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34654 Results);
34655 case Intrinsic::x86_rdpmc:
34656 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34657 Results);
34658 return;
34659 case Intrinsic::x86_rdpru:
34660 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34661 Results);
34662 return;
34663 case Intrinsic::x86_xgetbv:
34664 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34665 Results);
34666 return;
34667 }
34668 }
34669 case ISD::READCYCLECOUNTER: {
34670 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34671 }
34672 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34673 EVT T = N->getValueType(0);
34674 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34675 bool Regs64bit = T == MVT::i128;
34676 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34677 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34678 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
34679 SDValue cpInL, cpInH;
34680 std::tie(cpInL, cpInH) =
34681 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34682 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34683 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34684 cpInH =
34685 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34686 cpInH, cpInL.getValue(1));
34687 SDValue swapInL, swapInH;
34688 std::tie(swapInL, swapInH) =
34689 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34690 swapInH =
34691 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34692 swapInH, cpInH.getValue(1));
34693
34694 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34695 // until later. So we keep the RBX input in a vreg and use a custom
34696 // inserter.
34697 // Since RBX will be a reserved register, the register allocator will not
34698 // make sure its value is properly saved and restored around this
34699 // live-range.
34700 SDValue Result;
34701 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34702 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34703 if (Regs64bit) {
34704 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34705 swapInH.getValue(1)};
34706 Result =
34707 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34708 } else {
34709 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34710 swapInH.getValue(1));
34711 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34712 swapInL.getValue(1)};
34713 Result =
34714 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34715 }
34716
34717 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34718 Regs64bit ? X86::RAX : X86::EAX,
34719 HalfT, Result.getValue(1));
34720 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34721 Regs64bit ? X86::RDX : X86::EDX,
34722 HalfT, cpOutL.getValue(2));
34723 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34724
34725 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34726 MVT::i32, cpOutH.getValue(2));
34727 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34728 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34729
34730 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34731 Results.push_back(Success);
34732 Results.push_back(EFLAGS.getValue(1));
34733 return;
34734 }
34735 case ISD::ATOMIC_LOAD: {
34736 assert(
34737 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
34738 "Unexpected VT!");
34739 bool NoImplicitFloatOps =
34740 DAG.getMachineFunction().getFunction().hasFnAttribute(
34741 Attribute::NoImplicitFloat);
34742 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34743 auto *Node = cast<AtomicSDNode>(N);
34744
34745 if (N->getValueType(0) == MVT::i128) {
34746 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
34747 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
34748 Node->getBasePtr(), Node->getMemOperand());
34749 SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34750 DAG.getVectorIdxConstant(0, dl));
34751 SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34752 DAG.getVectorIdxConstant(1, dl));
34753 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
34754 {ResL, ResH}));
34755 Results.push_back(Ld.getValue(1));
34756 return;
34757 }
34758 break;
34759 }
34760 if (Subtarget.hasSSE1()) {
34761 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34762 // Then extract the lower 64-bits.
34763 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34764 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34765 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34766 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34767 MVT::i64, Node->getMemOperand());
34768 if (Subtarget.hasSSE2()) {
34769 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34770 DAG.getVectorIdxConstant(0, dl));
34771 Results.push_back(Res);
34772 Results.push_back(Ld.getValue(1));
34773 return;
34774 }
34775 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34776 // then casts to i64. This avoids a 128-bit stack temporary being
34777 // created by type legalization if we were to cast v4f32->v2i64.
34778 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34779 DAG.getVectorIdxConstant(0, dl));
34780 Res = DAG.getBitcast(MVT::i64, Res);
34781 Results.push_back(Res);
34782 Results.push_back(Ld.getValue(1));
34783 return;
34784 }
34785 if (Subtarget.hasX87()) {
34786 // First load this into an 80-bit X87 register. This will put the whole
34787 // integer into the significand.
34788 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34789 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34790 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34791 dl, Tys, Ops, MVT::i64,
34792 Node->getMemOperand());
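// The 64-bit x87 load above is a single memory access, which is what lets
// this sequence stand in for an atomic i64 load on targets without 64-bit
// integer registers; the stack spill/reload that follows does not need to
// be atomic, as the comments below note.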
34793 SDValue Chain = Result.getValue(1);
34794
34795 // Now store the X87 register to a stack temporary and convert to i64.
34796 // This store is not atomic and doesn't need to be.
34797 // FIXME: We don't need a stack temporary if the result of the load
34798 // is already being stored. We could just directly store there.
34799 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34800 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34801 MachinePointerInfo MPI =
34802 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34803 SDValue StoreOps[] = { Chain, Result, StackPtr };
34804 Chain = DAG.getMemIntrinsicNode(
34805 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34806 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34807
34808 // Finally load the value back from the stack temporary and return it.
34809 // This load is not atomic and doesn't need to be.
34810 // This load will be further type legalized.
34811 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34812 Results.push_back(Result);
34813 Results.push_back(Result.getValue(1));
34814 return;
34815 }
34816 }
34817 // TODO: Use MOVLPS when SSE1 is available?
34818 // Delegate to generic TypeLegalization. Situations we can really handle
34819 // should have already been dealt with by AtomicExpandPass.cpp.
34820 break;
34821 }
34822 case ISD::ATOMIC_SWAP:
34823 case ISD::ATOMIC_LOAD_ADD:
34824 case ISD::ATOMIC_LOAD_SUB:
34825 case ISD::ATOMIC_LOAD_AND:
34826 case ISD::ATOMIC_LOAD_OR:
34827 case ISD::ATOMIC_LOAD_XOR:
34828 case ISD::ATOMIC_LOAD_NAND:
34829 case ISD::ATOMIC_LOAD_MIN:
34830 case ISD::ATOMIC_LOAD_MAX:
34831 case ISD::ATOMIC_LOAD_UMIN:
34832 case ISD::ATOMIC_LOAD_UMAX:
34833 // Delegate to generic TypeLegalization. Situations we can really handle
34834 // should have already been dealt with by AtomicExpandPass.cpp.
34835 break;
34836
34837 case ISD::BITCAST: {
34838 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34839 EVT DstVT = N->getValueType(0);
34840 EVT SrcVT = N->getOperand(0).getValueType();
34841
34842 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34843 // we can split using the k-register rather than memory.
34844 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34845 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
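// Sketch of the split: the v64i1 mask is divided into two v32i1 halves, each
// bitcast to i32 (expected to materialize as a move out of the k-register,
// e.g. KMOVD), and the two i32 pieces are reassembled into the i64 result
// with BUILD_PAIR.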
34846 SDValue Lo, Hi;
34847 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34848 Lo = DAG.getBitcast(MVT::i32, Lo);
34849 Hi = DAG.getBitcast(MVT::i32, Hi);
34850 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34851 Results.push_back(Res);
34852 return;
34853 }
34854
34855 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34856 // FIXME: Use v4f32 for SSE1?
34857 assert(Subtarget.hasSSE2() && "Requires SSE2");
34858 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34859 "Unexpected type action!");
34860 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34861 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34862 N->getOperand(0));
34863 Res = DAG.getBitcast(WideVT, Res);
34864 Results.push_back(Res);
34865 return;
34866 }
34867
34868 return;
34869 }
34870 case ISD::MGATHER: {
34871 EVT VT = N->getValueType(0);
34872 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34873 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34874 auto *Gather = cast<MaskedGatherSDNode>(N);
34875 SDValue Index = Gather->getIndex();
34876 if (Index.getValueType() != MVT::v2i64)
34877 return;
34878 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34879 "Unexpected type action!");
34880 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34881 SDValue Mask = Gather->getMask();
34882 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34883 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34884 Gather->getPassThru(),
34885 DAG.getUNDEF(VT));
34886 if (!Subtarget.hasVLX()) {
34887 // We need to widen the mask, but the instruction will only use 2
34888 // of its elements. So we can use undef.
34889 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34890 DAG.getUNDEF(MVT::v2i1));
34891 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34892 }
34893 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34894 Gather->getBasePtr(), Index, Gather->getScale() };
34895 SDValue Res = DAG.getMemIntrinsicNode(
34896 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34897 Gather->getMemoryVT(), Gather->getMemOperand());
34898 Results.push_back(Res);
34899 Results.push_back(Res.getValue(1));
34900 return;
34901 }
34902 return;
34903 }
34904 case ISD::LOAD: {
34905 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34906 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
34907 // cast since type legalization will try to use an i64 load.
34908 MVT VT = N->getSimpleValueType(0);
34909 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34910 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34911 "Unexpected type action!");
34912 if (!ISD::isNON_EXTLoad(N))
34913 return;
34914 auto *Ld = cast<LoadSDNode>(N);
34915 if (Subtarget.hasSSE2()) {
34916 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34917 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34918 Ld->getPointerInfo(), Ld->getBaseAlign(),
34919 Ld->getMemOperand()->getFlags());
34920 SDValue Chain = Res.getValue(1);
34921 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34922 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34923 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34924 Res = DAG.getBitcast(WideVT, Res);
34925 Results.push_back(Res);
34926 Results.push_back(Chain);
34927 return;
34928 }
34929 assert(Subtarget.hasSSE1() && "Expected SSE");
34930 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34931 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34932 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34933 MVT::i64, Ld->getMemOperand());
34934 Results.push_back(Res);
34935 Results.push_back(Res.getValue(1));
34936 return;
34937 }
34938 case ISD::ADDRSPACECAST: {
34939 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34940 Results.push_back(V);
34941 return;
34942 }
34943 case ISD::BITREVERSE: {
34944 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34945 assert((Subtarget.hasXOP() || Subtarget.hasGFNI()) && "Expected XOP/GFNI");
34946 // We can use VPPERM/GF2P8AFFINEQB by copying to a vector register and back.
34947 // We'll need to move the scalar in two i32 pieces.
34948 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34949 return;
34950 }
34951 case ISD::EXTRACT_VECTOR_ELT: {
34952 // f16 = extract vXf16 %vec, i64 %idx
34953 assert(N->getSimpleValueType(0) == MVT::f16 &&
34954 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34955 assert(Subtarget.hasFP16() && "Expected FP16");
34956 SDValue VecOp = N->getOperand(0);
34957 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
34958 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34959 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34960 N->getOperand(1));
34961 Split = DAG.getBitcast(MVT::f16, Split);
34962 Results.push_back(Split);
34963 return;
34964 }
34965 }
34966}
34967
34968const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
34969 switch ((X86ISD::NodeType)Opcode) {
34970 case X86ISD::FIRST_NUMBER: break;
34971#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
34972 NODE_NAME_CASE(BSF)
34973 NODE_NAME_CASE(BSR)
34974 NODE_NAME_CASE(FSHL)
34975 NODE_NAME_CASE(FSHR)
34976 NODE_NAME_CASE(FAND)
34977 NODE_NAME_CASE(FANDN)
34978 NODE_NAME_CASE(FOR)
34979 NODE_NAME_CASE(FXOR)
34980 NODE_NAME_CASE(FILD)
34981 NODE_NAME_CASE(FIST)
34982 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
34983 NODE_NAME_CASE(FLD)
34984 NODE_NAME_CASE(FST)
34985 NODE_NAME_CASE(CALL)
34986 NODE_NAME_CASE(CALL_RVMARKER)
34987 NODE_NAME_CASE(IMP_CALL)
34988 NODE_NAME_CASE(BT)
34989 NODE_NAME_CASE(CMP)
34990 NODE_NAME_CASE(FCMP)
34991 NODE_NAME_CASE(STRICT_FCMP)
34992 NODE_NAME_CASE(STRICT_FCMPS)
34993 NODE_NAME_CASE(COMI)
34994 NODE_NAME_CASE(UCOMI)
34995 NODE_NAME_CASE(COMX)
34996 NODE_NAME_CASE(UCOMX)
34997 NODE_NAME_CASE(CMPM)
34998 NODE_NAME_CASE(CMPMM)
34999 NODE_NAME_CASE(STRICT_CMPM)
35000 NODE_NAME_CASE(CMPMM_SAE)
35001 NODE_NAME_CASE(SETCC)
35002 NODE_NAME_CASE(SETCC_CARRY)
35003 NODE_NAME_CASE(FSETCC)
35004 NODE_NAME_CASE(FSETCCM)
35005 NODE_NAME_CASE(FSETCCM_SAE)
35006 NODE_NAME_CASE(CMOV)
35007 NODE_NAME_CASE(BRCOND)
35008 NODE_NAME_CASE(RET_GLUE)
35009 NODE_NAME_CASE(IRET)
35010 NODE_NAME_CASE(REP_STOS)
35011 NODE_NAME_CASE(REP_MOVS)
35012 NODE_NAME_CASE(GlobalBaseReg)
35013 NODE_NAME_CASE(Wrapper)
35014 NODE_NAME_CASE(WrapperRIP)
35015 NODE_NAME_CASE(MOVQ2DQ)
35016 NODE_NAME_CASE(MOVDQ2Q)
35017 NODE_NAME_CASE(MMX_MOVD2W)
35018 NODE_NAME_CASE(MMX_MOVW2D)
35019 NODE_NAME_CASE(PEXTRB)
35020 NODE_NAME_CASE(PEXTRW)
35021 NODE_NAME_CASE(INSERTPS)
35022 NODE_NAME_CASE(PINSRB)
35023 NODE_NAME_CASE(PINSRW)
35024 NODE_NAME_CASE(PSHUFB)
35025 NODE_NAME_CASE(ANDNP)
35026 NODE_NAME_CASE(BLENDI)
35027 NODE_NAME_CASE(BLENDV)
35028 NODE_NAME_CASE(HADD)
35029 NODE_NAME_CASE(HSUB)
35030 NODE_NAME_CASE(FHADD)
35031 NODE_NAME_CASE(FHSUB)
35032 NODE_NAME_CASE(CONFLICT)
35033 NODE_NAME_CASE(FMAX)
35034 NODE_NAME_CASE(FMAXS)
35035 NODE_NAME_CASE(FMAX_SAE)
35036 NODE_NAME_CASE(FMAXS_SAE)
35037 NODE_NAME_CASE(STRICT_FMAX)
35038 NODE_NAME_CASE(FMIN)
35039 NODE_NAME_CASE(FMINS)
35040 NODE_NAME_CASE(FMIN_SAE)
35041 NODE_NAME_CASE(FMINS_SAE)
35042 NODE_NAME_CASE(STRICT_FMIN)
35043 NODE_NAME_CASE(FMAXC)
35044 NODE_NAME_CASE(FMINC)
35045 NODE_NAME_CASE(FRSQRT)
35046 NODE_NAME_CASE(FRCP)
35047 NODE_NAME_CASE(EXTRQI)
35048 NODE_NAME_CASE(INSERTQI)
35049 NODE_NAME_CASE(TLSADDR)
35050 NODE_NAME_CASE(TLSBASEADDR)
35051 NODE_NAME_CASE(TLSCALL)
35052 NODE_NAME_CASE(TLSDESC)
35053 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35054 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35055 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35056 NODE_NAME_CASE(EH_RETURN)
35057 NODE_NAME_CASE(TC_RETURN)
35058 NODE_NAME_CASE(FNSTCW16m)
35059 NODE_NAME_CASE(FLDCW16m)
35060 NODE_NAME_CASE(FNSTENVm)
35061 NODE_NAME_CASE(FLDENVm)
35062 NODE_NAME_CASE(LCMPXCHG_DAG)
35063 NODE_NAME_CASE(LCMPXCHG8_DAG)
35064 NODE_NAME_CASE(LCMPXCHG16_DAG)
35065 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35066 NODE_NAME_CASE(LADD)
35067 NODE_NAME_CASE(LSUB)
35068 NODE_NAME_CASE(LOR)
35069 NODE_NAME_CASE(LXOR)
35070 NODE_NAME_CASE(LAND)
35071 NODE_NAME_CASE(LBTS)
35072 NODE_NAME_CASE(LBTC)
35073 NODE_NAME_CASE(LBTR)
35074 NODE_NAME_CASE(LBTS_RM)
35075 NODE_NAME_CASE(LBTC_RM)
35076 NODE_NAME_CASE(LBTR_RM)
35077 NODE_NAME_CASE(AADD)
35078 NODE_NAME_CASE(AOR)
35079 NODE_NAME_CASE(AXOR)
35080 NODE_NAME_CASE(AAND)
35081 NODE_NAME_CASE(VZEXT_MOVL)
35082 NODE_NAME_CASE(VZEXT_LOAD)
35083 NODE_NAME_CASE(VEXTRACT_STORE)
35084 NODE_NAME_CASE(VTRUNC)
35085 NODE_NAME_CASE(VTRUNCS)
35086 NODE_NAME_CASE(VTRUNCUS)
35087 NODE_NAME_CASE(VMTRUNC)
35088 NODE_NAME_CASE(VMTRUNCS)
35089 NODE_NAME_CASE(VMTRUNCUS)
35090 NODE_NAME_CASE(VTRUNCSTORES)
35091 NODE_NAME_CASE(VTRUNCSTOREUS)
35092 NODE_NAME_CASE(VMTRUNCSTORES)
35093 NODE_NAME_CASE(VMTRUNCSTOREUS)
35094 NODE_NAME_CASE(VFPEXT)
35095 NODE_NAME_CASE(STRICT_VFPEXT)
35096 NODE_NAME_CASE(VFPEXT_SAE)
35097 NODE_NAME_CASE(VFPEXTS)
35098 NODE_NAME_CASE(VFPEXTS_SAE)
35099 NODE_NAME_CASE(VFPROUND)
35100 NODE_NAME_CASE(VFPROUND2)
35101 NODE_NAME_CASE(VFPROUND2_RND)
35102 NODE_NAME_CASE(STRICT_VFPROUND)
35103 NODE_NAME_CASE(VMFPROUND)
35104 NODE_NAME_CASE(VFPROUND_RND)
35105 NODE_NAME_CASE(VFPROUNDS)
35106 NODE_NAME_CASE(VFPROUNDS_RND)
35107 NODE_NAME_CASE(VSHLDQ)
35108 NODE_NAME_CASE(VSRLDQ)
35109 NODE_NAME_CASE(VSHL)
35110 NODE_NAME_CASE(VSRL)
35111 NODE_NAME_CASE(VSRA)
35112 NODE_NAME_CASE(VSHLI)
35113 NODE_NAME_CASE(VSRLI)
35114 NODE_NAME_CASE(VSRAI)
35115 NODE_NAME_CASE(VSHLV)
35116 NODE_NAME_CASE(VSRLV)
35117 NODE_NAME_CASE(VSRAV)
35118 NODE_NAME_CASE(VROTLI)
35119 NODE_NAME_CASE(VROTRI)
35120 NODE_NAME_CASE(VPPERM)
35121 NODE_NAME_CASE(CMPP)
35122 NODE_NAME_CASE(STRICT_CMPP)
35123 NODE_NAME_CASE(PCMPEQ)
35124 NODE_NAME_CASE(PCMPGT)
35125 NODE_NAME_CASE(PHMINPOS)
35126 NODE_NAME_CASE(ADD)
35127 NODE_NAME_CASE(SUB)
35128 NODE_NAME_CASE(ADC)
35129 NODE_NAME_CASE(SBB)
35130 NODE_NAME_CASE(SMUL)
35131 NODE_NAME_CASE(UMUL)
35132 NODE_NAME_CASE(OR)
35133 NODE_NAME_CASE(XOR)
35134 NODE_NAME_CASE(AND)
35135 NODE_NAME_CASE(BEXTR)
35136 NODE_NAME_CASE(BEXTRI)
35137 NODE_NAME_CASE(BZHI)
35138 NODE_NAME_CASE(PDEP)
35139 NODE_NAME_CASE(PEXT)
35140 NODE_NAME_CASE(MUL_IMM)
35141 NODE_NAME_CASE(MOVMSK)
35142 NODE_NAME_CASE(PTEST)
35143 NODE_NAME_CASE(TESTP)
35144 NODE_NAME_CASE(KORTEST)
35145 NODE_NAME_CASE(KTEST)
35146 NODE_NAME_CASE(KADD)
35147 NODE_NAME_CASE(KSHIFTL)
35148 NODE_NAME_CASE(KSHIFTR)
35149 NODE_NAME_CASE(PACKSS)
35150 NODE_NAME_CASE(PACKUS)
35151 NODE_NAME_CASE(PALIGNR)
35152 NODE_NAME_CASE(VALIGN)
35153 NODE_NAME_CASE(VSHLD)
35154 NODE_NAME_CASE(VSHRD)
35155 NODE_NAME_CASE(VSHLDV)
35156 NODE_NAME_CASE(VSHRDV)
35157 NODE_NAME_CASE(PSHUFD)
35158 NODE_NAME_CASE(PSHUFHW)
35159 NODE_NAME_CASE(PSHUFLW)
35160 NODE_NAME_CASE(SHUFP)
35161 NODE_NAME_CASE(SHUF128)
35162 NODE_NAME_CASE(MOVLHPS)
35163 NODE_NAME_CASE(MOVHLPS)
35164 NODE_NAME_CASE(MOVDDUP)
35165 NODE_NAME_CASE(MOVSHDUP)
35166 NODE_NAME_CASE(MOVSLDUP)
35167 NODE_NAME_CASE(MOVSD)
35168 NODE_NAME_CASE(MOVSS)
35169 NODE_NAME_CASE(MOVSH)
35170 NODE_NAME_CASE(UNPCKL)
35171 NODE_NAME_CASE(UNPCKH)
35172 NODE_NAME_CASE(VBROADCAST)
35173 NODE_NAME_CASE(VBROADCAST_LOAD)
35174 NODE_NAME_CASE(VBROADCASTM)
35175 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35176 NODE_NAME_CASE(VPERMILPV)
35177 NODE_NAME_CASE(VPERMILPI)
35178 NODE_NAME_CASE(VPERM2X128)
35179 NODE_NAME_CASE(VPERMV)
35180 NODE_NAME_CASE(VPERMV3)
35181 NODE_NAME_CASE(VPERMI)
35182 NODE_NAME_CASE(VPTERNLOG)
35183 NODE_NAME_CASE(FP_TO_SINT_SAT)
35184 NODE_NAME_CASE(FP_TO_UINT_SAT)
35185 NODE_NAME_CASE(VFIXUPIMM)
35186 NODE_NAME_CASE(VFIXUPIMM_SAE)
35187 NODE_NAME_CASE(VFIXUPIMMS)
35188 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35189 NODE_NAME_CASE(VRANGE)
35190 NODE_NAME_CASE(VRANGE_SAE)
35191 NODE_NAME_CASE(VRANGES)
35192 NODE_NAME_CASE(VRANGES_SAE)
35193 NODE_NAME_CASE(PMULUDQ)
35194 NODE_NAME_CASE(PMULDQ)
35195 NODE_NAME_CASE(PSADBW)
35196 NODE_NAME_CASE(DBPSADBW)
35197 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35198 NODE_NAME_CASE(VAARG_64)
35199 NODE_NAME_CASE(VAARG_X32)
35200 NODE_NAME_CASE(DYN_ALLOCA)
35201 NODE_NAME_CASE(MFENCE)
35202 NODE_NAME_CASE(SEG_ALLOCA)
35203 NODE_NAME_CASE(PROBED_ALLOCA)
35204 NODE_NAME_CASE(RDRAND)
35205 NODE_NAME_CASE(RDSEED)
35206 NODE_NAME_CASE(RDPKRU)
35207 NODE_NAME_CASE(WRPKRU)
35208 NODE_NAME_CASE(VPMADDUBSW)
35209 NODE_NAME_CASE(VPMADDWD)
35210 NODE_NAME_CASE(VPSHA)
35211 NODE_NAME_CASE(VPSHL)
35212 NODE_NAME_CASE(VPCOM)
35213 NODE_NAME_CASE(VPCOMU)
35214 NODE_NAME_CASE(VPERMIL2)
35215 NODE_NAME_CASE(FMSUB)
35216 NODE_NAME_CASE(STRICT_FMSUB)
35217 NODE_NAME_CASE(FNMADD)
35218 NODE_NAME_CASE(STRICT_FNMADD)
35219 NODE_NAME_CASE(FNMSUB)
35220 NODE_NAME_CASE(STRICT_FNMSUB)
35221 NODE_NAME_CASE(FMADDSUB)
35222 NODE_NAME_CASE(FMSUBADD)
35223 NODE_NAME_CASE(FMADD_RND)
35224 NODE_NAME_CASE(FNMADD_RND)
35225 NODE_NAME_CASE(FMSUB_RND)
35226 NODE_NAME_CASE(FNMSUB_RND)
35227 NODE_NAME_CASE(FMADDSUB_RND)
35228 NODE_NAME_CASE(FMSUBADD_RND)
35229 NODE_NAME_CASE(VFMADDC)
35230 NODE_NAME_CASE(VFMADDC_RND)
35231 NODE_NAME_CASE(VFCMADDC)
35232 NODE_NAME_CASE(VFCMADDC_RND)
35233 NODE_NAME_CASE(VFMULC)
35234 NODE_NAME_CASE(VFMULC_RND)
35235 NODE_NAME_CASE(VFCMULC)
35236 NODE_NAME_CASE(VFCMULC_RND)
35237 NODE_NAME_CASE(VFMULCSH)
35238 NODE_NAME_CASE(VFMULCSH_RND)
35239 NODE_NAME_CASE(VFCMULCSH)
35240 NODE_NAME_CASE(VFCMULCSH_RND)
35241 NODE_NAME_CASE(VFMADDCSH)
35242 NODE_NAME_CASE(VFMADDCSH_RND)
35243 NODE_NAME_CASE(VFCMADDCSH)
35244 NODE_NAME_CASE(VFCMADDCSH_RND)
35245 NODE_NAME_CASE(VPMADD52H)
35246 NODE_NAME_CASE(VPMADD52L)
35247 NODE_NAME_CASE(VRNDSCALE)
35248 NODE_NAME_CASE(STRICT_VRNDSCALE)
35249 NODE_NAME_CASE(VRNDSCALE_SAE)
35250 NODE_NAME_CASE(VRNDSCALES)
35251 NODE_NAME_CASE(VRNDSCALES_SAE)
35252 NODE_NAME_CASE(VREDUCE)
35253 NODE_NAME_CASE(VREDUCE_SAE)
35254 NODE_NAME_CASE(VREDUCES)
35255 NODE_NAME_CASE(VREDUCES_SAE)
35256 NODE_NAME_CASE(VGETMANT)
35257 NODE_NAME_CASE(VGETMANT_SAE)
35258 NODE_NAME_CASE(VGETMANTS)
35259 NODE_NAME_CASE(VGETMANTS_SAE)
35260 NODE_NAME_CASE(PCMPESTR)
35261 NODE_NAME_CASE(PCMPISTR)
35262 NODE_NAME_CASE(XTEST)
35263 NODE_NAME_CASE(COMPRESS)
35264 NODE_NAME_CASE(EXPAND)
35265 NODE_NAME_CASE(SELECTS)
35266 NODE_NAME_CASE(ADDSUB)
35267 NODE_NAME_CASE(RCP14)
35268 NODE_NAME_CASE(RCP14S)
35269 NODE_NAME_CASE(RSQRT14)
35270 NODE_NAME_CASE(RSQRT14S)
35271 NODE_NAME_CASE(FADD_RND)
35272 NODE_NAME_CASE(FADDS)
35273 NODE_NAME_CASE(FADDS_RND)
35274 NODE_NAME_CASE(FSUB_RND)
35275 NODE_NAME_CASE(FSUBS)
35276 NODE_NAME_CASE(FSUBS_RND)
35277 NODE_NAME_CASE(FMUL_RND)
35278 NODE_NAME_CASE(FMULS)
35279 NODE_NAME_CASE(FMULS_RND)
35280 NODE_NAME_CASE(FDIV_RND)
35281 NODE_NAME_CASE(FDIVS)
35282 NODE_NAME_CASE(FDIVS_RND)
35283 NODE_NAME_CASE(FSQRT_RND)
35284 NODE_NAME_CASE(FSQRTS)
35285 NODE_NAME_CASE(FSQRTS_RND)
35286 NODE_NAME_CASE(FGETEXP)
35287 NODE_NAME_CASE(FGETEXP_SAE)
35288 NODE_NAME_CASE(FGETEXPS)
35289 NODE_NAME_CASE(FGETEXPS_SAE)
35290 NODE_NAME_CASE(SCALEF)
35291 NODE_NAME_CASE(SCALEF_RND)
35292 NODE_NAME_CASE(SCALEFS)
35293 NODE_NAME_CASE(SCALEFS_RND)
35294 NODE_NAME_CASE(MULHRS)
35295 NODE_NAME_CASE(SINT_TO_FP_RND)
35296 NODE_NAME_CASE(UINT_TO_FP_RND)
35297 NODE_NAME_CASE(CVTTP2SI)
35298 NODE_NAME_CASE(CVTTP2UI)
35299 NODE_NAME_CASE(STRICT_CVTTP2SI)
35300 NODE_NAME_CASE(STRICT_CVTTP2UI)
35301 NODE_NAME_CASE(MCVTTP2SI)
35302 NODE_NAME_CASE(MCVTTP2UI)
35303 NODE_NAME_CASE(CVTTP2SI_SAE)
35304 NODE_NAME_CASE(CVTTP2UI_SAE)
35305 NODE_NAME_CASE(CVTTS2SI)
35306 NODE_NAME_CASE(CVTTS2UI)
35307 NODE_NAME_CASE(CVTTS2SI_SAE)
35308 NODE_NAME_CASE(CVTTS2UI_SAE)
35309 NODE_NAME_CASE(CVTSI2P)
35310 NODE_NAME_CASE(CVTUI2P)
35311 NODE_NAME_CASE(STRICT_CVTSI2P)
35312 NODE_NAME_CASE(STRICT_CVTUI2P)
35313 NODE_NAME_CASE(MCVTSI2P)
35314 NODE_NAME_CASE(MCVTUI2P)
35315 NODE_NAME_CASE(VFPCLASS)
35316 NODE_NAME_CASE(VFPCLASSS)
35317 NODE_NAME_CASE(MULTISHIFT)
35318 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35319 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35320 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35321 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35322 NODE_NAME_CASE(CVTPS2PH)
35323 NODE_NAME_CASE(STRICT_CVTPS2PH)
35324 NODE_NAME_CASE(CVTPS2PH_SAE)
35325 NODE_NAME_CASE(MCVTPS2PH)
35326 NODE_NAME_CASE(MCVTPS2PH_SAE)
35327 NODE_NAME_CASE(CVTPH2PS)
35328 NODE_NAME_CASE(STRICT_CVTPH2PS)
35329 NODE_NAME_CASE(CVTPH2PS_SAE)
35330 NODE_NAME_CASE(CVTP2SI)
35331 NODE_NAME_CASE(CVTP2UI)
35332 NODE_NAME_CASE(MCVTP2SI)
35333 NODE_NAME_CASE(MCVTP2UI)
35334 NODE_NAME_CASE(CVTP2SI_RND)
35335 NODE_NAME_CASE(CVTP2UI_RND)
35336 NODE_NAME_CASE(CVTS2SI)
35337 NODE_NAME_CASE(CVTS2UI)
35338 NODE_NAME_CASE(CVTS2SI_RND)
35339 NODE_NAME_CASE(CVTS2UI_RND)
35340 NODE_NAME_CASE(CVTNEPS2BF16)
35341 NODE_NAME_CASE(MCVTNEPS2BF16)
35342 NODE_NAME_CASE(DPBF16PS)
35343 NODE_NAME_CASE(DPFP16PS)
35344 NODE_NAME_CASE(MPSADBW)
35345 NODE_NAME_CASE(LWPINS)
35346 NODE_NAME_CASE(MGATHER)
35347 NODE_NAME_CASE(MSCATTER)
35348 NODE_NAME_CASE(VPDPBUSD)
35349 NODE_NAME_CASE(VPDPBUSDS)
35350 NODE_NAME_CASE(VPDPWSSD)
35351 NODE_NAME_CASE(VPDPWSSDS)
35352 NODE_NAME_CASE(VPSHUFBITQMB)
35353 NODE_NAME_CASE(GF2P8MULB)
35354 NODE_NAME_CASE(GF2P8AFFINEQB)
35355 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35356 NODE_NAME_CASE(NT_CALL)
35357 NODE_NAME_CASE(NT_BRIND)
35358 NODE_NAME_CASE(UMWAIT)
35359 NODE_NAME_CASE(TPAUSE)
35360 NODE_NAME_CASE(ENQCMD)
35361 NODE_NAME_CASE(ENQCMDS)
35362 NODE_NAME_CASE(VP2INTERSECT)
35363 NODE_NAME_CASE(VPDPBSUD)
35364 NODE_NAME_CASE(VPDPBSUDS)
35365 NODE_NAME_CASE(VPDPBUUD)
35366 NODE_NAME_CASE(VPDPBUUDS)
35367 NODE_NAME_CASE(VPDPBSSD)
35368 NODE_NAME_CASE(VPDPBSSDS)
35369 NODE_NAME_CASE(VPDPWSUD)
35370 NODE_NAME_CASE(VPDPWSUDS)
35371 NODE_NAME_CASE(VPDPWUSD)
35372 NODE_NAME_CASE(VPDPWUSDS)
35373 NODE_NAME_CASE(VPDPWUUD)
35374 NODE_NAME_CASE(VPDPWUUDS)
35375 NODE_NAME_CASE(VMINMAX)
35376 NODE_NAME_CASE(VMINMAX_SAE)
35377 NODE_NAME_CASE(VMINMAXS)
35378 NODE_NAME_CASE(VMINMAXS_SAE)
35379 NODE_NAME_CASE(CVTP2IBS)
35380 NODE_NAME_CASE(CVTP2IUBS)
35381 NODE_NAME_CASE(CVTP2IBS_RND)
35382 NODE_NAME_CASE(CVTP2IUBS_RND)
35383 NODE_NAME_CASE(CVTTP2IBS)
35384 NODE_NAME_CASE(CVTTP2IUBS)
35385 NODE_NAME_CASE(CVTTP2IBS_SAE)
35386 NODE_NAME_CASE(CVTTP2IUBS_SAE)
35387 NODE_NAME_CASE(VCVT2PH2BF8)
35388 NODE_NAME_CASE(VCVT2PH2BF8S)
35389 NODE_NAME_CASE(VCVT2PH2HF8)
35390 NODE_NAME_CASE(VCVT2PH2HF8S)
35391 NODE_NAME_CASE(VCVTBIASPH2BF8)
35392 NODE_NAME_CASE(VCVTBIASPH2BF8S)
35393 NODE_NAME_CASE(VCVTBIASPH2HF8)
35394 NODE_NAME_CASE(VCVTBIASPH2HF8S)
35395 NODE_NAME_CASE(VCVTPH2BF8)
35396 NODE_NAME_CASE(VCVTPH2BF8S)
35397 NODE_NAME_CASE(VCVTPH2HF8)
35398 NODE_NAME_CASE(VCVTPH2HF8S)
35399 NODE_NAME_CASE(VMCVTBIASPH2BF8)
35400 NODE_NAME_CASE(VMCVTBIASPH2BF8S)
35401 NODE_NAME_CASE(VMCVTBIASPH2HF8)
35402 NODE_NAME_CASE(VMCVTBIASPH2HF8S)
35403 NODE_NAME_CASE(VMCVTPH2BF8)
35404 NODE_NAME_CASE(VMCVTPH2BF8S)
35405 NODE_NAME_CASE(VMCVTPH2HF8)
35406 NODE_NAME_CASE(VMCVTPH2HF8S)
35407 NODE_NAME_CASE(VCVTHF82PH)
35408 NODE_NAME_CASE(AESENC128KL)
35409 NODE_NAME_CASE(AESDEC128KL)
35410 NODE_NAME_CASE(AESENC256KL)
35411 NODE_NAME_CASE(AESDEC256KL)
35412 NODE_NAME_CASE(AESENCWIDE128KL)
35413 NODE_NAME_CASE(AESDECWIDE128KL)
35414 NODE_NAME_CASE(AESENCWIDE256KL)
35415 NODE_NAME_CASE(AESDECWIDE256KL)
35416 NODE_NAME_CASE(CMPCCXADD)
35417 NODE_NAME_CASE(TESTUI)
35418 NODE_NAME_CASE(FP80_ADD)
35419 NODE_NAME_CASE(STRICT_FP80_ADD)
35420 NODE_NAME_CASE(CCMP)
35421 NODE_NAME_CASE(CTEST)
35422 NODE_NAME_CASE(CLOAD)
35423 NODE_NAME_CASE(CSTORE)
35424 NODE_NAME_CASE(CVTTS2SIS)
35425 NODE_NAME_CASE(CVTTS2UIS)
35426 NODE_NAME_CASE(CVTTS2SIS_SAE)
35427 NODE_NAME_CASE(CVTTS2UIS_SAE)
35428 NODE_NAME_CASE(CVTTP2SIS)
35429 NODE_NAME_CASE(MCVTTP2SIS)
35430 NODE_NAME_CASE(CVTTP2UIS_SAE)
35431 NODE_NAME_CASE(CVTTP2SIS_SAE)
35432 NODE_NAME_CASE(CVTTP2UIS)
35433 NODE_NAME_CASE(MCVTTP2UIS)
35434 NODE_NAME_CASE(POP_FROM_X87_REG)
35435 }
35436 return nullptr;
35437#undef NODE_NAME_CASE
35438}
35439
35440/// Return true if the addressing mode represented by AM is legal for this
35441/// target, for a load/store of the specified type.
35442bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35443 const AddrMode &AM, Type *Ty,
35444 unsigned AS,
35445 Instruction *I) const {
35446 // X86 supports extremely general addressing modes.
35447 CodeModel::Model M = getTargetMachine().getCodeModel();
35448
35449 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35450 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35451 return false;
35452
35453 if (AM.BaseGV) {
35454 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35455
35456 // If a reference to this global requires an extra load, we can't fold it.
35457 if (isGlobalStubReference(GVFlags))
35458 return false;
35459
35460 // If BaseGV requires a register for the PIC base, we cannot also have a
35461 // BaseReg specified.
35462 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35463 return false;
35464
35465 // If lower 4G is not available, then we must use rip-relative addressing.
35466 if ((M != CodeModel::Small || isPositionIndependent()) &&
35467 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35468 return false;
35469 }
35470
35471 switch (AM.Scale) {
35472 case 0:
35473 case 1:
35474 case 2:
35475 case 4:
35476 case 8:
35477 // These scales always work.
35478 break;
35479 case 3:
35480 case 5:
35481 case 9:
35482 // These scales are formed with basereg+scalereg. Only accept if there is
35483 // no basereg yet.
35484 if (AM.HasBaseReg)
35485 return false;
35486 break;
35487 default: // Other stuff never works.
35488 return false;
35489 }
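// For reference, the accepted combinations correspond to x86 memory operands
// of the form [base + scale*index + disp32] with scale in {1,2,4,8}; scales
// of 3, 5 and 9 are allowed only when no base register is present, since
// they can then be encoded as index + {2,4,8}*index.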
35490
35491 return true;
35492}
35493
35494bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35495 switch (Opcode) {
35496 // These are non-commutative binops.
35497 // TODO: Add more X86ISD opcodes once we have test coverage.
35498 case X86ISD::ANDNP:
35499 case X86ISD::PCMPGT:
35500 case X86ISD::FMAX:
35501 case X86ISD::FMIN:
35502 case X86ISD::FANDN:
35503 case X86ISD::VPSHA:
35504 case X86ISD::VPSHL:
35505 case X86ISD::VSHLV:
35506 case X86ISD::VSRLV:
35507 case X86ISD::VSRAV:
35508 return true;
35509 }
35510
35511 return TargetLoweringBase::isBinOp(Opcode);
35512}
35513
35514bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35515 switch (Opcode) {
35516 // TODO: Add more X86ISD opcodes once we have test coverage.
35517 case X86ISD::PCMPEQ:
35518 case X86ISD::PMULDQ:
35519 case X86ISD::PMULUDQ:
35520 case X86ISD::FMAXC:
35521 case X86ISD::FMINC:
35522 case X86ISD::FAND:
35523 case X86ISD::FOR:
35524 case X86ISD::FXOR:
35525 return true;
35526 }
35527
35529}
35530
35532 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35533 return false;
35534 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35535 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35536 return NumBits1 > NumBits2;
35537}
35538
35539bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35540 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35541 return false;
35542
35543 if (!isTypeLegal(EVT::getEVT(Ty1)))
35544 return false;
35545
35546 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35547
35548 // Assuming the caller doesn't have a zeroext or signext return parameter,
35549 // truncation all the way down to i1 is valid.
35550 return true;
35551}
35552
35553bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35554 return isInt<32>(Imm);
35555}
35556
35557bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35558 // Can also use sub to handle negated immediates.
35559 return isInt<32>(Imm);
35560}
35561
35562bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35563 return isInt<32>(Imm);
35564}
35565
35566bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35567 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35568 return false;
35569 unsigned NumBits1 = VT1.getSizeInBits();
35570 unsigned NumBits2 = VT2.getSizeInBits();
35571 return NumBits1 > NumBits2;
35572}
35573
35574bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35575 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35576 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35577}
35578
35579bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35580 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35581 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35582}
35583
35584bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35585 EVT VT1 = Val.getValueType();
35586 if (isZExtFree(VT1, VT2))
35587 return true;
35588
35589 if (Val.getOpcode() != ISD::LOAD)
35590 return false;
35591
35592 if (!VT1.isSimple() || !VT1.isInteger() ||
35593 !VT2.isSimple() || !VT2.isInteger())
35594 return false;
35595
35596 switch (VT1.getSimpleVT().SimpleTy) {
35597 default: break;
35598 case MVT::i8:
35599 case MVT::i16:
35600 case MVT::i32:
35601 // X86 has 8, 16, and 32-bit zero-extending loads.
35602 return true;
35603 }
35604
35605 return false;
35606}
35607
35608bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35609 if (!Subtarget.is64Bit())
35610 return false;
35611 return TargetLowering::shouldConvertPhiType(From, To);
35612}
35613
35614bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35615 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35616 return false;
35617
35618 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35619
35620 // There is no extending load for vXi1.
35621 if (SrcVT.getScalarType() == MVT::i1)
35622 return false;
35623
35624 return true;
35625}
35626
35627bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35628 EVT VT) const {
35629 if (Subtarget.useSoftFloat())
35630 return false;
35631
35632 if (!Subtarget.hasAnyFMA())
35633 return false;
35634
35635 VT = VT.getScalarType();
35636
35637 if (!VT.isSimple())
35638 return false;
35639
35640 switch (VT.getSimpleVT().SimpleTy) {
35641 case MVT::f16:
35642 return Subtarget.hasFP16();
35643 case MVT::f32:
35644 case MVT::f64:
35645 return true;
35646 default:
35647 break;
35648 }
35649
35650 return false;
35651}
35652
35653bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
35654 EVT DestVT) const {
35655 // i16 instructions are longer (0x66 prefix) and potentially slower.
35656 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35657}
35658
35659bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(
35660 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
35661 SDValue Y) const {
35662 if (SelectOpcode == ISD::SELECT) {
35663 if (VT.isVector())
35664 return false;
35665 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
35666 return false;
35667 using namespace llvm::SDPatternMatch;
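// For reference, the BMI patterns matched below correspond to:
// BLSI: x & -x (isolate lowest set bit)
// BLSR: x & (x - 1) (clear lowest set bit)
// BLSMSK: x ^ (x - 1) (mask up to lowest set bit)
// where the x - 1 operand appears in the DAG as an add of x and all-ones.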
35668 // BLSI
35669 if (BinOpcode == ISD::AND && (sd_match(Y, m_Neg(m_Specific(X))) ||
35670 sd_match(X, m_Neg(m_Specific(Y)))))
35671 return true;
35672 // BLSR
35673 if (BinOpcode == ISD::AND &&
35674 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35675 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35676 return true;
35677 // BLSMSK
35678 if (BinOpcode == ISD::XOR &&
35679 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35680 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35681 return true;
35682
35683 return false;
35684 }
35685 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35686 // benefit. The transform may also be profitable for scalar code.
35687 if (!Subtarget.hasAVX512())
35688 return false;
35689 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35690 return false;
35691 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35692 return false;
35693
35694 return true;
35695}
35696
35697/// Targets can use this to indicate that they only support *some*
35698/// VECTOR_SHUFFLE operations, those with specific masks.
35699/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35700/// are assumed to be legal.
35701bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35702 if (!VT.isSimple())
35703 return false;
35704
35705 // Not for i1 vectors
35706 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35707 return false;
35708
35709 // Very little shuffling can be done for 64-bit vectors right now.
35710 if (VT.getSimpleVT().getSizeInBits() == 64)
35711 return false;
35712
35713 // We only care that the types being shuffled are legal. The lowering can
35714 // handle any possible shuffle mask that results.
35715 return isTypeLegal(VT.getSimpleVT());
35716}
35717
35718bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35719 EVT VT) const {
35720 // Don't convert an 'and' into a shuffle that we don't directly support.
35721 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35722 if (!Subtarget.hasAVX2())
35723 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35724 return false;
35725
35726 // Just delegate to the generic legality, clear masks aren't special.
35727 return isShuffleMaskLegal(Mask, VT);
35728}
35729
35730bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35731 // If the subtarget is using thunks, we need to not generate jump tables.
35732 if (Subtarget.useIndirectThunkBranches())
35733 return false;
35734
35735 // Otherwise, fallback on the generic logic.
35736 return TargetLowering::areJTsAllowed(Fn);
35737}
35738
35739MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35740 EVT ConditionVT) const {
35741 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35742 // zero-extensions.
35743 if (ConditionVT.getSizeInBits() < 32)
35744 return MVT::i32;
35745 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35746 ConditionVT);
35747}
35748
35749//===----------------------------------------------------------------------===//
35750// X86 Scheduler Hooks
35751//===----------------------------------------------------------------------===//
35752
35753/// Utility function to emit xbegin specifying the start of an RTM region.
35754static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35755 const TargetInstrInfo *TII) {
35756 const MIMetadata MIMD(MI);
35757
35758 const BasicBlock *BB = MBB->getBasicBlock();
35759 MachineFunction::iterator I = ++MBB->getIterator();
35760
35761 // For the v = xbegin(), we generate
35762 //
35763 // thisMBB:
35764 // xbegin sinkMBB
35765 //
35766 // mainMBB:
35767 // s0 = -1
35768 //
35769 // fallBB:
35770 // eax = # XABORT_DEF
35771 // s1 = eax
35772 //
35773 // sinkMBB:
35774 // v = phi(s0/mainBB, s1/fallBB)
35775
35776 MachineBasicBlock *thisMBB = MBB;
35777 MachineFunction *MF = MBB->getParent();
35778 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35779 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35780 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35781 MF->insert(I, mainMBB);
35782 MF->insert(I, fallMBB);
35783 MF->insert(I, sinkMBB);
35784
35785 if (isPhysRegUsedAfter(X86::EFLAGS, MI)) {
35786 mainMBB->addLiveIn(X86::EFLAGS);
35787 fallMBB->addLiveIn(X86::EFLAGS);
35788 sinkMBB->addLiveIn(X86::EFLAGS);
35789 }
35790
35791 // Transfer the remainder of BB and its successor edges to sinkMBB.
35792 sinkMBB->splice(sinkMBB->begin(), MBB,
35793 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35794 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35795
35796 MachineRegisterInfo &MRI = MF->getRegInfo();
35797 Register DstReg = MI.getOperand(0).getReg();
35798 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35799 Register mainDstReg = MRI.createVirtualRegister(RC);
35800 Register fallDstReg = MRI.createVirtualRegister(RC);
35801
35802 // thisMBB:
35803 // xbegin fallMBB
35804 // # fallthrough to mainMBB
35805 // # abort to fallMBB
35806 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35807 thisMBB->addSuccessor(mainMBB);
35808 thisMBB->addSuccessor(fallMBB);
35809
35810 // mainMBB:
35811 // mainDstReg := -1
35812 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35813 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35814 mainMBB->addSuccessor(sinkMBB);
35815
35816 // fallMBB:
35817 // ; pseudo instruction to model hardware's definition from XABORT
35818 // EAX := XABORT_DEF
35819 // fallDstReg := EAX
35820 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
35821 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
35822 .addReg(X86::EAX);
35823 fallMBB->addSuccessor(sinkMBB);
35824
35825 // sinkMBB:
35826 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35827 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35828 .addReg(mainDstReg).addMBB(mainMBB)
35829 .addReg(fallDstReg).addMBB(fallMBB);
35830
35831 MI.eraseFromParent();
35832 return sinkMBB;
35833}
35834
35835MachineBasicBlock *
35836X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35837 MachineBasicBlock *MBB) const {
35838 // Emit va_arg instruction on X86-64.
35839
35840 // Operands to this pseudo-instruction:
35841 // 0 ) Output : destination address (reg)
35842 // 1-5) Input : va_list address (addr, i64mem)
35843 // 6 ) ArgSize : Size (in bytes) of vararg type
35844 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35845 // 8 ) Align : Alignment of type
35846 // 9 ) EFLAGS (implicit-def)
35847
35848 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35849 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35850
35851 Register DestReg = MI.getOperand(0).getReg();
35852 MachineOperand &Base = MI.getOperand(1);
35853 MachineOperand &Scale = MI.getOperand(2);
35854 MachineOperand &Index = MI.getOperand(3);
35855 MachineOperand &Disp = MI.getOperand(4);
35856 MachineOperand &Segment = MI.getOperand(5);
35857 unsigned ArgSize = MI.getOperand(6).getImm();
35858 unsigned ArgMode = MI.getOperand(7).getImm();
35859 Align Alignment = Align(MI.getOperand(8).getImm());
35860
35861 MachineFunction *MF = MBB->getParent();
35862
35863 // Memory Reference
35864 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35865
35866 MachineMemOperand *OldMMO = MI.memoperands().front();
35867
35868 // Clone the MMO into two separate MMOs for loading and storing
35869 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35870 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35871 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35872 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35873
35874 // Machine Information
35875 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35876 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35877 const TargetRegisterClass *AddrRegClass =
35878 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35879 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35880 const MIMetadata MIMD(MI);
35881
35882 // struct va_list {
35883 // i32 gp_offset
35884 // i32 fp_offset
35885 // i64 overflow_area (address)
35886 // i64 reg_save_area (address)
35887 // }
35888 // sizeof(va_list) = 24
35889 // alignment(va_list) = 8
35890
35891 unsigned TotalNumIntRegs = 6;
35892 unsigned TotalNumXMMRegs = 8;
35893 bool UseGPOffset = (ArgMode == 1);
35894 bool UseFPOffset = (ArgMode == 2);
35895 unsigned MaxOffset = TotalNumIntRegs * 8 +
35896 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
35897
35898 /* Align ArgSize to a multiple of 8 */
35899 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
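// Worked numbers for the bounds above: the gp_offset limit is 6*8 = 48
// bytes, and with the XMM save area included the fp_offset limit is
// 48 + 8*16 = 176 bytes; an ArgSize of 12 rounds up to ArgSizeA8 = 16 via
// (ArgSize + 7) & ~7.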
35900 bool NeedsAlign = (Alignment > 8);
35901
35902 MachineBasicBlock *thisMBB = MBB;
35903 MachineBasicBlock *overflowMBB;
35904 MachineBasicBlock *offsetMBB;
35905 MachineBasicBlock *endMBB;
35906
35907 Register OffsetDestReg; // Argument address computed by offsetMBB
35908 Register OverflowDestReg; // Argument address computed by overflowMBB
35909 Register OffsetReg;
35910
35911 if (!UseGPOffset && !UseFPOffset) {
35912 // If we only pull from the overflow region, we don't create a branch.
35913 // We don't need to alter control flow.
35914 OffsetDestReg = Register(); // unused
35915 OverflowDestReg = DestReg;
35916
35917 offsetMBB = nullptr;
35918 overflowMBB = thisMBB;
35919 endMBB = thisMBB;
35920 } else {
35921 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35922 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35923 // If not, pull from overflow_area. (branch to overflowMBB)
35924 //
35925 // thisMBB
35926 // | .
35927 // | .
35928 // offsetMBB overflowMBB
35929 // | .
35930 // | .
35931 // endMBB
35932
35933 // Registers for the PHI in endMBB
35934 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35935 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35936
35937 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35938 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35939 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35940 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35941
35942 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35943
35944 // Insert the new basic blocks
35945 MF->insert(MBBIter, offsetMBB);
35946 MF->insert(MBBIter, overflowMBB);
35947 MF->insert(MBBIter, endMBB);
35948
35949 // Transfer the remainder of MBB and its successor edges to endMBB.
35950 endMBB->splice(endMBB->begin(), thisMBB,
35951 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35952 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35953
35954 // Make offsetMBB and overflowMBB successors of thisMBB
35955 thisMBB->addSuccessor(offsetMBB);
35956 thisMBB->addSuccessor(overflowMBB);
35957
35958 // endMBB is a successor of both offsetMBB and overflowMBB
35959 offsetMBB->addSuccessor(endMBB);
35960 overflowMBB->addSuccessor(endMBB);
35961
35962 // Load the offset value into a register
35963 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35964 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
35965 .add(Base)
35966 .add(Scale)
35967 .add(Index)
35968 .addDisp(Disp, UseFPOffset ? 4 : 0)
35969 .add(Segment)
35970 .setMemRefs(LoadOnlyMMO);
35971
35972 // Check if there is enough room left to pull this argument.
35973 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
35974 .addReg(OffsetReg)
35975 .addImm(MaxOffset + 8 - ArgSizeA8);
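// E.g. for a GP argument (MaxOffset = 48, ArgSizeA8 = 8) this compares
// gp_offset against 48, so the overflow path is taken once all six GP
// register slots have been consumed.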
35976
35977 // Branch to "overflowMBB" if offset >= max
35978 // Fall through to "offsetMBB" otherwise
35979 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
35980 .addMBB(overflowMBB).addImm(X86::COND_AE);
35981 }
35982
35983 // In offsetMBB, emit code to use the reg_save_area.
35984 if (offsetMBB) {
35985 assert(OffsetReg != 0);
35986
35987 // Read the reg_save_area address.
35988 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
35989 BuildMI(
35990 offsetMBB, MIMD,
35991 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35992 RegSaveReg)
35993 .add(Base)
35994 .add(Scale)
35995 .add(Index)
35996 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
35997 .add(Segment)
35998 .setMemRefs(LoadOnlyMMO);
35999
36000 if (Subtarget.isTarget64BitLP64()) {
36001 // Zero-extend the offset
36002 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
36003 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
36004 .addImm(0)
36005 .addReg(OffsetReg)
36006 .addImm(X86::sub_32bit);
36007
36008 // Add the offset to the reg_save_area to get the final address.
36009 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
36010 .addReg(OffsetReg64)
36011 .addReg(RegSaveReg);
36012 } else {
36013 // Add the offset to the reg_save_area to get the final address.
36014 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
36015 .addReg(OffsetReg)
36016 .addReg(RegSaveReg);
36017 }
36018
36019 // Compute the offset for the next argument
36020 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36021 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
36022 .addReg(OffsetReg)
36023 .addImm(UseFPOffset ? 16 : 8);
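// Each slot in the reg_save_area is 8 bytes for GP registers and 16 bytes for
// XMM registers, so the saved offset advances by that amount.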
36024
36025 // Store it back into the va_list.
36026 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
36027 .add(Base)
36028 .add(Scale)
36029 .add(Index)
36030 .addDisp(Disp, UseFPOffset ? 4 : 0)
36031 .add(Segment)
36032 .addReg(NextOffsetReg)
36033 .setMemRefs(StoreOnlyMMO);
36034
36035 // Jump to endMBB
36036 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
36037 .addMBB(endMBB);
36038 }
36039
36040 //
36041 // Emit code to use overflow area
36042 //
36043
36044 // Load the overflow_area address into a register.
36045 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36046 BuildMI(overflowMBB, MIMD,
36047 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36048 OverflowAddrReg)
36049 .add(Base)
36050 .add(Scale)
36051 .add(Index)
36052 .addDisp(Disp, 8)
36053 .add(Segment)
36054 .setMemRefs(LoadOnlyMMO);
36055
36056 // If we need to align it, do so. Otherwise, just copy the address
36057 // to OverflowDestReg.
36058 if (NeedsAlign) {
36059 // Align the overflow address
36060 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36061
36062 // aligned_addr = (addr + (align-1)) & ~(align-1)
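// e.g. addr = 0x1004 with align = 16: (0x1004 + 15) & ~15 = 0x1010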
36063 BuildMI(
36064 overflowMBB, MIMD,
36065 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36066 TmpReg)
36067 .addReg(OverflowAddrReg)
36068 .addImm(Alignment.value() - 1);
36069
36070 BuildMI(
36071 overflowMBB, MIMD,
36072 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36073 OverflowDestReg)
36074 .addReg(TmpReg)
36075 .addImm(~(uint64_t)(Alignment.value() - 1));
36076 } else {
36077 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
36078 .addReg(OverflowAddrReg);
36079 }
36080
36081 // Compute the next overflow address after this argument.
36082 // (the overflow address should be kept 8-byte aligned)
36083 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36084 BuildMI(
36085 overflowMBB, MIMD,
36086 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36087 NextAddrReg)
36088 .addReg(OverflowDestReg)
36089 .addImm(ArgSizeA8);
36090
36091 // Store the new overflow address.
36092 BuildMI(overflowMBB, MIMD,
36093 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36094 .add(Base)
36095 .add(Scale)
36096 .add(Index)
36097 .addDisp(Disp, 8)
36098 .add(Segment)
36099 .addReg(NextAddrReg)
36100 .setMemRefs(StoreOnlyMMO);
36101
36102 // If we branched, emit the PHI to the front of endMBB.
36103 if (offsetMBB) {
36104 BuildMI(*endMBB, endMBB->begin(), MIMD,
36105 TII->get(X86::PHI), DestReg)
36106 .addReg(OffsetDestReg).addMBB(offsetMBB)
36107 .addReg(OverflowDestReg).addMBB(overflowMBB);
36108 }
36109
36110 // Erase the pseudo instruction
36111 MI.eraseFromParent();
36112
36113 return endMBB;
36114}
36115
36116// The EFLAGS operand of SelectItr might be missing a kill marker
36117// because there were multiple uses of EFLAGS, and ISel didn't know
36118// which to mark. Figure out whether SelectItr should have had a
36119// kill marker, and set it if it should. Returns the correct kill
36120// marker value.
36121static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36122 MachineBasicBlock* BB,
36123 const TargetRegisterInfo* TRI) {
36124 if (isPhysRegUsedAfter(X86::EFLAGS, SelectItr))
36125 return false;
36126
36127 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36128 // out. SelectMI should have a kill flag on EFLAGS.
36129 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36130 return true;
36131}
36132
36133// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36134// together with other CMOV pseudo-opcodes into a single basic-block with
36135// conditional jump around it.
36136static bool isCMOVPseudo(MachineInstr &MI) {
36137 switch (MI.getOpcode()) {
36138 case X86::CMOV_FR16:
36139 case X86::CMOV_FR16X:
36140 case X86::CMOV_FR32:
36141 case X86::CMOV_FR32X:
36142 case X86::CMOV_FR64:
36143 case X86::CMOV_FR64X:
36144 case X86::CMOV_GR8:
36145 case X86::CMOV_GR16:
36146 case X86::CMOV_GR32:
36147 case X86::CMOV_RFP32:
36148 case X86::CMOV_RFP64:
36149 case X86::CMOV_RFP80:
36150 case X86::CMOV_VR64:
36151 case X86::CMOV_VR128:
36152 case X86::CMOV_VR128X:
36153 case X86::CMOV_VR256:
36154 case X86::CMOV_VR256X:
36155 case X86::CMOV_VR512:
36156 case X86::CMOV_VK1:
36157 case X86::CMOV_VK2:
36158 case X86::CMOV_VK4:
36159 case X86::CMOV_VK8:
36160 case X86::CMOV_VK16:
36161 case X86::CMOV_VK32:
36162 case X86::CMOV_VK64:
36163 return true;
36164
36165 default:
36166 return false;
36167 }
36168}
36169
36170// Helper function, which inserts PHI functions into SinkMBB:
36171// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36172// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
36173// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
36174// the last PHI function inserted.
36175static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36176 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36177 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36178 MachineBasicBlock *SinkMBB) {
36179 MachineFunction *MF = TrueMBB->getParent();
36180 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36181 const MIMetadata MIMD(*MIItBegin);
36182
36183 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36184 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36185
36186 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36187
36188 // As we are creating the PHIs, we have to be careful if there is more than
36189 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36190 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36191 // That also means that PHI construction must work forward from earlier to
36192 // later, and that the code must maintain a mapping from earlier PHI's
36193 // destination registers, and the registers that went into the PHI.
36194 DenseMap<Register, std::pair<Register, Register>> RegRewriteTable;
36195 MachineInstrBuilder MIB;
36196
36197 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36198 Register DestReg = MIIt->getOperand(0).getReg();
36199 Register Op1Reg = MIIt->getOperand(1).getReg();
36200 Register Op2Reg = MIIt->getOperand(2).getReg();
36201
36202 // If this CMOV we are generating is the opposite condition from
36203 // the jump we generated, then we have to swap the operands for the
36204 // PHI that is going to be generated.
36205 if (MIIt->getOperand(3).getImm() == OppCC)
36206 std::swap(Op1Reg, Op2Reg);
36207
36208 if (auto It = RegRewriteTable.find(Op1Reg); It != RegRewriteTable.end())
36209 Op1Reg = It->second.first;
36210
36211 if (auto It = RegRewriteTable.find(Op2Reg); It != RegRewriteTable.end())
36212 Op2Reg = It->second.second;
36213
36214 MIB =
36215 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
36216 .addReg(Op1Reg)
36217 .addMBB(FalseMBB)
36218 .addReg(Op2Reg)
36219 .addMBB(TrueMBB);
36220
36221 // Add this PHI to the rewrite table.
36222 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36223 }
36224
36225 return MIB;
36226}
36227
36228// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
36229MachineBasicBlock *
36230X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36231 MachineInstr &SecondCascadedCMOV,
36232 MachineBasicBlock *ThisMBB) const {
36233 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36234 const MIMetadata MIMD(FirstCMOV);
36235
36236 // We lower cascaded CMOVs such as
36237 //
36238 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36239 //
36240 // to two successive branches.
36241 //
36242 // Without this, we would add a PHI between the two jumps, which ends up
36243 // creating a few copies all around. For instance, for
36244 //
36245 // (sitofp (zext (fcmp une)))
36246 //
36247 // we would generate:
36248 //
36249 // ucomiss %xmm1, %xmm0
36250 // movss <1.0f>, %xmm0
36251 // movaps %xmm0, %xmm1
36252 // jne .LBB5_2
36253 // xorps %xmm1, %xmm1
36254 // .LBB5_2:
36255 // jp .LBB5_4
36256 // movaps %xmm1, %xmm0
36257 // .LBB5_4:
36258 // retq
36259 //
36260 // because this custom-inserter would have generated:
36261 //
36262 // A
36263 // | \
36264 // | B
36265 // | /
36266 // C
36267 // | \
36268 // | D
36269 // | /
36270 // E
36271 //
36272 // A: X = ...; Y = ...
36273 // B: empty
36274 // C: Z = PHI [X, A], [Y, B]
36275 // D: empty
36276 // E: PHI [X, C], [Z, D]
36277 //
36278 // If we lower both CMOVs in a single step, we can instead generate:
36279 //
36280 // A
36281 // | \
36282 // | C
36283 // | /|
36284 // |/ |
36285 // | |
36286 // | D
36287 // | /
36288 // E
36289 //
36290 // A: X = ...; Y = ...
36291 // D: empty
36292 // E: PHI [X, A], [X, C], [Y, D]
36293 //
36294 // Which, in our sitofp/fcmp example, gives us something like:
36295 //
36296 // ucomiss %xmm1, %xmm0
36297 // movss <1.0f>, %xmm0
36298 // jne .LBB5_4
36299 // jp .LBB5_4
36300 // xorps %xmm0, %xmm0
36301 // .LBB5_4:
36302 // retq
36303 //
36304
36305 // We lower cascaded CMOV into two successive branches to the same block.
36306 // EFLAGS is used by both, so mark it as live in the second.
36307 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36308 MachineFunction *F = ThisMBB->getParent();
36309 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36310 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36311 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36312
36313 MachineFunction::iterator It = ++ThisMBB->getIterator();
36314 F->insert(It, FirstInsertedMBB);
36315 F->insert(It, SecondInsertedMBB);
36316 F->insert(It, SinkMBB);
36317
36318 // For a cascaded CMOV, we lower it to two successive branches to
36319 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36320 // the FirstInsertedMBB.
36321 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36322
36323 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36324 // live into the sink and copy blocks.
36325 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36326 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36327 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36328 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36329 SinkMBB->addLiveIn(X86::EFLAGS);
36330 }
36331
36332 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36333 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36334 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36335 ThisMBB->end());
36336 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36337
36338 // Fallthrough block for ThisMBB.
36339 ThisMBB->addSuccessor(FirstInsertedMBB);
36340 // The true block target of the first branch is always SinkMBB.
36341 ThisMBB->addSuccessor(SinkMBB);
36342 // Fallthrough block for FirstInsertedMBB.
36343 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36344 // The true block for the branch of FirstInsertedMBB.
36345 FirstInsertedMBB->addSuccessor(SinkMBB);
36346 // This is fallthrough.
36347 SecondInsertedMBB->addSuccessor(SinkMBB);
36348
36349 // Create the conditional branch instructions.
36350 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36351 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36352
36353 X86::CondCode SecondCC =
36354 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36355 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
36356 .addMBB(SinkMBB)
36357 .addImm(SecondCC);
36358
36359 // SinkMBB:
36360 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36361 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36362 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36363 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36364 MachineInstrBuilder MIB =
36365 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
36366 .addReg(Op1Reg)
36367 .addMBB(SecondInsertedMBB)
36368 .addReg(Op2Reg)
36369 .addMBB(ThisMBB);
36370
36371 // The FirstInsertedMBB edge provides the same incoming value as the
36372 // ThisMBB edge (the True operand of the SELECT_CC/CMOV nodes).
36373 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36374
36375 // Now remove the CMOVs.
36376 FirstCMOV.eraseFromParent();
36377 SecondCascadedCMOV.eraseFromParent();
36378
36379 return SinkMBB;
36380}
36381
36382MachineBasicBlock *
36383X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36384 MachineBasicBlock *ThisMBB) const {
36385 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36386 const MIMetadata MIMD(MI);
36387
36388 // To "insert" a SELECT_CC instruction, we actually have to insert the
36389 // diamond control-flow pattern. The incoming instruction knows the
36390 // destination vreg to set, the condition code register to branch on, the
36391 // true/false values to select between and a branch opcode to use.
36392
36393 // ThisMBB:
36394 // ...
36395 // TrueVal = ...
36396 // cmpTY ccX, r1, r2
36397 // bCC copy1MBB
36398 // fallthrough --> FalseMBB
36399
36400 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36401 // as described above, by inserting a BB, and then making a PHI at the join
36402 // point to select the true and false operands of the CMOV in the PHI.
36403 //
36404 // The code also handles two different cases of multiple CMOV opcodes
36405 // in a row.
36406 //
36407 // Case 1:
36408 // In this case, there are multiple CMOVs in a row, all of which are based on
36409 // the same condition setting (or the exact opposite condition setting).
36410 // In this case we can lower all the CMOVs using a single inserted BB, and
36411 // then make a number of PHIs at the join point to model the CMOVs. The only
36412 // trickiness here is that in a case like:
36413 //
36414 // t2 = CMOV cond1 t1, f1
36415 // t3 = CMOV cond1 t2, f2
36416 //
36417 // when rewriting this into PHIs, we have to perform some renaming on the
36418 // temps since you cannot have a PHI operand refer to a PHI result earlier
36419 // in the same block. The "simple" but wrong lowering would be:
36420 //
36421 // t2 = PHI t1(BB1), f1(BB2)
36422 // t3 = PHI t2(BB1), f2(BB2)
36423 //
36424 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36425 // renaming is to note that on the path through BB1, t2 is really just a
36426 // copy of t1, and do that renaming, properly generating:
36427 //
36428 // t2 = PHI t1(BB1), f1(BB2)
36429 // t3 = PHI t1(BB1), f2(BB2)
36430 //
36431 // Case 2:
36432 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36433 // function - EmitLoweredCascadedSelect.
36434
36435 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36436 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36437 MachineInstr *LastCMOV = &MI;
36438 MachineBasicBlock::iterator NextMIIt = next_nodbg(MachineBasicBlock::iterator(MI), ThisMBB->end());
36439
36440 // Check for case 1, where there are multiple CMOVs with the same condition
36441 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36442 // number of jumps the most.
36443
36444 if (isCMOVPseudo(MI)) {
36445 // See if we have a string of CMOVS with the same condition. Skip over
36446 // intervening debug insts.
36447 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36448 (NextMIIt->getOperand(3).getImm() == CC ||
36449 NextMIIt->getOperand(3).getImm() == OppCC)) {
36450 LastCMOV = &*NextMIIt;
36451 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36452 }
36453 }
36454
36455 // This checks for case 2, but only do this if we didn't already find
36456 // case 1, as indicated by LastCMOV == MI.
36457 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36458 NextMIIt->getOpcode() == MI.getOpcode() &&
36459 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36460 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36461 NextMIIt->getOperand(1).isKill()) {
36462 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36463 }
36464
36465 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36466 MachineFunction *F = ThisMBB->getParent();
36467 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36468 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36469
36470 MachineFunction::iterator It = ++ThisMBB->getIterator();
36471 F->insert(It, FalseMBB);
36472 F->insert(It, SinkMBB);
36473
36474 // Set the call frame size on entry to the new basic blocks.
36475 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
36476 FalseMBB->setCallFrameSize(CallFrameSize);
36477 SinkMBB->setCallFrameSize(CallFrameSize);
36478
36479 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36480 // live into the sink and copy blocks.
36481 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36482 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36483 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36484 FalseMBB->addLiveIn(X86::EFLAGS);
36485 SinkMBB->addLiveIn(X86::EFLAGS);
36486 }
36487
36488 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36489 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36490 MachineBasicBlock::iterator(LastCMOV));
36491 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36492 if (MI.isDebugInstr())
36493 SinkMBB->push_back(MI.removeFromParent());
36494
36495 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36496 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36497 std::next(MachineBasicBlock::iterator(LastCMOV)),
36498 ThisMBB->end());
36499 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36500
36501 // Fallthrough block for ThisMBB.
36502 ThisMBB->addSuccessor(FalseMBB);
36503 // The true block target of the first (or only) branch is always a SinkMBB.
36504 ThisMBB->addSuccessor(SinkMBB);
36505 // Fallthrough block for FalseMBB.
36506 FalseMBB->addSuccessor(SinkMBB);
36507
36508 // Create the conditional branch instruction.
36509 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36510
36511 // SinkMBB:
36512 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36513 // ...
36514 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36515 MachineBasicBlock::iterator MIItEnd =
36516 std::next(MachineBasicBlock::iterator(LastCMOV));
36517 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36518
36519 // Now remove the CMOV(s).
36520 ThisMBB->erase(MIItBegin, MIItEnd);
36521
36522 return SinkMBB;
36523}
36524
36525static unsigned getSUBriOpcode(bool IsLP64) {
36526 if (IsLP64)
36527 return X86::SUB64ri32;
36528 else
36529 return X86::SUB32ri;
36530}
36531
36532MachineBasicBlock *
36533X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36534 MachineBasicBlock *MBB) const {
36535 MachineFunction *MF = MBB->getParent();
36536 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36537 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36538 const MIMetadata MIMD(MI);
36539 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36540
36541 const unsigned ProbeSize = getStackProbeSize(*MF);
36542
36543 MachineRegisterInfo &MRI = MF->getRegInfo();
36544 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36545 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36546 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36547
36548 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36549 MF->insert(MBBIter, testMBB);
36550 MF->insert(MBBIter, blockMBB);
36551 MF->insert(MBBIter, tailMBB);
36552
36553 Register sizeVReg = MI.getOperand(1).getReg();
36554
36555 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36556
36557 Register TmpStackPtr = MRI.createVirtualRegister(
36558 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36559 Register FinalStackPtr = MRI.createVirtualRegister(
36560 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36561
36562 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
36563 .addReg(physSPReg);
36564 {
36565 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36566 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
36567 .addReg(TmpStackPtr)
36568 .addReg(sizeVReg);
36569 }
36570
36571 // test rsp size
36572
36573 BuildMI(testMBB, MIMD,
36574 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36575 .addReg(FinalStackPtr)
36576 .addReg(physSPReg);
36577
36578 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
36579 .addMBB(tailMBB)
36580 .addImm(X86::COND_GE);
36581 testMBB->addSuccessor(blockMBB);
36582 testMBB->addSuccessor(tailMBB);
36583
36584 // Touch the block then extend it. This is done on the opposite side of
36585 // static probe where we allocate then touch, to avoid the need of probing the
36586 // tail of the static alloca. Possible scenarios are:
36587 //
36588 // + ---- <- ------------ <- ------------- <- ------------ +
36589 // | |
36590 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36591 // | |
36592 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36593 //
36594 // The property we want to enforce is to never have more than [page alloc] between two probes.
36595
36596 const unsigned XORMIOpc =
36597 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
36598 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
36599 .addImm(0);
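// The XOR of 0 into (%rsp) writes the value back unchanged; it exists purely
// to touch (probe) the current stack page before RSP is moved further down.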
36600
36601 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
36602 physSPReg)
36603 .addReg(physSPReg)
36604 .addImm(ProbeSize);
36605
36606 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
36607 blockMBB->addSuccessor(testMBB);
36608
36609 // Replace original instruction by the expected stack ptr
36610 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
36611 MI.getOperand(0).getReg())
36612 .addReg(FinalStackPtr);
36613
36614 tailMBB->splice(tailMBB->end(), MBB,
36615 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36616 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36617 MBB->addSuccessor(testMBB);
36618
36619 // Delete the original pseudo instruction.
36620 MI.eraseFromParent();
36621
36622 // And we're done.
36623 return tailMBB;
36624}
36625
36626MachineBasicBlock *
36627X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36628 MachineBasicBlock *BB) const {
36629 MachineFunction *MF = BB->getParent();
36630 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36631 const MIMetadata MIMD(MI);
36632 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36633
36634 assert(MF->shouldSplitStack());
36635
36636 const bool Is64Bit = Subtarget.is64Bit();
36637 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36638
36639 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36640 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
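// These are the TCB offsets where the segmented-stack runtime keeps the
// per-thread stacklet bound: %fs:0x70 for LP64, %fs:0x40 for x32, and
// %gs:0x30 for 32-bit targets (matching TlsReg above).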
36641
36642 // BB:
36643 // ... [Till the alloca]
36644 // If stacklet is not large enough, jump to mallocMBB
36645 //
36646 // bumpMBB:
36647 // Allocate by subtracting from RSP
36648 // Jump to continueMBB
36649 //
36650 // mallocMBB:
36651 // Allocate by call to runtime
36652 //
36653 // continueMBB:
36654 // ...
36655 // [rest of original BB]
36656 //
36657
36658 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36659 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36660 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36661
36662 MachineRegisterInfo &MRI = MF->getRegInfo();
36663 const TargetRegisterClass *AddrRegClass =
36664 getRegClassFor(getPointerTy(MF->getDataLayout()));
36665
36666 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36667 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36668 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36669 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36670 sizeVReg = MI.getOperand(1).getReg(),
36671 physSPReg = IsLP64 ? X86::RSP : X86::ESP;
36672
36673 MachineFunction::iterator MBBIter = ++BB->getIterator();
36674
36675 MF->insert(MBBIter, bumpMBB);
36676 MF->insert(MBBIter, mallocMBB);
36677 MF->insert(MBBIter, continueMBB);
36678
36679 continueMBB->splice(continueMBB->begin(), BB,
36680 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36681 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36682
36683 // Add code to the main basic block to check if the stack limit has been hit,
36684 // and if so, jump to mallocMBB otherwise to bumpMBB.
36685 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36686 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36687 .addReg(tmpSPVReg).addReg(sizeVReg);
36688 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36689 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36690 .addReg(SPLimitVReg);
36691 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
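// COND_G: the limit stored at TlsOffset is greater than the prospective stack
// pointer (tmpSP - size), i.e. the current stacklet cannot hold the
// allocation, so fall back to the runtime call in mallocMBB.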
36692
36693 // bumpMBB simply decreases the stack pointer, since we know the current
36694 // stacklet has enough space.
36695 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
36696 .addReg(SPLimitVReg);
36697 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36698 .addReg(SPLimitVReg);
36699 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36700
36701 // Calls into a routine in libgcc to allocate more space from the heap.
36702 const uint32_t *RegMask =
36703 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36704 if (IsLP64) {
36705 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
36706 .addReg(sizeVReg);
36707 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36708 .addExternalSymbol("__morestack_allocate_stack_space")
36709 .addRegMask(RegMask)
36710 .addReg(X86::RDI, RegState::Implicit)
36711 .addReg(X86::RAX, RegState::ImplicitDefine);
36712 } else if (Is64Bit) {
36713 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
36714 .addReg(sizeVReg);
36715 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36716 .addExternalSymbol("__morestack_allocate_stack_space")
36717 .addRegMask(RegMask)
36718 .addReg(X86::EDI, RegState::Implicit)
36719 .addReg(X86::EAX, RegState::ImplicitDefine);
36720 } else {
36721 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36722 .addImm(12);
36723 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36724 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
36725 .addExternalSymbol("__morestack_allocate_stack_space")
36726 .addRegMask(RegMask)
36727 .addReg(X86::EAX, RegState::ImplicitDefine);
36728 }
36729
36730 if (!Is64Bit)
36731 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36732 .addImm(16);
36733
36734 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36735 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36736 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36737
36738 // Set up the CFG correctly.
36739 BB->addSuccessor(bumpMBB);
36740 BB->addSuccessor(mallocMBB);
36741 mallocMBB->addSuccessor(continueMBB);
36742 bumpMBB->addSuccessor(continueMBB);
36743
36744 // Take care of the PHI nodes.
36745 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
36746 MI.getOperand(0).getReg())
36747 .addReg(mallocPtrVReg)
36748 .addMBB(mallocMBB)
36749 .addReg(bumpSPPtrVReg)
36750 .addMBB(bumpMBB);
36751
36752 // Delete the original pseudo instruction.
36753 MI.eraseFromParent();
36754
36755 // And we're done.
36756 return continueMBB;
36757}
36758
36759MachineBasicBlock *
36760X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36761 MachineBasicBlock *BB) const {
36762 MachineFunction *MF = BB->getParent();
36763 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36764 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36765 const MIMetadata MIMD(MI);
36766
36769 "SEH does not use catchret!");
36770
36771 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36772 if (!Subtarget.is32Bit())
36773 return BB;
36774
36775 // C++ EH creates a new target block to hold the restore code, and wires up
36776 // the new block to the return destination with a normal JMP_4.
36777 MachineBasicBlock *RestoreMBB =
36778 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36779 assert(BB->succ_size() == 1);
36780 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36781 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36782 BB->addSuccessor(RestoreMBB);
36783 MI.getOperand(0).setMBB(RestoreMBB);
36784
36785 // Marking this as an EH pad but not a funclet entry block causes PEI to
36786 // restore stack pointers in the block.
36787 RestoreMBB->setIsEHPad(true);
36788
36789 auto RestoreMBBI = RestoreMBB->begin();
36790 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36791 return BB;
36792}
36793
36794MachineBasicBlock *
36795X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36796 MachineBasicBlock *BB) const {
36797 // This is pretty easy. We're taking the value that we received from
36798 // our load from the relocation, sticking it in either RDI (x86-64)
36799 // or EAX and doing an indirect call. The return value will then
36800 // be in the normal return register.
36801 MachineFunction *F = BB->getParent();
36802 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36803 const MIMetadata MIMD(MI);
36804
36805 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36806 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36807
36808 // Get a register mask for the lowered call.
36809 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36810 // proper register mask.
36811 const uint32_t *RegMask =
36812 Subtarget.is64Bit() ?
36813 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36814 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36815 if (Subtarget.is64Bit()) {
36816 MachineInstrBuilder MIB =
36817 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
36818 .addReg(X86::RIP)
36819 .addImm(0)
36820 .addReg(0)
36821 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36822 MI.getOperand(3).getTargetFlags())
36823 .addReg(0);
36824 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
36825 addDirectMem(MIB, X86::RDI);
36826 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36827 } else if (!isPositionIndependent()) {
36828 MachineInstrBuilder MIB =
36829 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36830 .addReg(0)
36831 .addImm(0)
36832 .addReg(0)
36833 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36834 MI.getOperand(3).getTargetFlags())
36835 .addReg(0);
36836 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36837 addDirectMem(MIB, X86::EAX);
36838 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36839 } else {
36840 MachineInstrBuilder MIB =
36841 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36842 .addReg(TII->getGlobalBaseReg(F))
36843 .addImm(0)
36844 .addReg(0)
36845 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36846 MI.getOperand(3).getTargetFlags())
36847 .addReg(0);
36848 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36849 addDirectMem(MIB, X86::EAX);
36850 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36851 }
36852
36853 MI.eraseFromParent(); // The pseudo instruction is gone now.
36854 return BB;
36855}
36856
36857static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36858 switch (RPOpc) {
36859 case X86::INDIRECT_THUNK_CALL32:
36860 return X86::CALLpcrel32;
36861 case X86::INDIRECT_THUNK_CALL64:
36862 return X86::CALL64pcrel32;
36863 case X86::INDIRECT_THUNK_TCRETURN32:
36864 return X86::TCRETURNdi;
36865 case X86::INDIRECT_THUNK_TCRETURN64:
36866 return X86::TCRETURNdi64;
36867 }
36868 llvm_unreachable("not indirect thunk opcode");
36869}
36870
36871static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36872 Register Reg) {
36873 if (Subtarget.useRetpolineExternalThunk()) {
36874 // When using an external thunk for retpolines, we pick names that match the
36875 // names GCC happens to use as well. This helps simplify the implementation
36876 // of the thunks for kernels where they have no easy ability to create
36877 // aliases and are doing non-trivial configuration of the thunk's body. For
36878 // example, the Linux kernel will do boot-time hot patching of the thunk
36879 // bodies and cannot easily export aliases of these to loaded modules.
36880 //
36881 // Note that at any point in the future, we may need to change the semantics
36882 // of how we implement retpolines and at that time will likely change the
36883 // name of the called thunk. Essentially, there is no hard guarantee that
36884 // LLVM will generate calls to specific thunks, we merely make a best-effort
36885 // attempt to help out kernels and other systems where duplicating the
36886 // thunks is costly.
36887 switch (Reg.id()) {
36888 case X86::EAX:
36889 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36890 return "__x86_indirect_thunk_eax";
36891 case X86::ECX:
36892 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36893 return "__x86_indirect_thunk_ecx";
36894 case X86::EDX:
36895 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36896 return "__x86_indirect_thunk_edx";
36897 case X86::EDI:
36898 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36899 return "__x86_indirect_thunk_edi";
36900 case X86::R11:
36901 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36902 return "__x86_indirect_thunk_r11";
36903 }
36904 llvm_unreachable("unexpected reg for external indirect thunk");
36905 }
36906
36907 if (Subtarget.useRetpolineIndirectCalls() ||
36908 Subtarget.useRetpolineIndirectBranches()) {
36909 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36910 switch (Reg.id()) {
36911 case X86::EAX:
36912 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36913 return "__llvm_retpoline_eax";
36914 case X86::ECX:
36915 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36916 return "__llvm_retpoline_ecx";
36917 case X86::EDX:
36918 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36919 return "__llvm_retpoline_edx";
36920 case X86::EDI:
36921 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36922 return "__llvm_retpoline_edi";
36923 case X86::R11:
36924 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36925 return "__llvm_retpoline_r11";
36926 }
36927 llvm_unreachable("unexpected reg for retpoline");
36928 }
36929
36930 if (Subtarget.useLVIControlFlowIntegrity()) {
36931 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36932 return "__llvm_lvi_thunk_r11";
36933 }
36934 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
36935}
36936
36937MachineBasicBlock *
36938X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36939 MachineBasicBlock *BB) const {
36940 // Copy the virtual register into the R11 physical register and
36941 // call the retpoline thunk.
36942 const MIMetadata MIMD(MI);
36943 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36944 Register CalleeVReg = MI.getOperand(0).getReg();
36945 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36946
36947 // Find an available scratch register to hold the callee. On 64-bit, we can
36948 // just use R11, but we scan for uses anyway to ensure we don't generate
36949 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36950 // already a register use operand to the call to hold the callee. If none
36951 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36952 // register and ESI is the base pointer to realigned stack frames with VLAs.
36953 SmallVector<Register, 3> AvailableRegs;
36954 if (Subtarget.is64Bit())
36955 AvailableRegs.push_back(X86::R11);
36956 else
36957 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36958
36959 // Zero out any registers that are already used.
36960 for (const auto &MO : MI.operands()) {
36961 if (MO.isReg() && MO.isUse())
36962 llvm::replace(AvailableRegs, MO.getReg(), Register());
36963 }
36964
36965 // Choose the first remaining non-zero available register.
36966 Register AvailableReg;
36967 for (Register MaybeReg : AvailableRegs) {
36968 if (MaybeReg) {
36969 AvailableReg = MaybeReg;
36970 break;
36971 }
36972 }
36973 if (!AvailableReg)
36974 report_fatal_error("calling convention incompatible with retpoline, no "
36975 "available registers");
36976
36977 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36978
36979 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
36980 .addReg(CalleeVReg);
36981 MI.getOperand(0).ChangeToES(Symbol);
36982 MI.setDesc(TII->get(Opc));
36983 MachineInstrBuilder(*BB->getParent(), &MI)
36984 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36985 return BB;
36986}
36987
36988/// SetJmp implies future control flow change upon calling the corresponding
36989/// LongJmp.
36990/// Instead of using the 'return' instruction, the long jump fixes the stack and
36991/// performs an indirect branch. To do so it uses the registers that were stored
36992/// in the jump buffer (when calling SetJmp).
36993/// In case the shadow stack is enabled we need to fix it as well, because some
36994/// return addresses will be skipped.
36995/// The function will save the SSP for future fixing in the function
36996/// emitLongJmpShadowStackFix.
36997/// \sa emitLongJmpShadowStackFix
36998/// \param [in] MI The temporary Machine Instruction for the builtin.
36999/// \param [in] MBB The Machine Basic Block that will be modified.
37000void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
37001 MachineBasicBlock *MBB) const {
37002 const MIMetadata MIMD(MI);
37003 MachineFunction *MF = MBB->getParent();
37004 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37005 MachineRegisterInfo &MRI = MF->getRegInfo();
37006 MachineInstrBuilder MIB;
37007
37008 // Memory Reference.
37009 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37010
37011 // Initialize a register with zero.
37012 MVT PVT = getPointerTy(MF->getDataLayout());
37013 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37014 Register ZReg = MRI.createVirtualRegister(PtrRC);
37015 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
37016 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
37017 .addDef(ZReg)
37018 .addReg(ZReg, RegState::Undef)
37019 .addReg(ZReg, RegState::Undef);
37020
37021 // Read the current SSP Register value to the zeroed register.
37022 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37023 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37024 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37025
37026 // Write the SSP register value to offset 3 in input memory buffer.
37027 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37028 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
37029 const int64_t SSPOffset = 3 * PVT.getStoreSize();
37030 const unsigned MemOpndSlot = 1;
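// Buffer layout (pointer-sized slots): 0 = frame pointer, 1 = resume address,
// 2 = stack pointer, 3 = shadow stack pointer; see the matching reloads in
// emitEHSjLjLongJmp and emitLongJmpShadowStackFix.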
37031 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37032 if (i == X86::AddrDisp)
37033 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37034 else
37035 MIB.add(MI.getOperand(MemOpndSlot + i));
37036 }
37037 MIB.addReg(SSPCopyReg);
37038 MIB.setMemRefs(MMOs);
37039}
37040
37041MachineBasicBlock *
37042X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37043 MachineBasicBlock *MBB) const {
37044 const MIMetadata MIMD(MI);
37045 MachineFunction *MF = MBB->getParent();
37046 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37047 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37048 MachineRegisterInfo &MRI = MF->getRegInfo();
37049
37050 const BasicBlock *BB = MBB->getBasicBlock();
37051 MachineFunction::iterator I = ++MBB->getIterator();
37052
37053 // Memory Reference
37054 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37055
37056 unsigned MemOpndSlot = 0;
37057
37058 unsigned CurOp = 0;
37059
37060 Register DstReg = MI.getOperand(CurOp++).getReg();
37061 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37062 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
37063 (void)TRI;
37064 Register mainDstReg = MRI.createVirtualRegister(RC);
37065 Register restoreDstReg = MRI.createVirtualRegister(RC);
37066
37067 MemOpndSlot = CurOp;
37068
37069 MVT PVT = getPointerTy(MF->getDataLayout());
37070 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37071 "Invalid Pointer Size!");
37072
37073 // For v = setjmp(buf), we generate
37074 //
37075 // thisMBB:
37076 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37077 // SjLjSetup restoreMBB
37078 //
37079 // mainMBB:
37080 // v_main = 0
37081 //
37082 // sinkMBB:
37083 // v = phi(main, restore)
37084 //
37085 // restoreMBB:
37086 // if base pointer being used, load it from frame
37087 // v_restore = 1
37088
37089 MachineBasicBlock *thisMBB = MBB;
37090 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37091 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37092 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37093 MF->insert(I, mainMBB);
37094 MF->insert(I, sinkMBB);
37095 MF->push_back(restoreMBB);
37096 restoreMBB->setMachineBlockAddressTaken();
37097
37098 MachineInstrBuilder MIB;
37099
37100 // Transfer the remainder of BB and its successor edges to sinkMBB.
37101 sinkMBB->splice(sinkMBB->begin(), MBB,
37102 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37103 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37104
37105 // thisMBB:
37106 unsigned PtrStoreOpc = 0;
37107 Register LabelReg;
37108 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37109 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37110 !isPositionIndependent();
37111
37112 // Prepare IP either in reg or imm.
37113 if (!UseImmLabel) {
37114 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37115 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37116 LabelReg = MRI.createVirtualRegister(PtrRC);
37117 if (Subtarget.is64Bit()) {
37118 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
37119 .addReg(X86::RIP)
37120 .addImm(0)
37121 .addReg(0)
37122 .addMBB(restoreMBB)
37123 .addReg(0);
37124 } else {
37125 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37126 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
37127 .addReg(XII->getGlobalBaseReg(MF))
37128 .addImm(0)
37129 .addReg(0)
37130 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37131 .addReg(0);
37132 }
37133 } else
37134 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
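// Note: under the small, non-PIC code model the address of restoreMBB fits in
// a 32-bit immediate, so it can be stored directly without the LEA above.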
37135 // Store IP
37136 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
37137 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37138 if (i == X86::AddrDisp)
37139 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37140 else
37141 MIB.add(MI.getOperand(MemOpndSlot + i));
37142 }
37143 if (!UseImmLabel)
37144 MIB.addReg(LabelReg);
37145 else
37146 MIB.addMBB(restoreMBB);
37147 MIB.setMemRefs(MMOs);
37148
37149 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37150 emitSetJmpShadowStackFix(MI, thisMBB);
37151 }
37152
37153 // Setup
37154 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
37155 .addMBB(restoreMBB);
37156
37157 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37158 MIB.addRegMask(RegInfo->getNoPreservedMask());
37159 thisMBB->addSuccessor(mainMBB);
37160 thisMBB->addSuccessor(restoreMBB);
37161
37162 // mainMBB:
37163 // EAX = 0
37164 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
37165 mainMBB->addSuccessor(sinkMBB);
37166
37167 // sinkMBB:
37168 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
37169 .addReg(mainDstReg)
37170 .addMBB(mainMBB)
37171 .addReg(restoreDstReg)
37172 .addMBB(restoreMBB);
37173
37174 // restoreMBB:
37175 if (RegInfo->hasBasePointer(*MF)) {
37176 const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64();
37177 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37178 X86FI->setRestoreBasePointer(MF);
37179 Register FramePtr = RegInfo->getFrameRegister(*MF);
37180 Register BasePtr = RegInfo->getBaseRegister();
37181 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37182 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
37183 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37184 .setMIFlag(MachineInstr::FrameSetup);
37185 }
37186 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37187 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37188 restoreMBB->addSuccessor(sinkMBB);
37189
37190 MI.eraseFromParent();
37191 return sinkMBB;
37192}
37193
37194/// Fix the shadow stack using the previously saved SSP pointer.
37195/// \sa emitSetJmpShadowStackFix
37196/// \param [in] MI The temporary Machine Instruction for the builtin.
37197/// \param [in] MBB The Machine Basic Block that will be modified.
37198/// \return The sink MBB that will perform the future indirect branch.
37199MachineBasicBlock *
37200X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37201 MachineBasicBlock *MBB) const {
37202 const MIMetadata MIMD(MI);
37203 MachineFunction *MF = MBB->getParent();
37204 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37205 MachineRegisterInfo &MRI = MF->getRegInfo();
37206
37207 // Memory Reference
37208 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37209
37210 MVT PVT = getPointerTy(MF->getDataLayout());
37211 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37212
37213 // checkSspMBB:
37214 // xor vreg1, vreg1
37215 // rdssp vreg1
37216 // test vreg1, vreg1
37217 // je sinkMBB # Jump if Shadow Stack is not supported
37218 // fallMBB:
37219 // mov buf+24/12(%rip), vreg2
37220 // sub vreg1, vreg2
37221 // jbe sinkMBB # No need to fix the Shadow Stack
37222 // fixShadowMBB:
37223 // shr 3/2, vreg2
37224 // incssp vreg2 # fix the SSP according to the lower 8 bits
37225 // shr 8, vreg2
37226 // je sinkMBB
37227 // fixShadowLoopPrepareMBB:
37228 // shl vreg2
37229 // mov 128, vreg3
37230 // fixShadowLoopMBB:
37231 // incssp vreg3
37232 // dec vreg2
37233 // jne fixShadowLoopMBB # Iterate until you finish fixing
37234 // # the Shadow Stack
37235 // sinkMBB:
37236
37237 MachineFunction::iterator I = ++MBB->getIterator();
37238 const BasicBlock *BB = MBB->getBasicBlock();
37239
37240 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37241 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37242 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37243 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37244 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37245 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37246 MF->insert(I, checkSspMBB);
37247 MF->insert(I, fallMBB);
37248 MF->insert(I, fixShadowMBB);
37249 MF->insert(I, fixShadowLoopPrepareMBB);
37250 MF->insert(I, fixShadowLoopMBB);
37251 MF->insert(I, sinkMBB);
37252
37253 // Transfer the remainder of BB and its successor edges to sinkMBB.
37254 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37255 MBB->end());
37256 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37257
37258 MBB->addSuccessor(checkSspMBB);
37259
37260 // Initialize a register with zero.
37261 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37262 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
37263
37264 if (PVT == MVT::i64) {
37265 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37266 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37267 .addImm(0)
37268 .addReg(ZReg)
37269 .addImm(X86::sub_32bit);
37270 ZReg = TmpZReg;
37271 }
37272
37273 // Read the current SSP Register value to the zeroed register.
37274 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37275 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37276 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
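// If shadow stacks are not enabled, RDSSP is a no-op and leaves the
// destination untouched, so SSPCopyReg stays zero and the test below branches
// straight to the sink.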
37277
37278 // Check whether the result of the SSP register is zero and jump directly
37279 // to the sink.
37280 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37281 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
37282 .addReg(SSPCopyReg)
37283 .addReg(SSPCopyReg);
37284 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
37285 .addMBB(sinkMBB)
37286 .addImm(X86::COND_E);
37287 checkSspMBB->addSuccessor(sinkMBB);
37288 checkSspMBB->addSuccessor(fallMBB);
37289
37290 // Reload the previously saved SSP register value.
37291 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37292 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37293 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37294 MachineInstrBuilder MIB =
37295 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
37296 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37297 const MachineOperand &MO = MI.getOperand(i);
37298 if (i == X86::AddrDisp)
37299 MIB.addDisp(MO, SPPOffset);
37300 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37301 // preserve kill flags.
37302 MIB.addReg(MO.getReg());
37303 else
37304 MIB.add(MO);
37305 }
37306 MIB.setMemRefs(MMOs);
37307
37308 // Subtract the current SSP from the previous SSP.
37309 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37310 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37311 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
37312 .addReg(PrevSSPReg)
37313 .addReg(SSPCopyReg);
37314
37315 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37316 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
37317 .addMBB(sinkMBB)
37318 .addImm(X86::COND_BE);
37319 fallMBB->addSuccessor(sinkMBB);
37320 fallMBB->addSuccessor(fixShadowMBB);
37321
37322 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37323 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37324 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37325 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37326 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
37327 .addReg(SspSubReg)
37328 .addImm(Offset);
37329
37330 // Increase SSP when looking only on the lower 8 bits of the delta.
37331 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37332 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
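// INCSSP consumes only the low 8 bits of its operand, so this fixes at most
// 255 slots; anything larger is handled by the loop below, 128 slots per
// iteration.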
37333
37334 // Reset the lower 8 bits.
37335 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37336 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
37337 .addReg(SspFirstShrReg)
37338 .addImm(8);
37339
37340 // Jump if the result of the shift is zero.
37341 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
37342 .addMBB(sinkMBB)
37343 .addImm(X86::COND_E);
37344 fixShadowMBB->addSuccessor(sinkMBB);
37345 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37346
37347 // Do a single shift left.
37348 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
37349 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37350 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
37351 .addReg(SspSecondShrReg)
37352 .addImm(1);
37353
37354 // Save the value 128 to a register (will be used next with incssp).
37355 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37356 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37357 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
37358 .addImm(128);
37359 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37360
37361 // Since incssp only looks at the lower 8 bits, we might need to do several
37362 // iterations of incssp until we finish fixing the shadow stack.
37363 Register DecReg = MRI.createVirtualRegister(PtrRC);
37364 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37365 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
37366 .addReg(SspAfterShlReg)
37367 .addMBB(fixShadowLoopPrepareMBB)
37368 .addReg(DecReg)
37369 .addMBB(fixShadowLoopMBB);
37370
37371 // Every iteration we increase the SSP by 128.
37372 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
37373
37374 // Every iteration we decrement the counter by 1.
37375 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37376 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
37377
37378 // Jump if the counter is not zero yet.
37379 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
37380 .addMBB(fixShadowLoopMBB)
37381 .addImm(X86::COND_NE);
37382 fixShadowLoopMBB->addSuccessor(sinkMBB);
37383 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37384
37385 return sinkMBB;
37386}
37387
37388MachineBasicBlock *
37389X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37390 MachineBasicBlock *MBB) const {
37391 const MIMetadata MIMD(MI);
37392 MachineFunction *MF = MBB->getParent();
37393 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37394 MachineRegisterInfo &MRI = MF->getRegInfo();
37395
37396 // Memory Reference
37397 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37398
37399 MVT PVT = getPointerTy(MF->getDataLayout());
37400 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37401 "Invalid Pointer Size!");
37402
37403 const TargetRegisterClass *RC =
37404 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37405 Register Tmp = MRI.createVirtualRegister(RC);
37406 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37407 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37408 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37409 Register SP = RegInfo->getStackRegister();
37410
37411 MachineInstrBuilder MIB;
37412
37413 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37414 const int64_t SPOffset = 2 * PVT.getStoreSize();
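// The setjmp buffer written by emitEHSjLjSetJmp stores FP at offset 0, the
// restore label (IP) at offset 1*PVT and SP at offset 2*PVT; the offsets above
// index those slots for the reloads below.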
37415
37416 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37417 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37418
37419 MachineBasicBlock *thisMBB = MBB;
37420
37421 // When CET and the shadow stack are enabled, we need to fix the shadow stack.
37422 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37423 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37424 }
37425
37426 // Reload FP
37427 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
37428 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37429 const MachineOperand &MO = MI.getOperand(i);
37430 if (MO.isReg()) // Don't add the whole operand, we don't want to
37431 // preserve kill flags.
37432 MIB.addReg(MO.getReg());
37433 else
37434 MIB.add(MO);
37435 }
37436 MIB.setMemRefs(MMOs);
37438
37439 // Reload IP
37440 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
37441 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37442 const MachineOperand &MO = MI.getOperand(i);
37443 if (i == X86::AddrDisp)
37444 MIB.addDisp(MO, LabelOffset);
37445 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37446 // preserve kill flags.
37447 MIB.addReg(MO.getReg());
37448 else
37449 MIB.add(MO);
37450 }
37451 MIB.setMemRefs(MMOs);
37452
37453 // Reload SP
37454 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
37455 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37456 if (i == X86::AddrDisp)
37457 MIB.addDisp(MI.getOperand(i), SPOffset);
37458 else
37459 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37460 // the last instruction of the expansion.
37461 }
37462 MIB.setMemRefs(MMOs);
37464
37465 // Jump
37466 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
37467
37468 MI.eraseFromParent();
37469 return thisMBB;
37470}
37471
37472void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37473 MachineBasicBlock *MBB,
37474 MachineBasicBlock *DispatchBB,
37475 int FI) const {
37476 const MIMetadata MIMD(MI);
37477 MachineFunction *MF = MBB->getParent();
37478 MachineRegisterInfo *MRI = &MF->getRegInfo();
37479 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37480
37481 MVT PVT = getPointerTy(MF->getDataLayout());
37482 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37483
37484 unsigned Op = 0;
37485 Register VR;
37486
37487 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37488 !isPositionIndependent();
37489
37490 if (UseImmLabel) {
37491 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37492 } else {
37493 const TargetRegisterClass *TRC =
37494 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37495 VR = MRI->createVirtualRegister(TRC);
37496 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37497
37498 if (Subtarget.is64Bit())
37499 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
37500 .addReg(X86::RIP)
37501 .addImm(1)
37502 .addReg(0)
37503 .addMBB(DispatchBB)
37504 .addReg(0);
37505 else
37506 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
37507 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37508 .addImm(1)
37509 .addReg(0)
37510 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37511 .addReg(0);
37512 }
37513
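// Store the dispatch block's address into the function context at offset 56
// (64-bit) or 36 (32-bit) from FI - the jump-buffer slot from which the SjLj
// unwinder later resumes execution.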
37514 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
37515 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37516 if (UseImmLabel)
37517 MIB.addMBB(DispatchBB);
37518 else
37519 MIB.addReg(VR);
37520}
37521
37522MachineBasicBlock *
37523X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37524 MachineBasicBlock *BB) const {
37525 const MIMetadata MIMD(MI);
37526 MachineFunction *MF = BB->getParent();
37527 MachineRegisterInfo *MRI = &MF->getRegInfo();
37528 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37529 int FI = MF->getFrameInfo().getFunctionContextIndex();
37530
37531 // Get a mapping of the call site numbers to all of the landing pads they're
37532 // associated with.
37533 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37534 unsigned MaxCSNum = 0;
37535 for (auto &MBB : *MF) {
37536 if (!MBB.isEHPad())
37537 continue;
37538
37539 MCSymbol *Sym = nullptr;
37540 for (const auto &MI : MBB) {
37541 if (MI.isDebugInstr())
37542 continue;
37543
37544 assert(MI.isEHLabel() && "expected EH_LABEL");
37545 Sym = MI.getOperand(0).getMCSymbol();
37546 break;
37547 }
37548
37549 if (!MF->hasCallSiteLandingPad(Sym))
37550 continue;
37551
37552 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37553 CallSiteNumToLPad[CSI].push_back(&MBB);
37554 MaxCSNum = std::max(MaxCSNum, CSI);
37555 }
37556 }
37557
37558 // Get an ordered list of the machine basic blocks for the jump table.
37559 std::vector<MachineBasicBlock *> LPadList;
37560 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37561 LPadList.reserve(CallSiteNumToLPad.size());
37562
37563 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37564 for (auto &LP : CallSiteNumToLPad[CSI]) {
37565 LPadList.push_back(LP);
37566 InvokeBBs.insert_range(LP->predecessors());
37567 }
37568 }
37569
37570 assert(!LPadList.empty() &&
37571 "No landing pad destinations for the dispatch jump table!");
37572
37573 // Create the MBBs for the dispatch code.
37574
37575 // Shove the dispatch's address into the return slot in the function context.
37576 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37577 DispatchBB->setIsEHPad(true);
37578
37579 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37580 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
37581 DispatchBB->addSuccessor(TrapBB);
37582
37583 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37584 DispatchBB->addSuccessor(DispContBB);
37585
37586 // Insert MBBs.
37587 MF->push_back(DispatchBB);
37588 MF->push_back(DispContBB);
37589 MF->push_back(TrapBB);
37590
37591 // Insert code into the entry block that creates and registers the function
37592 // context.
37593 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37594
37595 // Create the jump table and associated information
37596 unsigned JTE = getJumpTableEncoding();
37597 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37598 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37599
37600 const X86RegisterInfo &RI = TII->getRegisterInfo();
37601 // Add a register mask with no preserved registers. This results in all
37602 // registers being marked as clobbered.
37603 if (RI.hasBasePointer(*MF)) {
37604 const bool FPIs64Bit = Subtarget.isTarget64BitLP64();
37605 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37606 MFI->setRestoreBasePointer(MF);
37607
37608 Register FP = RI.getFrameRegister(*MF);
37609 Register BP = RI.getBaseRegister();
37610 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37611 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
37612 MFI->getRestoreBasePointerOffset())
37613 .addRegMask(RI.getNoPreservedMask());
37614 } else {
37615 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
37616 .addRegMask(RI.getNoPreservedMask());
37617 }
37618
37619 // IReg is used as an index in a memory operand and therefore can't be SP
37620 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37621 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
37622 Subtarget.is64Bit() ? 8 : 4);
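// The call-site index selected by the SjLj personality is stored in the
// function context; load it here and bounds-check it below against the number
// of landing pads, trapping on out-of-range values.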
37623 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
37624 .addReg(IReg)
37625 .addImm(LPadList.size());
37626 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
37627 .addMBB(TrapBB)
37628 .addImm(X86::COND_AE);
37629
37630 if (Subtarget.is64Bit()) {
37631 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37632 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37633
37634 // leaq .LJTI0_0(%rip), BReg
37635 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
37636 .addReg(X86::RIP)
37637 .addImm(1)
37638 .addReg(0)
37639 .addJumpTableIndex(MJTI)
37640 .addReg(0);
37641 // movzx IReg64, IReg
37642 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37643 .addImm(0)
37644 .addReg(IReg)
37645 .addImm(X86::sub_32bit);
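// SUBREG_TO_REG zero-extends the 32-bit index to 64 bits for free: a 32-bit
// def already clears the upper half of the 64-bit register.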
37646
37647 switch (JTE) {
37648 case MachineJumpTableInfo::EK_BlockAddress:
37649 // jmpq *(BReg,IReg64,8)
37650 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
37651 .addReg(BReg)
37652 .addImm(8)
37653 .addReg(IReg64)
37654 .addImm(0)
37655 .addReg(0);
37656 break;
37657 case MachineJumpTableInfo::EK_LabelDifference32: {
37658 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37659 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37660 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37661
37662 // movl (BReg,IReg64,4), OReg
37663 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
37664 .addReg(BReg)
37665 .addImm(4)
37666 .addReg(IReg64)
37667 .addImm(0)
37668 .addReg(0);
37669 // movsx OReg64, OReg
37670 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
37671 .addReg(OReg);
37672 // addq BReg, OReg64, TReg
37673 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
37674 .addReg(OReg64)
37675 .addReg(BReg);
37676 // jmpq *TReg
37677 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
37678 break;
37679 }
37680 default:
37681 llvm_unreachable("Unexpected jump table encoding");
37682 }
37683 } else {
37684 // jmpl *.LJTI0_0(,IReg,4)
37685 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
37686 .addReg(0)
37687 .addImm(4)
37688 .addReg(IReg)
37689 .addJumpTableIndex(MJTI)
37690 .addReg(0);
37691 }
37692
37693 // Add the jump table entries as successors to the MBB.
37694 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37695 for (auto &LP : LPadList)
37696 if (SeenMBBs.insert(LP).second)
37697 DispContBB->addSuccessor(LP);
37698
37699 // N.B. the order the invoke BBs are processed in doesn't matter here.
37700 SmallVector<MachineBasicBlock *, 64> MBBLPads;
37701 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37702 for (MachineBasicBlock *MBB : InvokeBBs) {
37703 // Remove the landing pad successor from the invoke block and replace it
37704 // with the new dispatch block.
37705 // Keep a copy of Successors since it's modified inside the loop.
37706 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37707 MBB->succ_rend());
37708 // FIXME: Avoid quadratic complexity.
37709 for (auto *MBBS : Successors) {
37710 if (MBBS->isEHPad()) {
37711 MBB->removeSuccessor(MBBS);
37712 MBBLPads.push_back(MBBS);
37713 }
37714 }
37715
37716 MBB->addSuccessor(DispatchBB);
37717
37718 // Find the invoke call and mark all of the callee-saved registers as
37719 // 'implicit defined' so that they're spilled. This prevents code from
37720 // moving instructions to before the EH block, where they will never be
37721 // executed.
37722 for (auto &II : reverse(*MBB)) {
37723 if (!II.isCall())
37724 continue;
37725
37726 DenseSet<Register> DefRegs;
37727 for (auto &MOp : II.operands())
37728 if (MOp.isReg())
37729 DefRegs.insert(MOp.getReg());
37730
37731 MachineInstrBuilder MIB(*MF, &II);
37732 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37733 Register Reg = SavedRegs[RegIdx];
37734 if (!DefRegs.contains(Reg))
37735 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37736 }
37737
37738 break;
37739 }
37740 }
37741
37742 // Mark all former landing pads as non-landing pads. The dispatch is the only
37743 // landing pad now.
37744 for (auto &LP : MBBLPads)
37745 LP->setIsEHPad(false);
37746
37747 // The instruction is gone now.
37748 MI.eraseFromParent();
37749 return BB;
37750}
37751
37752MachineBasicBlock *
37753X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
37754 MachineBasicBlock *BB) const {
37755 // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing
37756 // calls may require proper stack alignment.
37757 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37758 const MIMetadata MIMD(MI);
37759 MachineFunction &MF = *BB->getParent();
37760
37761 // Emit CALLSEQ_START right before the instruction.
37762 MF.getFrameInfo().setAdjustsStack(true);
37763 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37764 MachineInstrBuilder CallseqStart =
37765 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37766 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37767
37768 // Emit CALLSEQ_END right after the instruction.
37769 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37770 MachineInstrBuilder CallseqEnd =
37771 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
37772 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37773
37774 return BB;
37775}
37776
37777MachineBasicBlock *
37778X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37779 MachineBasicBlock *BB) const {
37780 MachineFunction *MF = BB->getParent();
37781 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37782 const MIMetadata MIMD(MI);
37783
37784 auto TMMImmToTMMReg = [](unsigned Imm) {
37785 assert (Imm < 8 && "Illegal tmm index");
37786 return X86::TMM0 + Imm;
37787 };
37788 auto TMMImmToTMMPair = [](unsigned Imm) {
37789 assert(Imm < 8 && "Illegal tmm pair index.");
37790 return X86::TMM0_TMM1 + Imm / 2;
37791 };
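// The AMX pseudos handled below carry their tile operands as immediates; the
// helpers above map those immediates to physical TMM registers (or TMM
// register pairs for the pair-producing variants).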
37792 switch (MI.getOpcode()) {
37793 default:
37794 llvm_unreachable("Unexpected instr type to insert");
37795 case X86::INDIRECT_THUNK_CALL32:
37796 case X86::INDIRECT_THUNK_CALL64:
37797 case X86::INDIRECT_THUNK_TCRETURN32:
37798 case X86::INDIRECT_THUNK_TCRETURN64:
37799 return EmitLoweredIndirectThunk(MI, BB);
37800 case X86::CATCHRET:
37801 return EmitLoweredCatchRet(MI, BB);
37802 case X86::SEG_ALLOCA_32:
37803 case X86::SEG_ALLOCA_64:
37804 return EmitLoweredSegAlloca(MI, BB);
37805 case X86::PROBED_ALLOCA_32:
37806 case X86::PROBED_ALLOCA_64:
37807 return EmitLoweredProbedAlloca(MI, BB);
37808 case X86::TLSCall_32:
37809 case X86::TLSCall_64:
37810 return EmitLoweredTLSCall(MI, BB);
37811 case X86::CMOV_FR16:
37812 case X86::CMOV_FR16X:
37813 case X86::CMOV_FR32:
37814 case X86::CMOV_FR32X:
37815 case X86::CMOV_FR64:
37816 case X86::CMOV_FR64X:
37817 case X86::CMOV_GR8:
37818 case X86::CMOV_GR16:
37819 case X86::CMOV_GR32:
37820 case X86::CMOV_RFP32:
37821 case X86::CMOV_RFP64:
37822 case X86::CMOV_RFP80:
37823 case X86::CMOV_VR64:
37824 case X86::CMOV_VR128:
37825 case X86::CMOV_VR128X:
37826 case X86::CMOV_VR256:
37827 case X86::CMOV_VR256X:
37828 case X86::CMOV_VR512:
37829 case X86::CMOV_VK1:
37830 case X86::CMOV_VK2:
37831 case X86::CMOV_VK4:
37832 case X86::CMOV_VK8:
37833 case X86::CMOV_VK16:
37834 case X86::CMOV_VK32:
37835 case X86::CMOV_VK64:
37836 return EmitLoweredSelect(MI, BB);
37837
37838 case X86::FP80_ADDr:
37839 case X86::FP80_ADDm32: {
37840 // Change the floating point control register to use double extended
37841 // precision when performing the addition.
37842 int OrigCWFrameIdx =
37843 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37844 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37845 OrigCWFrameIdx);
37846
37847 // Load the old value of the control word...
37848 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37849 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37850 OrigCWFrameIdx);
37851
37852 // OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended
37853 // precision.
37854 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37855 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37856 .addReg(OldCW, RegState::Kill)
37857 .addImm(0x300);
37858
37859 // Extract to 16 bits.
37860 Register NewCW16 =
37861 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37862 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37863 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37864
37865 // Prepare memory for FLDCW.
37866 int NewCWFrameIdx =
37867 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37868 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37869 NewCWFrameIdx)
37870 .addReg(NewCW16, RegState::Kill);
37871
37872 // Reload the modified control word now...
37873 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37874 NewCWFrameIdx);
37875
37876 // Do the addition.
37877 if (MI.getOpcode() == X86::FP80_ADDr) {
37878 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
37879 .add(MI.getOperand(0))
37880 .add(MI.getOperand(1))
37881 .add(MI.getOperand(2));
37882 } else {
37883 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
37884 .add(MI.getOperand(0))
37885 .add(MI.getOperand(1))
37886 .add(MI.getOperand(2))
37887 .add(MI.getOperand(3))
37888 .add(MI.getOperand(4))
37889 .add(MI.getOperand(5))
37890 .add(MI.getOperand(6));
37891 }
37892
37893 // Reload the original control word now.
37894 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37895 OrigCWFrameIdx);
37896
37897 MI.eraseFromParent(); // The pseudo instruction is gone now.
37898 return BB;
37899 }
37900
37901 case X86::FP32_TO_INT16_IN_MEM:
37902 case X86::FP32_TO_INT32_IN_MEM:
37903 case X86::FP32_TO_INT64_IN_MEM:
37904 case X86::FP64_TO_INT16_IN_MEM:
37905 case X86::FP64_TO_INT32_IN_MEM:
37906 case X86::FP64_TO_INT64_IN_MEM:
37907 case X86::FP80_TO_INT16_IN_MEM:
37908 case X86::FP80_TO_INT32_IN_MEM:
37909 case X86::FP80_TO_INT64_IN_MEM: {
37910 // Change the floating point control register to use "round towards zero"
37911 // mode when truncating to an integer value.
37912 int OrigCWFrameIdx =
37913 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37914 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37915 OrigCWFrameIdx);
37916
37917 // Load the old value of the control word...
37918 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37919 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37920 OrigCWFrameIdx);
37921
37922 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
37923 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37924 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37925 .addReg(OldCW, RegState::Kill).addImm(0xC00);
37926
37927 // Extract to 16 bits.
37928 Register NewCW16 =
37929 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37930 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37931 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37932
37933 // Prepare memory for FLDCW.
37934 int NewCWFrameIdx =
37935 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37936 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37937 NewCWFrameIdx)
37938 .addReg(NewCW16, RegState::Kill);
37939
37940 // Reload the modified control word now...
37941 addFrameReference(BuildMI(*BB, MI, MIMD,
37942 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37943
37944 // Get the X86 opcode to use.
37945 unsigned Opc;
37946 switch (MI.getOpcode()) {
37947 // clang-format off
37948 default: llvm_unreachable("illegal opcode!");
37949 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37950 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37951 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37952 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37953 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37954 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37955 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37956 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37957 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37958 // clang-format on
37959 }
37960
37961 X86AddressMode AM = getAddressFromInstr(&MI, 0);
37962 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
37963 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37964
37965 // Reload the original control word now.
37966 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37967 OrigCWFrameIdx);
37968
37969 MI.eraseFromParent(); // The pseudo instruction is gone now.
37970 return BB;
37971 }
37972
37973 // xbegin
37974 case X86::XBEGIN:
37975 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37976
37977 case X86::VAARG_64:
37978 case X86::VAARG_X32:
37979 return EmitVAARGWithCustomInserter(MI, BB);
37980
37981 case X86::EH_SjLj_SetJmp32:
37982 case X86::EH_SjLj_SetJmp64:
37983 return emitEHSjLjSetJmp(MI, BB);
37984
37985 case X86::EH_SjLj_LongJmp32:
37986 case X86::EH_SjLj_LongJmp64:
37987 return emitEHSjLjLongJmp(MI, BB);
37988
37989 case X86::Int_eh_sjlj_setup_dispatch:
37990 return EmitSjLjDispatchBlock(MI, BB);
37991
37992 case TargetOpcode::STATEPOINT:
37993 // As an implementation detail, STATEPOINT shares the STACKMAP format at
37994 // this point in the process. We diverge later.
37995 return emitPatchPoint(MI, BB);
37996
37997 case TargetOpcode::STACKMAP:
37998 case TargetOpcode::PATCHPOINT:
37999 return emitPatchPoint(MI, BB);
38000
38001 case TargetOpcode::PATCHABLE_EVENT_CALL:
38002 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
38003 return emitPatchableEventCall(MI, BB);
38004
38005 case X86::LCMPXCHG8B: {
38006 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38007 // In addition to the 4 E[ABCD] registers implied by the encoding, CMPXCHG8B
38008 // requires a memory operand. If the current architecture is i686 and the
38009 // current function needs a base pointer
38010 // - which is ESI for i686 - the register allocator would not be able to
38011 // allocate registers for an address of the form X(%reg, %reg, Y)
38012 // - there would never be enough unreserved registers during regalloc
38013 // (without the need for a base ptr the only option would be X(%edi, %esi, Y)).
38014 // We give the register allocator a hand by precomputing the address in
38015 // a new vreg using LEA.
38016
38017 // If it is not i686 or there is no base pointer - nothing to do here.
38018 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
38019 return BB;
38020
38021 // Even though this code does not necessarily need the base pointer to
38022 // be ESI, we check for that. The reason: if this assert fails, some
38023 // changes have happened in the compiler's base pointer handling, which most
38024 // probably have to be addressed somehow here.
38025 assert(TRI->getBaseRegister() == X86::ESI &&
38026 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
38027 "base pointer in mind");
38028
38029 MachineRegisterInfo &MRI = MF->getRegInfo();
38030 MVT SPTy = getPointerTy(MF->getDataLayout());
38031 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38032 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38033
38035 // Regalloc does not need any help when the memory operand of CMPXCHG8B
38036 // does not use index register.
38037 if (AM.IndexReg == X86::NoRegister)
38038 return BB;
38039
38040 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38041 // four operand definitions that are E[ABCD] registers. We skip them and
38042 // then insert the LEA.
38043 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38044 while (RMBBI != BB->rend() &&
38045 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
38046 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
38047 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
38048 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
38049 ++RMBBI;
38050 }
38051 MachineBasicBlock::iterator MBBI(RMBBI);
38052 addFullAddress(
38053 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
38054
38055 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38056
38057 return BB;
38058 }
38059 case X86::LCMPXCHG16B_NO_RBX: {
38060 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38061 Register BasePtr = TRI->getBaseRegister();
38062 if (TRI->hasBasePointer(*MF) &&
38063 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38064 if (!BB->isLiveIn(BasePtr))
38065 BB->addLiveIn(BasePtr);
38066 // Save RBX into a virtual register.
38067 Register SaveRBX =
38068 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38069 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38070 .addReg(X86::RBX);
38071 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38072 MachineInstrBuilder MIB =
38073 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38074 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38075 MIB.add(MI.getOperand(Idx));
38076 MIB.add(MI.getOperand(X86::AddrNumOperands));
38077 MIB.addReg(SaveRBX);
38078 } else {
38079 // Simple case, just copy the virtual register to RBX.
38080 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
38081 .add(MI.getOperand(X86::AddrNumOperands));
38082 MachineInstrBuilder MIB =
38083 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
38084 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38085 MIB.add(MI.getOperand(Idx));
38086 }
38087 MI.eraseFromParent();
38088 return BB;
38089 }
38090 case X86::MWAITX: {
38091 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38092 Register BasePtr = TRI->getBaseRegister();
38093 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38094 // If there is no need to save the base pointer, we generate MWAITXrrr,
38095 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
38096 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38097 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38098 .addReg(MI.getOperand(0).getReg());
38099 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38100 .addReg(MI.getOperand(1).getReg());
38101 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
38102 .addReg(MI.getOperand(2).getReg());
38103 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
38104 MI.eraseFromParent();
38105 } else {
38106 if (!BB->isLiveIn(BasePtr)) {
38107 BB->addLiveIn(BasePtr);
38108 }
38109 // Parameters can be copied into ECX and EAX but not EBX yet.
38110 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38111 .addReg(MI.getOperand(0).getReg());
38112 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38113 .addReg(MI.getOperand(1).getReg());
38114 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38115 // Save RBX into a virtual register.
38116 Register SaveRBX =
38117 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38118 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38119 .addReg(X86::RBX);
38120 // Generate mwaitx pseudo.
38121 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38122 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
38123 .addDef(Dst) // Destination tied in with SaveRBX.
38124 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38125 .addUse(SaveRBX); // Save of base pointer.
38126 MI.eraseFromParent();
38127 }
38128 return BB;
38129 }
38130 case TargetOpcode::PREALLOCATED_SETUP: {
38131 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38132 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38133 MFI->setHasPreallocatedCall(true);
38134 int64_t PreallocatedId = MI.getOperand(0).getImm();
38135 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38136 assert(StackAdjustment != 0 && "0 stack adjustment");
38137 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38138 << StackAdjustment << "\n");
38139 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
38140 .addReg(X86::ESP)
38141 .addImm(StackAdjustment);
38142 MI.eraseFromParent();
38143 return BB;
38144 }
38145 case TargetOpcode::PREALLOCATED_ARG: {
38146 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38147 int64_t PreallocatedId = MI.getOperand(1).getImm();
38148 int64_t ArgIdx = MI.getOperand(2).getImm();
38149 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38150 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38151 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38152 << ", arg offset " << ArgOffset << "\n");
38153 // stack pointer + offset
38154 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
38155 MI.getOperand(0).getReg()),
38156 X86::ESP, false, ArgOffset);
38157 MI.eraseFromParent();
38158 return BB;
38159 }
38160 case X86::PTDPBSSD:
38161 case X86::PTDPBSUD:
38162 case X86::PTDPBUSD:
38163 case X86::PTDPBUUD:
38164 case X86::PTDPBF16PS:
38165 case X86::PTDPFP16PS:
38166 case X86::PTCMMIMFP16PS:
38167 case X86::PTCMMRLFP16PS:
38168 case X86::PTDPBF8PS:
38169 case X86::PTDPBHF8PS:
38170 case X86::PTDPHBF8PS:
38171 case X86::PTDPHF8PS:
38172 case X86::PTTDPBF16PS:
38173 case X86::PTTDPFP16PS:
38174 case X86::PTTCMMIMFP16PS:
38175 case X86::PTTCMMRLFP16PS:
38176 case X86::PTCONJTCMMIMFP16PS:
38177 case X86::PTMMULTF32PS:
38178 case X86::PTTMMULTF32PS: {
38179 unsigned Opc;
38180 switch (MI.getOpcode()) {
38181 default: llvm_unreachable("illegal opcode!");
38182 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38183 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38184 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38185 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38186 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38187 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38188 case X86::PTCMMIMFP16PS:
38189 Opc = X86::TCMMIMFP16PS;
38190 break;
38191 case X86::PTCMMRLFP16PS:
38192 Opc = X86::TCMMRLFP16PS;
38193 break;
38194 case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
38195 case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
38196 case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
38197 case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
38198 case X86::PTTDPBF16PS:
38199 Opc = X86::TTDPBF16PS;
38200 break;
38201 case X86::PTTDPFP16PS:
38202 Opc = X86::TTDPFP16PS;
38203 break;
38204 case X86::PTTCMMIMFP16PS:
38205 Opc = X86::TTCMMIMFP16PS;
38206 break;
38207 case X86::PTTCMMRLFP16PS:
38208 Opc = X86::TTCMMRLFP16PS;
38209 break;
38210 case X86::PTCONJTCMMIMFP16PS:
38211 Opc = X86::TCONJTCMMIMFP16PS;
38212 break;
38213 case X86::PTMMULTF32PS:
38214 Opc = X86::TMMULTF32PS;
38215 break;
38216 case X86::PTTMMULTF32PS:
38217 Opc = X86::TTMMULTF32PS;
38218 break;
38219 }
38220
38221 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38222 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38223 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38224 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38225 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38226
38227 MI.eraseFromParent(); // The pseudo is gone now.
38228 return BB;
38229 }
38230 case X86::PTILEZERO: {
38231 unsigned Imm = MI.getOperand(0).getImm();
38232 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38233 MI.eraseFromParent(); // The pseudo is gone now.
38234 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38235 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
38236 return BB;
38237 }
38238 case X86::PTILEZEROV: {
38239 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38240 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
38241 return BB;
38242 }
38243 case X86::PTILELOADDRS:
38244 case X86::PTILELOADDRST1:
38245 case X86::PTILELOADD:
38246 case X86::PTILELOADDT1:
38247 case X86::PTILESTORED: {
38248 unsigned Opc;
38249 switch (MI.getOpcode()) {
38250 default: llvm_unreachable("illegal opcode!");
38251#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38252 case X86::PTILELOADD:
38253 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
38254 break;
38255 case X86::PTILELOADDT1:
38256 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
38257 break;
38258 case X86::PTILESTORED:
38259 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
38260 break;
38261 case X86::PTILELOADDRS:
38262 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS);
38263 break;
38264 case X86::PTILELOADDRST1:
38265 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1);
38266 break;
38267 }
38268#undef GET_EGPR_IF_ENABLED
38269
38270 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38271 unsigned CurOp = 0;
38272 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
38273 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38274 RegState::Define);
38275
38276 MIB.add(MI.getOperand(CurOp++)); // base
38277 MIB.add(MI.getOperand(CurOp++)); // scale
38278 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38279 MIB.add(MI.getOperand(CurOp++)); // displacement
38280 MIB.add(MI.getOperand(CurOp++)); // segment
38281
38282 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
38283 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38284 RegState::Undef);
38285
38286 MI.eraseFromParent(); // The pseudo is gone now.
38287 return BB;
38288 }
38289 case X86::PT2RPNTLVWZ0:
38290 case X86::PT2RPNTLVWZ0T1:
38291 case X86::PT2RPNTLVWZ1:
38292 case X86::PT2RPNTLVWZ1T1:
38293 case X86::PT2RPNTLVWZ0RS:
38294 case X86::PT2RPNTLVWZ0RST1:
38295 case X86::PT2RPNTLVWZ1RS:
38296 case X86::PT2RPNTLVWZ1RST1: {
38297 const DebugLoc &DL = MI.getDebugLoc();
38298 unsigned Opc;
38299#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38300 switch (MI.getOpcode()) {
38301 default:
38302 llvm_unreachable("Unexpected instruction!");
38303 case X86::PT2RPNTLVWZ0:
38304 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
38305 break;
38306 case X86::PT2RPNTLVWZ0T1:
38307 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
38308 break;
38309 case X86::PT2RPNTLVWZ1:
38310 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
38311 break;
38312 case X86::PT2RPNTLVWZ1T1:
38313 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
38314 break;
38315 case X86::PT2RPNTLVWZ0RS:
38316 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
38317 break;
38318 case X86::PT2RPNTLVWZ0RST1:
38319 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
38320 break;
38321 case X86::PT2RPNTLVWZ1RS:
38322 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
38323 break;
38324 case X86::PT2RPNTLVWZ1RST1:
38325 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
38326 break;
38327 }
38328#undef GET_EGPR_IF_ENABLED
38329 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38330 MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
38331
38332 MIB.add(MI.getOperand(1)); // base
38333 MIB.add(MI.getOperand(2)); // scale
38334 MIB.add(MI.getOperand(3)); // index
38335 MIB.add(MI.getOperand(4)); // displacement
38336 MIB.add(MI.getOperand(5)); // segment
38337 MI.eraseFromParent(); // The pseudo is gone now.
38338 return BB;
38339 }
38340 case X86::PTTRANSPOSED:
38341 case X86::PTCONJTFP16: {
38342 const DebugLoc &DL = MI.getDebugLoc();
38343 unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
38344 : X86::TCONJTFP16;
38345
38346 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38347 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38348 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38349
38350 MI.eraseFromParent(); // The pseudo is gone now.
38351 return BB;
38352 }
38353 case X86::PTCVTROWPS2BF16Hrri:
38354 case X86::PTCVTROWPS2BF16Lrri:
38355 case X86::PTCVTROWPS2PHHrri:
38356 case X86::PTCVTROWPS2PHLrri:
38357 case X86::PTCVTROWD2PSrri:
38358 case X86::PTILEMOVROWrri: {
38359 const DebugLoc &DL = MI.getDebugLoc();
38360 unsigned Opc;
38361 switch (MI.getOpcode()) {
38362 default:
38363 llvm_unreachable("Unexpected instruction!");
38364 case X86::PTCVTROWD2PSrri:
38365 Opc = X86::TCVTROWD2PSrri;
38366 break;
38367 case X86::PTCVTROWPS2BF16Hrri:
38368 Opc = X86::TCVTROWPS2BF16Hrri;
38369 break;
38370 case X86::PTCVTROWPS2PHHrri:
38371 Opc = X86::TCVTROWPS2PHHrri;
38372 break;
38373 case X86::PTCVTROWPS2BF16Lrri:
38374 Opc = X86::TCVTROWPS2BF16Lrri;
38375 break;
38376 case X86::PTCVTROWPS2PHLrri:
38377 Opc = X86::TCVTROWPS2PHLrri;
38378 break;
38379 case X86::PTILEMOVROWrri:
38380 Opc = X86::TILEMOVROWrri;
38381 break;
38382 }
38383 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38384 MIB.add(MI.getOperand(0));
38385 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38386 MIB.addImm(MI.getOperand(2).getImm());
38387
38388 MI.eraseFromParent(); // The pseudo is gone now.
38389 return BB;
38390 }
38391 case X86::PTCVTROWPS2BF16Hrre:
38392 case X86::PTCVTROWPS2BF16Lrre:
38393 case X86::PTCVTROWPS2PHHrre:
38394 case X86::PTCVTROWPS2PHLrre:
38395 case X86::PTCVTROWD2PSrre:
38396 case X86::PTILEMOVROWrre: {
38397 const DebugLoc &DL = MI.getDebugLoc();
38398 unsigned Opc;
38399 switch (MI.getOpcode()) {
38400 default:
38401 llvm_unreachable("Unexpected instruction!");
38402 case X86::PTCVTROWD2PSrre:
38403 Opc = X86::TCVTROWD2PSrre;
38404 break;
38405 case X86::PTCVTROWPS2BF16Hrre:
38406 Opc = X86::TCVTROWPS2BF16Hrre;
38407 break;
38408 case X86::PTCVTROWPS2BF16Lrre:
38409 Opc = X86::TCVTROWPS2BF16Lrre;
38410 break;
38411 case X86::PTCVTROWPS2PHHrre:
38412 Opc = X86::TCVTROWPS2PHHrre;
38413 break;
38414 case X86::PTCVTROWPS2PHLrre:
38415 Opc = X86::TCVTROWPS2PHLrre;
38416 break;
38417 case X86::PTILEMOVROWrre:
38418 Opc = X86::TILEMOVROWrre;
38419 break;
38420 }
38421 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38422 MIB.add(MI.getOperand(0));
38423 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38424 MIB.add(MI.getOperand(2));
38425
38426 MI.eraseFromParent(); // The pseudo is gone now.
38427 return BB;
38428 }
38429 }
38430}
38431
38432//===----------------------------------------------------------------------===//
38433// X86 Optimization Hooks
38434//===----------------------------------------------------------------------===//
38435
38436bool
38437X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38438 const APInt &DemandedBits,
38439 const APInt &DemandedElts,
38440 TargetLoweringOpt &TLO) const {
38441 EVT VT = Op.getValueType();
38442 unsigned Opcode = Op.getOpcode();
38443 unsigned EltSize = VT.getScalarSizeInBits();
38444
38445 if (VT.isVector()) {
38446 // If the constant is only all signbits in the active bits, then we should
38447 // extend it to the entire constant to allow it to act as a boolean constant
38448 // vector.
38449 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38450 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38451 return false;
38452 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38453 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38454 continue;
38455 const APInt &Val = V.getConstantOperandAPInt(i);
38456 if (Val.getBitWidth() > Val.getNumSignBits() &&
38457 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38458 return true;
38459 }
38460 return false;
38461 };
38462 // For vectors - if we have a constant, then try to sign extend.
38463 // TODO: Handle AND cases.
38464 unsigned ActiveBits = DemandedBits.getActiveBits();
38465 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38466 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
38467 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38468 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38469 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38470 VT.getVectorNumElements());
38471 SDValue NewC =
38472 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38473 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38474 SDValue NewOp =
38475 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38476 return TLO.CombineTo(Op, NewOp);
38477 }
38478 return false;
38479 }
38480
38481 // Only optimize Ands to prevent shrinking a constant that could be
38482 // matched by movzx.
38483 if (Opcode != ISD::AND)
38484 return false;
38485
38486 // Make sure the RHS really is a constant.
38487 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38488 if (!C)
38489 return false;
38490
38491 const APInt &Mask = C->getAPIntValue();
38492
38493 // Clear all non-demanded bits initially.
38494 APInt ShrunkMask = Mask & DemandedBits;
38495
38496 // Find the width of the shrunk mask.
38497 unsigned Width = ShrunkMask.getActiveBits();
38498
38499 // If the mask is all 0s there's nothing to do here.
38500 if (Width == 0)
38501 return false;
38502
38503 // Find the next power of 2 width, rounding up to a byte.
38504 Width = llvm::bit_ceil(std::max(Width, 8U));
38505 // Truncate the width to size to handle illegal types.
38506 Width = std::min(Width, EltSize);
38507
38508 // Calculate a possible zero extend mask for this constant.
38509 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
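// For example, with Mask == 0xff00 and DemandedBits == 0xff00, Width becomes
// 16 and ZeroExtendMask becomes 0xffff, so the AND constant can be widened to
// 0xffff and matched as a movzwl.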
38510
38511 // If we aren't changing the mask, just return true to keep it and prevent
38512 // the caller from optimizing.
38513 if (ZeroExtendMask == Mask)
38514 return true;
38515
38516 // Make sure the new mask can be represented by a combination of mask bits
38517 // and non-demanded bits.
38518 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38519 return false;
38520
38521 // Replace the constant with the zero extend mask.
38522 SDLoc DL(Op);
38523 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38524 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38525 return TLO.CombineTo(Op, NewOp);
38526}
38527
38528static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
38529 KnownBits &Known,
38530 const APInt &DemandedElts,
38531 const SelectionDAG &DAG, unsigned Depth) {
38532 KnownBits Known2;
38533 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38534 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38535 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
38536 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
38537 Known = KnownBits::abdu(Known, Known2).zext(16);
38538 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
38539 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38540 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38541 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
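// Each self-addition above models one level of the reduction tree, so after
// three levels the bound covers the sum of all eight byte differences.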
38542 Known = Known.zext(64);
38543}
38544
38545static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,
38546 KnownBits &Known,
38547 const APInt &DemandedElts,
38548 const SelectionDAG &DAG,
38549 unsigned Depth) {
38550 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38551
38552 // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
38553 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38554 APInt DemandedLoElts =
38555 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38556 APInt DemandedHiElts =
38557 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
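// The 0b01/0b10 splats select the even/odd i16 source elements of each pair,
// whose two products are summed into a single i32 result element.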
38558 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38559 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38560 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38561 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38562 KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
38563 KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
38564 Known = KnownBits::add(Lo, Hi, /*NSW=*/false, /*NUW=*/false);
38565}
38566
38567static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS,
38568 KnownBits &Known,
38569 const APInt &DemandedElts,
38570 const SelectionDAG &DAG,
38571 unsigned Depth) {
38572 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38573
38574 // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
38575 // pairs.
38576 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38577 APInt DemandedLoElts =
38578 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38579 APInt DemandedHiElts =
38580 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38581 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38582 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38583 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38584 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38585 KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
38586 KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
38587 Known = KnownBits::sadd_sat(Lo, Hi);
38588}
38589
38590static KnownBits computeKnownBitsForHorizontalOperation(
38591 const SDValue Op, const APInt &DemandedElts, unsigned Depth,
38592 const SelectionDAG &DAG,
38593 const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
38594 KnownBitsFunc) {
38595 APInt DemandedEltsLHS, DemandedEltsRHS;
38596 getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(),
38597 DemandedElts, DemandedEltsLHS,
38598 DemandedEltsRHS);
38599
38600 const auto ComputeForSingleOpFunc =
38601 [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
38602 return KnownBitsFunc(
38603 DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
38604 DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
38605 };
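// Each horizontal result element combines two adjacent elements of one source,
// so the second computeKnownBits call shifts the demanded mask by one to cover
// the other element of each pair.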
38606
38607 if (DemandedEltsRHS.isZero())
38608 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS);
38609 if (DemandedEltsLHS.isZero())
38610 return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS);
38611
38612 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS)
38613 .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS));
38614}
38615
38616void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38617 KnownBits &Known,
38618 const APInt &DemandedElts,
38619 const SelectionDAG &DAG,
38620 unsigned Depth) const {
38621 unsigned BitWidth = Known.getBitWidth();
38622 unsigned NumElts = DemandedElts.getBitWidth();
38623 unsigned Opc = Op.getOpcode();
38624 EVT VT = Op.getValueType();
38625 assert((Opc >= ISD::BUILTIN_OP_END ||
38626 Opc == ISD::INTRINSIC_WO_CHAIN ||
38627 Opc == ISD::INTRINSIC_W_CHAIN ||
38628 Opc == ISD::INTRINSIC_VOID) &&
38629 "Should use MaskedValueIsZero if you don't know whether Op"
38630 " is a target node!");
38631
38632 Known.resetAll();
38633 switch (Opc) {
38634 default: break;
38635 case X86ISD::MUL_IMM: {
38636 KnownBits Known2;
38637 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38638 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38639 Known = KnownBits::mul(Known, Known2);
38640 break;
38641 }
38642 case X86ISD::BSF: {
38643 Known.Zero.setBitsFrom(Log2_32(BitWidth));
38644
38645 KnownBits Known2;
38646 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38647 if (Known2.isNonZero()) {
38648 // If we have a known 1, its position is our upper bound.
38649 unsigned PossibleTZ = Known2.countMaxTrailingZeros();
38650 unsigned LowBits = llvm::bit_width(PossibleTZ);
38651 Known.Zero.setBitsFrom(LowBits);
38652 } else if (!Op.getOperand(0).isUndef()) {
38653 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38654 Known = Known.intersectWith(Known2);
38655 }
38656 break;
38657 }
38658 case X86ISD::BSR: {
38659 // TODO: Bound with input known bits?
38660 Known.Zero.setBitsFrom(Log2_32(BitWidth));
38661
38662 if (!Op.getOperand(0).isUndef() &&
38663 !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38664 KnownBits Known2;
38665 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38666 Known = Known.intersectWith(Known2);
38667 }
38668 break;
38669 }
38670 case X86ISD::SETCC:
38671 Known.Zero.setBitsFrom(1);
38672 break;
38673 case X86ISD::MOVMSK: {
38674 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38675 Known.Zero.setBitsFrom(NumLoBits);
38676 break;
38677 }
38678 case X86ISD::PEXTRB:
38679 case X86ISD::PEXTRW: {
38680 SDValue Src = Op.getOperand(0);
38681 EVT SrcVT = Src.getValueType();
38682 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38683 Op.getConstantOperandVal(1));
38684 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38685 Known = Known.anyextOrTrunc(BitWidth);
38686 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38687 break;
38688 }
38689 case X86ISD::VSRAI:
38690 case X86ISD::VSHLI:
38691 case X86ISD::VSRLI: {
38692 unsigned ShAmt = Op.getConstantOperandVal(1);
38693 if (ShAmt >= VT.getScalarSizeInBits()) {
38694 // Out of range logical bit shifts are guaranteed to be zero.
38695 // Out of range arithmetic bit shifts splat the sign bit.
38696 if (Opc != X86ISD::VSRAI) {
38697 Known.setAllZero();
38698 break;
38699 }
38700
38701 ShAmt = VT.getScalarSizeInBits() - 1;
38702 }
38703
38704 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38705 if (Opc == X86ISD::VSHLI) {
38706 Known <<= ShAmt;
38707 // Low bits are known zero.
38708 Known.Zero.setLowBits(ShAmt);
38709 } else if (Opc == X86ISD::VSRLI) {
38710 Known >>= ShAmt;
38711 // High bits are known zero.
38712 Known.Zero.setHighBits(ShAmt);
38713 } else {
38714 Known.Zero.ashrInPlace(ShAmt);
38715 Known.One.ashrInPlace(ShAmt);
38716 }
38717 break;
38718 }
38719 case X86ISD::PACKUS: {
38720 // PACKUS is just a truncation if the upper half is zero.
38721 APInt DemandedLHS, DemandedRHS;
38722 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38723
38724 Known.One = APInt::getAllOnes(BitWidth * 2);
38725 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38726
38727 KnownBits Known2;
38728 if (!!DemandedLHS) {
38729 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38730 Known = Known.intersectWith(Known2);
38731 }
38732 if (!!DemandedRHS) {
38733 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38734 Known = Known.intersectWith(Known2);
38735 }
38736
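// PACKUS saturates, so the pack only behaves like a truncation when the upper
// half of every demanded source element is known zero; otherwise give up.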
38737 if (Known.countMinLeadingZeros() < BitWidth)
38738 Known.resetAll();
38739 Known = Known.trunc(BitWidth);
38740 break;
38741 }
38742 case X86ISD::PSHUFB: {
38743 SDValue Src = Op.getOperand(0);
38744 SDValue Idx = Op.getOperand(1);
38745
38746 // If the index vector is never negative (MSB is zero), then all elements
38747 // come from the source vector. This is useful for cases where
38748 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
38749 // below will handle the more common constant shuffle mask case.
38750 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
38751 if (KnownIdx.isNonNegative())
38752 Known = DAG.computeKnownBits(Src, Depth + 1);
38753 break;
38754 }
38755 case X86ISD::VBROADCAST: {
38756 SDValue Src = Op.getOperand(0);
38757 if (!Src.getSimpleValueType().isVector()) {
38758 Known = DAG.computeKnownBits(Src, Depth + 1);
38759 return;
38760 }
38761 break;
38762 }
38763 case X86ISD::AND: {
38764 if (Op.getResNo() == 0) {
38765 KnownBits Known2;
38766 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38767 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38768 Known &= Known2;
38769 }
38770 break;
38771 }
38772 case X86ISD::ANDNP: {
38773 KnownBits Known2;
38774 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38775 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38776
38777 // ANDNP = (~X & Y);
38778 Known.One &= Known2.Zero;
38779 Known.Zero |= Known2.One;
38780 break;
38781 }
38782 case X86ISD::FOR: {
38783 KnownBits Known2;
38784 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38785 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38786
38787 Known |= Known2;
38788 break;
38789 }
38790 case X86ISD::PSADBW: {
38791 SDValue LHS = Op.getOperand(0);
38792 SDValue RHS = Op.getOperand(1);
38793 assert(VT.getScalarType() == MVT::i64 &&
38794 LHS.getValueType() == RHS.getValueType() &&
38795 LHS.getValueType().getScalarType() == MVT::i8 &&
38796 "Unexpected PSADBW types");
38797 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38798 break;
38799 }
38800 case X86ISD::PCMPGT:
38801 case X86ISD::PCMPEQ: {
38802 KnownBits KnownLhs =
38803 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38804 KnownBits KnownRhs =
38805 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38806 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38807 ? KnownBits::eq(KnownLhs, KnownRhs)
38808 : KnownBits::sgt(KnownLhs, KnownRhs);
38809 if (Res) {
38810 if (*Res)
38811 Known.setAllOnes();
38812 else
38813 Known.setAllZero();
38814 }
38815 break;
38816 }
38817 case X86ISD::VPMADDWD: {
38818 SDValue LHS = Op.getOperand(0);
38819 SDValue RHS = Op.getOperand(1);
38820 assert(VT.getVectorElementType() == MVT::i32 &&
38821 LHS.getValueType() == RHS.getValueType() &&
38822 LHS.getValueType().getVectorElementType() == MVT::i16 &&
38823 "Unexpected PMADDWD types");
38824 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38825 break;
38826 }
38827 case X86ISD::VPMADDUBSW: {
38828 SDValue LHS = Op.getOperand(0);
38829 SDValue RHS = Op.getOperand(1);
38830 assert(VT.getVectorElementType() == MVT::i16 &&
38831 LHS.getValueType() == RHS.getValueType() &&
38832 LHS.getValueType().getVectorElementType() == MVT::i8 &&
38833 "Unexpected PMADDUBSW types");
38834 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38835 break;
38836 }
38837 case X86ISD::PMULUDQ: {
38838 KnownBits Known2;
38839 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38840 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38841
38842 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38843 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38844 Known = KnownBits::mul(Known, Known2);
38845 break;
38846 }
38847 case X86ISD::CMOV: {
38848 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38849 // If we don't know any bits, early out.
38850 if (Known.isUnknown())
38851 break;
38852 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38853
38854 // Only known if known in both the LHS and RHS.
38855 Known = Known.intersectWith(Known2);
38856 break;
38857 }
38858 case X86ISD::BEXTR:
38859 case X86ISD::BEXTRI: {
38860 SDValue Op0 = Op.getOperand(0);
38861 SDValue Op1 = Op.getOperand(1);
38862
38863 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38864 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38865 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38866
38867 // If the length is 0, the result is 0.
38868 if (Length == 0) {
38869 Known.setAllZero();
38870 break;
38871 }
38872
38873 if ((Shift + Length) <= BitWidth) {
38874 Known = DAG.computeKnownBits(Op0, Depth + 1);
38875 Known = Known.extractBits(Length, Shift);
38876 Known = Known.zextOrTrunc(BitWidth);
38877 }
38878 }
38879 break;
38880 }
38881 case X86ISD::PDEP: {
38882 KnownBits Known2;
38883 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38884 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38885 // Zeros are retained from the mask operand. But not ones.
38886 Known.One.clearAllBits();
38887 // The result will have at least as many trailing zeros as the non-mask
38888 // operand since bits can only map to the same or higher bit position.
38889 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38890 break;
38891 }
38892 case X86ISD::PEXT: {
38893 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38894 // The result has as many leading zeros as the number of zeroes in the mask.
38895 unsigned Count = Known.Zero.popcount();
38896 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38897 Known.One.clearAllBits();
38898 break;
38899 }
38900 case X86ISD::VTRUNC:
38901 case X86ISD::VTRUNCS:
38902 case X86ISD::VTRUNCUS:
38903 case X86ISD::CVTSI2P:
38904 case X86ISD::CVTUI2P:
38905 case X86ISD::CVTP2SI:
38906 case X86ISD::CVTP2UI:
38907 case X86ISD::MCVTP2SI:
38908 case X86ISD::MCVTP2UI:
38909 case X86ISD::CVTTP2SI:
38910 case X86ISD::CVTTP2UI:
38911 case X86ISD::MCVTTP2SI:
38912 case X86ISD::MCVTTP2UI:
38913 case X86ISD::MCVTSI2P:
38914 case X86ISD::MCVTUI2P:
38915 case X86ISD::VFPROUND:
38916 case X86ISD::VMFPROUND:
38917 case X86ISD::CVTPS2PH:
38918 case X86ISD::MCVTPS2PH:
38919 case X86ISD::MCVTTP2SIS:
38920 case X86ISD::MCVTTP2UIS: {
38921 // Truncations/Conversions - upper elements are known zero.
38922 EVT SrcVT = Op.getOperand(0).getValueType();
38923 if (SrcVT.isVector()) {
38924 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38925 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38926 Known.setAllZero();
38927 }
38928 break;
38929 }
38930 case X86ISD::STRICT_CVTTP2SI:
38931 case X86ISD::STRICT_CVTTP2UI:
38932 case X86ISD::STRICT_CVTSI2P:
38933 case X86ISD::STRICT_CVTUI2P:
38934 case X86ISD::STRICT_VFPROUND:
38935 case X86ISD::STRICT_CVTPS2PH: {
38936 // Strict Conversions - upper elements are known zero.
38937 EVT SrcVT = Op.getOperand(1).getValueType();
38938 if (SrcVT.isVector()) {
38939 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38940 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38941 Known.setAllZero();
38942 }
38943 break;
38944 }
38945 case X86ISD::MOVQ2DQ: {
38946 // Move from MMX to XMM. Upper half of XMM should be 0.
38947 if (DemandedElts.countr_zero() >= (NumElts / 2))
38948 Known.setAllZero();
38949 break;
38950 }
38951 case X86ISD::VBROADCAST_LOAD: {
38952 APInt UndefElts;
38953 SmallVector<APInt, 16> EltBits;
38954 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38955 /*AllowWholeUndefs*/ false,
38956 /*AllowPartialUndefs*/ false)) {
38957 Known.Zero.setAllBits();
38958 Known.One.setAllBits();
38959 for (unsigned I = 0; I != NumElts; ++I) {
38960 if (!DemandedElts[I])
38961 continue;
38962 if (UndefElts[I]) {
38963 Known.resetAll();
38964 break;
38965 }
38966 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38967 Known = Known.intersectWith(Known2);
38968 }
38969 return;
38970 }
38971 break;
38972 }
38973 case X86ISD::HADD:
38974 case X86ISD::HSUB: {
38975 Known = computeKnownBitsForHorizontalOperation(
38976 Op, DemandedElts, Depth, DAG,
38977 [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
38978 return KnownBits::computeForAddSub(
38979 /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
38980 KnownLHS, KnownRHS);
38981 });
38982 break;
38983 }
38984 case ISD::INTRINSIC_WO_CHAIN: {
38985 switch (Op->getConstantOperandVal(0)) {
38986 case Intrinsic::x86_sse2_pmadd_wd:
38987 case Intrinsic::x86_avx2_pmadd_wd:
38988 case Intrinsic::x86_avx512_pmaddw_d_512: {
38989 SDValue LHS = Op.getOperand(1);
38990 SDValue RHS = Op.getOperand(2);
38991 assert(VT.getScalarType() == MVT::i32 &&
38992 LHS.getValueType() == RHS.getValueType() &&
38993 LHS.getValueType().getScalarType() == MVT::i16 &&
38994 "Unexpected PMADDWD types");
38995 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38996 break;
38997 }
38998 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
38999 case Intrinsic::x86_avx2_pmadd_ub_sw:
39000 case Intrinsic::x86_avx512_pmaddubs_w_512: {
39001 SDValue LHS = Op.getOperand(1);
39002 SDValue RHS = Op.getOperand(2);
39003 assert(VT.getScalarType() == MVT::i16 &&
39004 LHS.getValueType() == RHS.getValueType() &&
39005 LHS.getValueType().getScalarType() == MVT::i8 &&
39006 "Unexpected PMADDUBSW types");
39007 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
39008 break;
39009 }
39010 case Intrinsic::x86_sse2_psad_bw:
39011 case Intrinsic::x86_avx2_psad_bw:
39012 case Intrinsic::x86_avx512_psad_bw_512: {
39013 SDValue LHS = Op.getOperand(1);
39014 SDValue RHS = Op.getOperand(2);
39015 assert(VT.getScalarType() == MVT::i64 &&
39016 LHS.getValueType() == RHS.getValueType() &&
39017 LHS.getValueType().getScalarType() == MVT::i8 &&
39018 "Unexpected PSADBW types");
39019 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
39020 break;
39021 }
39022 }
39023 break;
39024 }
39025 case X86ISD::VPMADD52L:
39026 case X86ISD::VPMADD52H: {
39027 assert(Op.getValueType().isVector() &&
39028 Op.getValueType().getScalarType() == MVT::i64 &&
39029 "Unexpected VPMADD52 type");
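// VPMADD52L/H multiply the low 52 bits of the two operands (a 104-bit product) and
// add the low/high 52 bits of that product to the 64-bit accumulator, so the multiply
// is modelled on the truncated 52-bit inputs below.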
39030 KnownBits K0 =
39031 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
39032 KnownBits K1 =
39033 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
39034 KnownBits KAcc =
39035 DAG.computeKnownBits(Op.getOperand(2), DemandedElts, Depth + 1);
39036 K0 = K0.trunc(52);
39037 K1 = K1.trunc(52);
39038 KnownBits KnownMul = (Op.getOpcode() == X86ISD::VPMADD52L)
39039 ? KnownBits::mul(K0, K1)
39040 : KnownBits::mulhu(K0, K1);
39041 KnownMul = KnownMul.zext(64);
39042 Known = KnownBits::add(KAcc, KnownMul);
39043 return;
39044 }
39045 }
39046
39047 // Handle target shuffles.
39048 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39049 if (isTargetShuffle(Opc)) {
39050 SmallVector<int, 64> Mask;
39051 SmallVector<SDValue, 2> Ops;
39052 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39053 unsigned NumOps = Ops.size();
39054 unsigned NumElts = VT.getVectorNumElements();
39055 if (Mask.size() == NumElts) {
39056 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39057 Known.Zero.setAllBits(); Known.One.setAllBits();
39058 for (unsigned i = 0; i != NumElts; ++i) {
39059 if (!DemandedElts[i])
39060 continue;
39061 int M = Mask[i];
39062 if (M == SM_SentinelUndef) {
39063 // For UNDEF elements, we don't know anything about the common state
39064 // of the shuffle result.
39065 Known.resetAll();
39066 break;
39067 }
39068 if (M == SM_SentinelZero) {
39069 Known.One.clearAllBits();
39070 continue;
39071 }
39072 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39073 "Shuffle index out of range");
39074
39075 unsigned OpIdx = (unsigned)M / NumElts;
39076 unsigned EltIdx = (unsigned)M % NumElts;
39077 if (Ops[OpIdx].getValueType() != VT) {
39078 // TODO - handle target shuffle ops with different value types.
39079 Known.resetAll();
39080 break;
39081 }
39082 DemandedOps[OpIdx].setBit(EltIdx);
39083 }
39084 // Known bits are the values that are shared by every demanded element.
39085 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
39086 if (!DemandedOps[i])
39087 continue;
39088 KnownBits Known2 =
39089 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
39090 Known = Known.intersectWith(Known2);
39091 }
39092 }
39093 }
39094 }
39095}
39096
39097 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
39098 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
39099 unsigned Depth) const {
39100 EVT VT = Op.getValueType();
39101 unsigned VTBits = VT.getScalarSizeInBits();
39102 unsigned Opcode = Op.getOpcode();
39103 switch (Opcode) {
39104 case X86ISD::SETCC_CARRY:
39105 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
39106 return VTBits;
39107
39108 case X86ISD::VTRUNC: {
39109 SDValue Src = Op.getOperand(0);
39110 MVT SrcVT = Src.getSimpleValueType();
39111 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
39112 assert(VTBits < NumSrcBits && "Illegal truncation input type");
39113 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
39114 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
39115 if (Tmp > (NumSrcBits - VTBits))
39116 return Tmp - (NumSrcBits - VTBits);
39117 return 1;
39118 }
39119
39120 case X86ISD::PACKSS: {
39121 // PACKSS is just a truncation if the sign bits extend to the packed size.
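// e.g. PACKSSDW sources with 20 sign bits each behave as an i16 truncation, leaving
// 20 - (32 - 16) = 4 sign bits in each packed element.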
39122 APInt DemandedLHS, DemandedRHS;
39123 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
39124 DemandedRHS);
39125
39126 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
39127 // patterns often used to compact vXi64 allsignbit patterns.
39128 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
39129 SDValue BC = peekThroughBitcasts(V);
39130 if (BC.getOpcode() == X86ISD::PACKSS &&
39131 BC.getScalarValueSizeInBits() == 16 &&
39132 V.getScalarValueSizeInBits() == 32) {
39133 SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
39134 SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
39135 if (BC0.getScalarValueSizeInBits() == 64 &&
39136 BC1.getScalarValueSizeInBits() == 64 &&
39137 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
39138 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
39139 return 32;
39140 }
39141 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
39142 };
39143
39144 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
39145 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
39146 if (!!DemandedLHS)
39147 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
39148 if (!!DemandedRHS)
39149 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
39150 unsigned Tmp = std::min(Tmp0, Tmp1);
39151 if (Tmp > (SrcBits - VTBits))
39152 return Tmp - (SrcBits - VTBits);
39153 return 1;
39154 }
39155
39156 case X86ISD::VBROADCAST: {
39157 SDValue Src = Op.getOperand(0);
39158 if (!Src.getSimpleValueType().isVector())
39159 return DAG.ComputeNumSignBits(Src, Depth + 1);
39160 break;
39161 }
39162
39163 case X86ISD::VSHLI: {
39164 SDValue Src = Op.getOperand(0);
39165 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
39166 if (ShiftVal.uge(VTBits))
39167 return VTBits; // Shifted all bits out --> zero.
39168 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39169 if (ShiftVal.uge(Tmp))
39170 return 1; // Shifted all sign bits out --> unknown.
39171 return Tmp - ShiftVal.getZExtValue();
39172 }
39173
39174 case X86ISD::VSRAI: {
39175 SDValue Src = Op.getOperand(0);
39176 APInt ShiftVal = Op.getConstantOperandAPInt(1);
39177 if (ShiftVal.uge(VTBits - 1))
39178 return VTBits; // Sign splat.
39179 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39180 ShiftVal += Tmp;
39181 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
39182 }
39183
39184 case X86ISD::FSETCC:
39185 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
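// Such an element is either all-zeros or all-ones, so every bit is a sign bit.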
39186 if (VT == MVT::f32 || VT == MVT::f64 ||
39187 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
39188 return VTBits;
39189 break;
39190
39191 case X86ISD::PCMPGT:
39192 case X86ISD::PCMPEQ:
39193 case X86ISD::CMPP:
39194 case X86ISD::VPCOM:
39195 case X86ISD::VPCOMU:
39196 // Vector compares return zero/all-bits result values.
39197 return VTBits;
39198
39199 case X86ISD::ANDNP: {
39200 unsigned Tmp0 =
39201 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
39202 if (Tmp0 == 1) return 1; // Early out.
39203 unsigned Tmp1 =
39204 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
39205 return std::min(Tmp0, Tmp1);
39206 }
39207
39208 case X86ISD::CMOV: {
39209 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
39210 if (Tmp0 == 1) return 1; // Early out.
39211 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
39212 return std::min(Tmp0, Tmp1);
39213 }
39214 }
39215
39216 // Handle target shuffles.
39217 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39218 if (isTargetShuffle(Opcode)) {
39219 SmallVector<int, 64> Mask;
39220 SmallVector<SDValue, 2> Ops;
39221 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39222 unsigned NumOps = Ops.size();
39223 unsigned NumElts = VT.getVectorNumElements();
39224 if (Mask.size() == NumElts) {
39225 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39226 for (unsigned i = 0; i != NumElts; ++i) {
39227 if (!DemandedElts[i])
39228 continue;
39229 int M = Mask[i];
39230 if (M == SM_SentinelUndef) {
39231 // For UNDEF elements, we don't know anything about the common state
39232 // of the shuffle result.
39233 return 1;
39234 } else if (M == SM_SentinelZero) {
39235 // Zero = all sign bits.
39236 continue;
39237 }
39238 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39239 "Shuffle index out of range");
39240
39241 unsigned OpIdx = (unsigned)M / NumElts;
39242 unsigned EltIdx = (unsigned)M % NumElts;
39243 if (Ops[OpIdx].getValueType() != VT) {
39244 // TODO - handle target shuffle ops with different value types.
39245 return 1;
39246 }
39247 DemandedOps[OpIdx].setBit(EltIdx);
39248 }
39249 unsigned Tmp0 = VTBits;
39250 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
39251 if (!DemandedOps[i])
39252 continue;
39253 unsigned Tmp1 =
39254 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
39255 Tmp0 = std::min(Tmp0, Tmp1);
39256 }
39257 return Tmp0;
39258 }
39259 }
39260 }
39261
39262 // Fallback case.
39263 return 1;
39264}
39265
39266 static SDValue unwrapAddress(SDValue N) {
39267 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
39268 return N->getOperand(0);
39269 return N;
39270}
39271
39272// Helper to look for a normal load that can be narrowed into a vzload with the
39273// specified VT and memory VT. Returns SDValue() on failure.
39274 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
39275 SelectionDAG &DAG) {
39276 // Can't if the load is volatile or atomic.
39277 if (!LN->isSimple())
39278 return SDValue();
39279
39280 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39281 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39282 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
39283 LN->getPointerInfo(), LN->getBaseAlign(),
39284 LN->getMemOperand()->getFlags());
39285}
39286
39287// Attempt to match a combined shuffle mask against supported unary shuffle
39288// instructions.
39289// TODO: Investigate sharing more of this with shuffle lowering.
39290static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39291 bool AllowFloatDomain, bool AllowIntDomain,
39292 SDValue V1, const SelectionDAG &DAG,
39293 const X86Subtarget &Subtarget, unsigned &Shuffle,
39294 MVT &SrcVT, MVT &DstVT) {
39295 unsigned NumMaskElts = Mask.size();
39296 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
39297
39298 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
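// i.e. a mask that keeps element 0 and zeroes (or leaves undef) all upper elements,
// matching the MOVSS/MOVQ-style move-with-zero behaviour of VZEXT_MOVL.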
39299 if (Mask[0] == 0 &&
39300 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
39301 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
39302 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39303 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
39304 Shuffle = X86ISD::VZEXT_MOVL;
39305 if (MaskEltSize == 16)
39306 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39307 else
39308 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39309 return true;
39310 }
39311 }
39312
39313 // Match against an ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
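// e.g. for v8i16, {0,Z,1,Z,2,Z,3,Z} is a zero-extend of the low four elements to i32,
// {0,U,1,U,...} an any-extend, and {0,0,1,1,...} acts as a sign-extend when each source
// element is already a sign-splat.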
39314 if (AllowIntDomain &&
39315 ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
39316 (MaskVT.is256BitVector() && Subtarget.hasInt256()) ||
39317 (MaskVT.is512BitVector() && Subtarget.useAVX512Regs()))) {
39318 unsigned MaxScale = 64 / MaskEltSize;
39319 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
39320 DAG.ComputeNumSignBits(V1) == MaskEltSize;
39321 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
39322 // Skip 512-bit VPMOV?XBW on non-AVX512BW targets.
39323 if (Scale == 2 && MaskVT == MVT::v64i8 && !Subtarget.useBWIRegs())
39324 continue;
39325 bool MatchAny = true;
39326 bool MatchZero = true;
39327 bool MatchSign = UseSign;
39328 unsigned NumDstElts = NumMaskElts / Scale;
39329 for (unsigned i = 0;
39330 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
39331 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
39332 MatchAny = MatchSign = MatchZero = false;
39333 break;
39334 }
39335 unsigned Pos = (i * Scale) + 1;
39336 unsigned Len = Scale - 1;
39337 MatchAny &= isUndefInRange(Mask, Pos, Len);
39338 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
39339 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
39340 }
39341 if (MatchAny || MatchSign || MatchZero) {
39342 assert((MatchSign || MatchZero) &&
39343 "Failed to match sext/zext but matched aext?");
39344 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
39345 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
39346 : MVT::getIntegerVT(MaskEltSize);
39347 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
39348
39349 Shuffle = unsigned(
39350 MatchAny ? ISD::ANY_EXTEND
39351 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
39352 if (SrcVT.getVectorNumElements() != NumDstElts)
39353 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
39354
39355 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
39356 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
39357 return true;
39358 }
39359 }
39360 }
39361
39362 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
39363 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
39364 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
39365 isUndefOrEqual(Mask[0], 0) &&
39366 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
39367 Shuffle = X86ISD::VZEXT_MOVL;
39368 if (MaskEltSize == 16)
39369 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39370 else
39371 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39372 return true;
39373 }
39374
39375 // Check if we have SSE3 which will let us use MOVDDUP etc. These
39376 // instructions are no slower than UNPCKLPD but can fold their input
39377 // operand even from an unaligned memory load.
39378 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39379 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39380 Shuffle = X86ISD::MOVDDUP;
39381 SrcVT = DstVT = MVT::v2f64;
39382 return true;
39383 }
39384 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39385 Shuffle = X86ISD::MOVSLDUP;
39386 SrcVT = DstVT = MVT::v4f32;
39387 return true;
39388 }
39389 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39390 Shuffle = X86ISD::MOVSHDUP;
39391 SrcVT = DstVT = MVT::v4f32;
39392 return true;
39393 }
39394 }
39395
39396 if (MaskVT.is256BitVector() && AllowFloatDomain) {
39397 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39398 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39399 Shuffle = X86ISD::MOVDDUP;
39400 SrcVT = DstVT = MVT::v4f64;
39401 return true;
39402 }
39403 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39404 V1)) {
39405 Shuffle = X86ISD::MOVSLDUP;
39406 SrcVT = DstVT = MVT::v8f32;
39407 return true;
39408 }
39409 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39410 V1)) {
39411 Shuffle = X86ISD::MOVSHDUP;
39412 SrcVT = DstVT = MVT::v8f32;
39413 return true;
39414 }
39415 }
39416
39417 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39418 assert(Subtarget.hasAVX512() &&
39419 "AVX512 required for 512-bit vector shuffles");
39420 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39421 V1)) {
39422 Shuffle = X86ISD::MOVDDUP;
39423 SrcVT = DstVT = MVT::v8f64;
39424 return true;
39425 }
39426 if (isTargetShuffleEquivalent(
39427 MaskVT, Mask,
39428 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39429 Shuffle = X86ISD::MOVSLDUP;
39430 SrcVT = DstVT = MVT::v16f32;
39431 return true;
39432 }
39433 if (isTargetShuffleEquivalent(
39434 MaskVT, Mask,
39435 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39436 Shuffle = X86ISD::MOVSHDUP;
39437 SrcVT = DstVT = MVT::v16f32;
39438 return true;
39439 }
39440 }
39441
39442 return false;
39443}
39444
39445// Attempt to match a combined shuffle mask against supported unary immediate
39446// permute instructions.
39447// TODO: Investigate sharing more of this with shuffle lowering.
39448 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39449 const APInt &Zeroable,
39450 bool AllowFloatDomain, bool AllowIntDomain,
39451 const SelectionDAG &DAG,
39452 const X86Subtarget &Subtarget,
39453 unsigned &Shuffle, MVT &ShuffleVT,
39454 unsigned &PermuteImm) {
39455 unsigned NumMaskElts = Mask.size();
39456 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39457 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39458 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39459 bool ContainsZeros = isAnyZero(Mask);
39460
39461 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39462 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39463 // Check for lane crossing permutes.
39464 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39465 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39466 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39467 Shuffle = X86ISD::VPERMI;
39468 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39469 PermuteImm = getV4X86ShuffleImm(Mask);
39470 return true;
39471 }
39472 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39473 SmallVector<int, 4> RepeatedMask;
39474 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39475 Shuffle = X86ISD::VPERMI;
39476 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39477 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39478 return true;
39479 }
39480 }
39481 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39482 // VPERMILPD can permute with a non-repeating shuffle.
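// The VPERMILPD immediate uses one bit per f64 element to select the low/high element
// within that 128-bit lane, hence the (M & 1) << i encoding below.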
39483 Shuffle = X86ISD::VPERMILPI;
39484 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39485 PermuteImm = 0;
39486 for (int i = 0, e = Mask.size(); i != e; ++i) {
39487 int M = Mask[i];
39488 if (M == SM_SentinelUndef)
39489 continue;
39490 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39491 PermuteImm |= (M & 1) << i;
39492 }
39493 return true;
39494 }
39495 }
39496
39497 // We are checking for a shuffle match or a shift match. Loop twice so we
39498 // can choose which to try to match first, depending on target preference.
39499 for (unsigned Order = 0; Order < 2; ++Order) {
39500 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39501 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39502 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39503 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39504 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39505 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39506 SmallVector<int, 4> RepeatedMask;
39507 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39508 // Narrow the repeated mask to create 32-bit element permutes.
39509 SmallVector<int, 4> WordMask = RepeatedMask;
39510 if (MaskScalarSizeInBits == 64)
39511 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39512
39513 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39514 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39515 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39516 PermuteImm = getV4X86ShuffleImm(WordMask);
39517 return true;
39518 }
39519 }
39520
39521 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39522 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39523 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39524 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39525 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39526 SmallVector<int, 4> RepeatedMask;
39527 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39528 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39529 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39530
39531 // PSHUFLW: permute lower 4 elements only.
39532 if (isUndefOrInRange(LoMask, 0, 4) &&
39533 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39534 Shuffle = X86ISD::PSHUFLW;
39535 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39536 PermuteImm = getV4X86ShuffleImm(LoMask);
39537 return true;
39538 }
39539
39540 // PSHUFHW: permute upper 4 elements only.
39541 if (isUndefOrInRange(HiMask, 4, 8) &&
39542 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39543 // Offset the HiMask so that we can create the shuffle immediate.
39544 int OffsetHiMask[4];
39545 for (int i = 0; i != 4; ++i)
39546 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39547
39548 Shuffle = X86ISD::PSHUFHW;
39549 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39550 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39551 return true;
39552 }
39553 }
39554 }
39555 } else {
39556 // Attempt to match against bit rotates.
39557 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39558 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39559 Subtarget.hasAVX512())) {
39560 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39561 Subtarget, Mask);
39562 if (0 < RotateAmt) {
39563 Shuffle = X86ISD::VROTLI;
39564 PermuteImm = (unsigned)RotateAmt;
39565 return true;
39566 }
39567 }
39568 }
39569 // Attempt to match against byte/bit shifts.
39570 if (AllowIntDomain &&
39571 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39572 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39573 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39574 int ShiftAmt =
39575 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39576 Zeroable, Subtarget);
39577 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39578 32 <= ShuffleVT.getScalarSizeInBits())) {
39579 // Byte shifts can be slower so only match them on second attempt.
39580 if (Order == 0 &&
39581 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39582 continue;
39583
39584 PermuteImm = (unsigned)ShiftAmt;
39585 return true;
39586 }
39587
39588 }
39589 }
39590
39591 return false;
39592}
39593
39594// Attempt to match a combined unary shuffle mask against supported binary
39595// shuffle instructions.
39596// TODO: Investigate sharing more of this with shuffle lowering.
39597static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39598 bool AllowFloatDomain, bool AllowIntDomain,
39599 SDValue &V1, SDValue &V2, const SDLoc &DL,
39600 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39601 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39602 bool IsUnary) {
39603 unsigned NumMaskElts = Mask.size();
39604 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39605 unsigned SizeInBits = MaskVT.getSizeInBits();
39606
39607 if (MaskVT.is128BitVector()) {
39608 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39609 AllowFloatDomain) {
39610 V2 = V1;
39611 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39612 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39613 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39614 return true;
39615 }
39616 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39617 AllowFloatDomain) {
39618 V2 = V1;
39619 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39620 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39621 return true;
39622 }
39623 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39624 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39625 std::swap(V1, V2);
39626 Shuffle = X86ISD::MOVSD;
39627 SrcVT = DstVT = MVT::v2f64;
39628 return true;
39629 }
39630 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39631 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39632 Shuffle = X86ISD::MOVSS;
39633 SrcVT = DstVT = MVT::v4f32;
39634 return true;
39635 }
39636 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39637 DAG) &&
39638 Subtarget.hasFP16()) {
39639 Shuffle = X86ISD::MOVSH;
39640 SrcVT = DstVT = MVT::v8f16;
39641 return true;
39642 }
39643 }
39644
39645 // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
39646 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39647 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39648 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39649 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39650 Subtarget)) {
39651 DstVT = MaskVT;
39652 return true;
39653 }
39654 }
39655 // TODO: Can we handle this inside matchShuffleWithPACK?
39656 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
39657 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
39658 V1.getScalarValueSizeInBits() == 64 &&
39659 V2.getScalarValueSizeInBits() == 64) {
39660 // Use (SSE41) PACKUSDW if the leading zero bits go to the lowest 16 bits.
39661 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
39662 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
39663 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
39664 SrcVT = MVT::v4i32;
39665 DstVT = MVT::v8i16;
39666 Shuffle = X86ISD::PACKUS;
39667 return true;
39668 }
39669 // Use PACKUSWB if the leading zero bits go to the lowest 8 bits.
39670 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
39671 SrcVT = MVT::v8i16;
39672 DstVT = MVT::v16i8;
39673 Shuffle = X86ISD::PACKUS;
39674 return true;
39675 }
39676 // Use PACKSSDW if the sign bits extend to the lowest 16 bits.
39677 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
39678 SrcVT = MVT::v4i32;
39679 DstVT = MVT::v8i16;
39680 Shuffle = X86ISD::PACKSS;
39681 return true;
39682 }
39683 }
39684
39685 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39686 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39687 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39688 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39689 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39690 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
39691 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
39692 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39693 Subtarget)) {
39694 SrcVT = DstVT = MaskVT;
39695 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39696 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39697 return true;
39698 }
39699 }
39700
39701 // Attempt to match against an OR if we're performing a blend shuffle and the
39702 // non-blended source element is zero in each case.
39703 // TODO: Handle cases where V1/V2 sizes don't match SizeInBits.
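// e.g. if every lane taken from V1 is known zero in V2 (and vice versa), and zeroed
// lanes are zero in both sources, the blend is equivalent to OR(V1, V2).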
39704 if (SizeInBits == V1.getValueSizeInBits() &&
39705 SizeInBits == V2.getValueSizeInBits() &&
39706 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39707 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39708 bool IsBlend = true;
39709 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39710 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39711 unsigned Scale1 = NumV1Elts / NumMaskElts;
39712 unsigned Scale2 = NumV2Elts / NumMaskElts;
39713 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39714 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39715 for (unsigned i = 0; i != NumMaskElts; ++i) {
39716 int M = Mask[i];
39717 if (M == SM_SentinelUndef)
39718 continue;
39719 if (M == SM_SentinelZero) {
39720 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39721 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39722 continue;
39723 }
39724 if (M == (int)i) {
39725 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39726 continue;
39727 }
39728 if (M == (int)(i + NumMaskElts)) {
39729 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39730 continue;
39731 }
39732 IsBlend = false;
39733 break;
39734 }
39735 if (IsBlend) {
39736 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39737 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39738 Shuffle = ISD::OR;
39739 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39740 return true;
39741 }
39742 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39743 // FIXME: handle mismatched sizes?
39744 // TODO: investigate if `ISD::OR` handling in
39745 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39746 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39747 unsigned NumElts = V.getValueType().getVectorNumElements();
39748 KnownBits Known(NumElts);
39749 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39750 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39751 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39752 if (PeepholeKnown.isZero())
39753 Known.Zero.setBit(EltIdx);
39754 if (PeepholeKnown.isAllOnes())
39755 Known.One.setBit(EltIdx);
39756 }
39757 return Known;
39758 };
39759
39760 KnownBits V1Known = computeKnownBitsElementWise(V1);
39761 KnownBits V2Known = computeKnownBitsElementWise(V2);
39762
39763 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39764 int M = Mask[i];
39765 if (M == SM_SentinelUndef)
39766 continue;
39767 if (M == SM_SentinelZero) {
39768 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39769 continue;
39770 }
39771 if (M == (int)i) {
39772 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39773 continue;
39774 }
39775 if (M == (int)(i + NumMaskElts)) {
39776 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39777 continue;
39778 }
39779 llvm_unreachable("will not get here.");
39780 }
39781 if (IsBlend) {
39782 Shuffle = ISD::OR;
39783 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39784 return true;
39785 }
39786 }
39787 }
39788 }
39789
39790 return false;
39791}
39792
39793 static bool matchBinaryPermuteShuffle(
39794 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39795 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39796 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39797 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39798 unsigned NumMaskElts = Mask.size();
39799 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39800
39801 // Attempt to match against VALIGND/VALIGNQ rotate.
39802 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39803 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39804 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39805 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39806 MVT AlignVT = MVT::getVectorVT(MVT::getIntegerVT(EltSizeInBits),
39807 MaskVT.getSizeInBits() / EltSizeInBits);
39808 if (!isAnyZero(Mask)) {
39809 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39810 if (0 < Rotation) {
39811 Shuffle = X86ISD::VALIGN;
39812 ShuffleVT = AlignVT;
39813 PermuteImm = Rotation;
39814 return true;
39815 }
39816 }
39817 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
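// e.g. a mask that shifts ZeroLo zero elements in at the bottom can be expressed as a
// VALIGN rotate against a zero vector (PermuteImm = NumMaskElts - ZeroLo); the ZeroHi
// case is handled symmetrically below.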
39818 unsigned ZeroLo = Zeroable.countr_one();
39819 unsigned ZeroHi = Zeroable.countl_one();
39820 assert((ZeroLo + ZeroHi) < NumMaskElts && "Zeroable shuffle detected");
39821 if (ZeroLo) {
39822 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39823 std::iota(ShiftMask.begin() + ZeroLo, ShiftMask.end(), 0);
39824 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39825 V2 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39826 Shuffle = X86ISD::VALIGN;
39827 ShuffleVT = AlignVT;
39828 PermuteImm = NumMaskElts - ZeroLo;
39829 return true;
39830 }
39831 }
39832 if (ZeroHi) {
39833 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39834 std::iota(ShiftMask.begin(), ShiftMask.begin() + NumMaskElts - ZeroHi,
39835 ZeroHi);
39836 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39837 V2 = V1;
39838 V1 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39839 Shuffle = X86ISD::VALIGN;
39840 ShuffleVT = AlignVT;
39841 PermuteImm = ZeroHi;
39842 return true;
39843 }
39844 }
39845 }
39846
39847 // Attempt to match against PALIGNR byte rotate.
39848 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39849 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39850 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39851 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39852 if (0 < ByteRotation) {
39853 Shuffle = X86ISD::PALIGNR;
39854 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39855 PermuteImm = ByteRotation;
39856 return true;
39857 }
39858 }
39859
39860 // Attempt to combine to X86ISD::BLENDI.
39861 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39862 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39863 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39864 uint64_t BlendMask = 0;
39865 bool ForceV1Zero = false, ForceV2Zero = false;
39866 SmallVector<int, 8> TargetMask(Mask);
39867 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39868 ForceV2Zero, BlendMask)) {
39869 if (MaskVT == MVT::v16i16) {
39870 // We can only use v16i16 PBLENDW if the lanes are repeated.
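// (the AVX2 VPBLENDW immediate is only 8 bits and is applied to each 128-bit lane
// separately, so both lanes must use the same selection.)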
39871 SmallVector<int, 8> RepeatedMask;
39872 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39873 RepeatedMask)) {
39874 assert(RepeatedMask.size() == 8 &&
39875 "Repeated mask size doesn't match!");
39876 PermuteImm = 0;
39877 for (int i = 0; i < 8; ++i)
39878 if (RepeatedMask[i] >= 8)
39879 PermuteImm |= 1 << i;
39880 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39881 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39882 Shuffle = X86ISD::BLENDI;
39883 ShuffleVT = MaskVT;
39884 return true;
39885 }
39886 } else {
39887 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39888 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39889 PermuteImm = (unsigned)BlendMask;
39890 Shuffle = X86ISD::BLENDI;
39891 ShuffleVT = MaskVT;
39892 return true;
39893 }
39894 }
39895 }
39896
39897 // Attempt to combine to INSERTPS, but only if it has elements that need to
39898 // be set to zero.
39899 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39900 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39901 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39902 Shuffle = X86ISD::INSERTPS;
39903 ShuffleVT = MVT::v4f32;
39904 return true;
39905 }
39906
39907 // Attempt to combine to SHUFPD.
39908 if (AllowFloatDomain && EltSizeInBits == 64 &&
39909 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39910 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39911 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39912 bool ForceV1Zero = false, ForceV2Zero = false;
39913 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39914 PermuteImm, Mask, Zeroable)) {
39915 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39916 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39917 Shuffle = X86ISD::SHUFP;
39918 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39919 return true;
39920 }
39921 }
39922
39923 // Attempt to combine to SHUFPS.
39924 if (AllowFloatDomain && EltSizeInBits == 32 &&
39925 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39926 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39927 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39928 SmallVector<int, 4> RepeatedMask;
39929 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39930 // Match each half of the repeated mask, to determine if it's just
39931 // referencing one of the vectors, is zeroable, or is entirely undef.
39932 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39933 int M0 = RepeatedMask[Offset];
39934 int M1 = RepeatedMask[Offset + 1];
39935
39936 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39937 return DAG.getUNDEF(MaskVT);
39938 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39939 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39940 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39941 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39942 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39943 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39944 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39945 return V1;
39946 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39947 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39948 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39949 return V2;
39950 }
39951
39952 return SDValue();
39953 };
39954
39955 int ShufMask[4] = {-1, -1, -1, -1};
39956 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39957 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39958
39959 if (Lo && Hi) {
39960 V1 = Lo;
39961 V2 = Hi;
39962 Shuffle = X86ISD::SHUFP;
39963 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39964 PermuteImm = getV4X86ShuffleImm(ShufMask);
39965 return true;
39966 }
39967 }
39968 }
39969
39970 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39971 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39972 MaskVT.is128BitVector() &&
39973 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39974 Shuffle = X86ISD::INSERTPS;
39975 ShuffleVT = MVT::v4f32;
39976 return true;
39977 }
39978
39979 return false;
39980}
39981
39982 static SDValue combineX86ShuffleChainWithExtract(
39983 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
39984 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39985 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39986 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39987 const X86Subtarget &Subtarget);
39988
39989/// Combine an arbitrary chain of shuffles into a single instruction if
39990/// possible.
39991///
39992/// This is the leaf of the recursive combine below. When we have found some
39993/// chain of single-use x86 shuffle instructions and accumulated the combined
39994/// shuffle mask represented by them, this will try to pattern match that mask
39995/// into either a single instruction if there is a special purpose instruction
39996/// for this operation, or into a PSHUFB instruction which is a fully general
39997/// instruction but should only be used to replace chains over a certain depth.
39998 static SDValue combineX86ShuffleChain(
39999 ArrayRef<SDValue> Inputs, unsigned RootOpc, MVT RootVT,
40000 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
40001 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
40002 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
40003 const X86Subtarget &Subtarget) {
40004 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
40005 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
40006 "Unexpected number of shuffle inputs!");
40007 unsigned RootSizeInBits = RootVT.getSizeInBits();
40008 unsigned NumRootElts = RootVT.getVectorNumElements();
40009
40010 // Canonicalize shuffle input op to the requested type.
40011 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
40012 if (VT.getSizeInBits() > Op.getValueSizeInBits())
40013 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
40014 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
40015 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
40016 return DAG.getBitcast(VT, Op);
40017 };
40018
40019 // Find the inputs that enter the chain. Note that multiple uses are OK
40020 // here, we're not going to remove the operands we find.
40021 bool UnaryShuffle = (Inputs.size() == 1);
40022 SDValue V1 = peekThroughBitcasts(Inputs[0]);
40023 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
40024 : peekThroughBitcasts(Inputs[1]));
40025
40026 MVT VT1 = V1.getSimpleValueType();
40027 MVT VT2 = V2.getSimpleValueType();
40028 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
40029 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
40030
40031 SDValue Res;
40032
40033 unsigned NumBaseMaskElts = BaseMask.size();
40034 if (NumBaseMaskElts == 1) {
40035 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
40036 return CanonicalizeShuffleInput(RootVT, V1);
40037 }
40038
40039 bool OptForSize = DAG.shouldOptForSize();
40040 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
40041 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
40042 (RootVT.isFloatingPoint() && Depth >= 1) ||
40043 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
40044
40045 // If we are shuffling a splat (and not introducing zeros) then we can just
40046 // use it directly. This also works for smaller elements, since they already
40047 // repeat across each mask element.
40048 if (UnaryShuffle && !isAnyZero(BaseMask) &&
40049 V1.getValueSizeInBits() >= RootSizeInBits &&
40050 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
40051 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
40052 return CanonicalizeShuffleInput(RootVT, V1);
40053 }
40054
40055 SmallVector<int, 64> Mask(BaseMask);
40056
40057 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
40058 // etc. can be simplified.
40059 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
40060 SmallVector<int> ScaledMask, IdentityMask;
40061 unsigned NumElts = VT1.getVectorNumElements();
40062 if (Mask.size() <= NumElts &&
40063 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
40064 for (unsigned i = 0; i != NumElts; ++i)
40065 IdentityMask.push_back(i);
40066 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
40067 V2))
40068 return CanonicalizeShuffleInput(RootVT, V1);
40069 }
40070 }
40071
40072 // Handle 128/256-bit lane shuffles of 512-bit vectors.
40073 if (RootVT.is512BitVector() &&
40074 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
40075 // If the upper subvectors are zeroable, then an extract+insert is more
40076 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
40077 // to zero the upper subvectors.
40078 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
40079 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40080 return SDValue(); // Nothing to do!
40081 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
40082 "Unexpected lane shuffle");
40083 Res = CanonicalizeShuffleInput(RootVT, V1);
40084 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
40085 bool UseZero = isAnyZero(Mask);
40086 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
40087 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
40088 }
40089
40090 // Narrow shuffle mask to v4x128.
40091 SmallVector<int, 4> ScaledMask;
40092 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
40093 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
40094
40095 // Try to lower to vshuf64x2/vshuf32x4.
40096 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
40097 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
40098 SelectionDAG &DAG) {
40099 int PermMask[4] = {-1, -1, -1, -1};
40100 // Ensure elements came from the same Op.
40101 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
40102 for (int i = 0; i < 4; ++i) {
40103 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
40104 if (ScaledMask[i] < 0)
40105 continue;
40106
40107 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
40108 unsigned OpIndex = i / 2;
40109 if (Ops[OpIndex].isUndef())
40110 Ops[OpIndex] = Op;
40111 else if (Ops[OpIndex] != Op)
40112 return SDValue();
40113
40114 PermMask[i] = ScaledMask[i] % 4;
40115 }
40116
40117 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
40118 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
40119 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
40120 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
40121 };
40122
40123 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
40124 // doesn't work because our mask is for 128 bits and we don't have an MVT
40125 // to match that.
40126 bool PreferPERMQ = UnaryShuffle && !isFreeToSplitVector(V1, DAG) &&
40127 isUndefOrInRange(ScaledMask[0], 0, 2) &&
40128 isUndefOrInRange(ScaledMask[1], 0, 2) &&
40129 isUndefOrInRange(ScaledMask[2], 2, 4) &&
40130 isUndefOrInRange(ScaledMask[3], 2, 4) &&
40131 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
40132 ScaledMask[0] == (ScaledMask[2] % 2)) &&
40133 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
40134 ScaledMask[1] == (ScaledMask[3] % 2));
40135
40136 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
40137 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40138 return SDValue(); // Nothing to do!
40139 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
40140 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
40141 return DAG.getBitcast(RootVT, V);
40142 }
40143 }
40144
40145 // Handle 128-bit lane shuffles of 256-bit vectors.
40146 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
40147 // If the upper half is zeroable, then an extract+insert is more optimal
40148 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
40149 // zero the upper half.
40150 if (isUndefOrZero(Mask[1])) {
40151 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40152 return SDValue(); // Nothing to do!
40153 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
40154 Res = CanonicalizeShuffleInput(RootVT, V1);
40155 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
40156 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
40157 256);
40158 }
40159
40160 // If we're inserting the low subvector, an insert-subvector 'concat'
40161 // pattern is quicker than VPERM2X128.
40162 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
40163 !Subtarget.hasAVX2()) {
40164 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40165 return SDValue(); // Nothing to do!
40166 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
40167 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
40168 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
40169 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
40170 }
40171
40172 // Don't lower to VPERM2X128 here if we have AVX2+, prefer to use
40173 // VPERMQ/VPERMPD for unary shuffles unless we need to use the zeroing
40174 // feature.
40175 // Prefer blends for sequential shuffles unless we are optimizing for size.
40176 if (UnaryShuffle &&
40177 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
40178 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
40179 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40180 return SDValue(); // Nothing to do!
40181 unsigned PermMask = 0;
40182 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
40183 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
40184 return DAG.getNode(
40185 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
40186 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
40187 }
40188
40189 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40190 return SDValue(); // Nothing to do!
40191
40192 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
40193 if (!UnaryShuffle && !IsMaskedShuffle) {
40194 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
40195 "Unexpected shuffle sentinel value");
40196 // Prefer blends to X86ISD::VPERM2X128.
40197 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
40198 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40199 return SDValue(); // Nothing to do!
40200 unsigned PermMask = 0;
40201 PermMask |= ((Mask[0] & 3) << 0);
40202 PermMask |= ((Mask[1] & 3) << 4);
40203 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
40204 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
40205 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
40206 CanonicalizeShuffleInput(RootVT, LHS),
40207 CanonicalizeShuffleInput(RootVT, RHS),
40208 DAG.getTargetConstant(PermMask, DL, MVT::i8));
40209 }
40210 }
40211 }
40212
40213 // For masks that have been widened to 128-bit elements or more,
40214 // narrow back down to 64-bit elements.
40215 if (BaseMaskEltSizeInBits > 64) {
40216 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
40217 int MaskScale = BaseMaskEltSizeInBits / 64;
40218 SmallVector<int, 64> ScaledMask;
40219 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40220 Mask = std::move(ScaledMask);
40221 }
40222
40223 // For masked shuffles, we're trying to match the root width for better
40224 // writemask folding, attempt to scale the mask.
40225 // TODO - variable shuffles might need this to be widened again.
40226 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
40227 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
40228 int MaskScale = NumRootElts / Mask.size();
40229 SmallVector<int, 64> ScaledMask;
40230 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40231 Mask = std::move(ScaledMask);
40232 }
40233
40234 unsigned NumMaskElts = Mask.size();
40235 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
40236 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40237
40238 // Determine the effective mask value type.
40239 FloatDomain &= (32 <= MaskEltSizeInBits);
40240 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
40241 : MVT::getIntegerVT(MaskEltSizeInBits);
40242 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
40243
40244 // Only allow legal mask types.
40245 if (!TLI.isTypeLegal(MaskVT))
40246 return SDValue();
40247
40248 // Attempt to match the mask against known shuffle patterns.
40249 MVT ShuffleSrcVT, ShuffleVT;
40250 unsigned Shuffle, PermuteImm;
40251
40252 // Which shuffle domains are permitted?
40253 // Permit domain crossing at higher combine depths.
40254 // TODO: Should we indicate which domain is preferred if both are allowed?
40255 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
40256 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
40257 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
40258
40259 // Determine zeroable mask elements.
40260 APInt KnownUndef, KnownZero;
40261 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
40262 APInt Zeroable = KnownUndef | KnownZero;
40263
40264 if (UnaryShuffle) {
40265 // Attempt to match against broadcast-from-vector.
40266 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
40267 if ((Subtarget.hasAVX2() ||
40268 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
40269 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
40270 if (isUndefOrEqual(Mask, 0)) {
40271 if (V1.getValueType() == MaskVT &&
40272 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40273 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
40274 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40275 return SDValue(); // Nothing to do!
40276 Res = V1.getOperand(0);
40277 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40278 return DAG.getBitcast(RootVT, Res);
40279 }
40280 if (Subtarget.hasAVX2()) {
40281 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40282 return SDValue(); // Nothing to do!
40283 Res = CanonicalizeShuffleInput(MaskVT, V1);
40284 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40285 return DAG.getBitcast(RootVT, Res);
40286 }
40287 }
40288 }
40289
40290 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
40291 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
40292 (!IsMaskedShuffle ||
40293 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40294 if (Depth == 0 && RootOpc == Shuffle)
40295 return SDValue(); // Nothing to do!
40296 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40297 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
40298 return DAG.getBitcast(RootVT, Res);
40299 }
40300
40301 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40302 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
40303 PermuteImm) &&
40304 (!IsMaskedShuffle ||
40305 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40306 if (Depth == 0 && RootOpc == Shuffle)
40307 return SDValue(); // Nothing to do!
40308 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
40309 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
40310 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40311 return DAG.getBitcast(RootVT, Res);
40312 }
40313 }
40314
40315 // Attempt to combine to INSERTPS, but only if the inserted element has come
40316 // from a scalar.
40317 // TODO: Handle other insertions here as well?
40318 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
40319 Subtarget.hasSSE41() &&
40320 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
40321 if (MaskEltSizeInBits == 32) {
40322 SDValue SrcV1 = V1, SrcV2 = V2;
40323 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
40324 DAG) &&
40325 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40326 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40327 return SDValue(); // Nothing to do!
40328 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40329 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
40330 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
40331 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40332 return DAG.getBitcast(RootVT, Res);
40333 }
40334 }
40335 if (MaskEltSizeInBits == 64 &&
40336 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
40337 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40338 V2.getScalarValueSizeInBits() <= 32) {
40339 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40340 return SDValue(); // Nothing to do!
40341 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
40342 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40343 CanonicalizeShuffleInput(MVT::v4f32, V1),
40344 CanonicalizeShuffleInput(MVT::v4f32, V2),
40345 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40346 return DAG.getBitcast(RootVT, Res);
40347 }
40348 }
40349
40350 SDValue NewV1 = V1; // Save operands in case early exit happens.
40351 SDValue NewV2 = V2;
40352 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
40353 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
40354 ShuffleVT, UnaryShuffle) &&
40355 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40356 if (Depth == 0 && RootOpc == Shuffle)
40357 return SDValue(); // Nothing to do!
40358 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
40359 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
40360 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
40361 return DAG.getBitcast(RootVT, Res);
40362 }
40363
40364 NewV1 = V1; // Save operands in case early exit happens.
40365 NewV2 = V2;
40366 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40367 AllowIntDomain, NewV1, NewV2, DL, DAG,
40368 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
40369 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40370 if (Depth == 0 && RootOpc == Shuffle)
40371 return SDValue(); // Nothing to do!
40372 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
40373 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
40374 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
40375 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40376 return DAG.getBitcast(RootVT, Res);
40377 }
40378
40379 // Typically from here on, we need an integer version of MaskVT.
40380 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
40381 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
40382
40383 // Annoyingly, SSE4A instructions don't map into the above match helpers.
40384 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
40385 uint64_t BitLen, BitIdx;
40386 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
40387 Zeroable)) {
40388 if (Depth == 0 && RootOpc == X86ISD::EXTRQI)
40389 return SDValue(); // Nothing to do!
40390 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40391 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
40392 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40393 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40394 return DAG.getBitcast(RootVT, Res);
40395 }
40396
40397 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
40398 if (Depth == 0 && RootOpc == X86ISD::INSERTQI)
40399 return SDValue(); // Nothing to do!
40400 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40401 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
40402 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
40403 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40404 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40405 return DAG.getBitcast(RootVT, Res);
40406 }
40407 }
40408
40409 // Match shuffle against TRUNCATE patterns.
40410 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
40411 // Match against a VTRUNC instruction, accounting for src/dst sizes.
40412 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
40413 Subtarget)) {
40414 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
40415 ShuffleSrcVT.getVectorNumElements();
40416 unsigned Opc =
40417 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40418 if (Depth == 0 && RootOpc == Opc)
40419 return SDValue(); // Nothing to do!
40420 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40421 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40422 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40423 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40424 return DAG.getBitcast(RootVT, Res);
40425 }
40426
40427 // Do we need a more general binary truncation pattern?
40428 if (RootSizeInBits < 512 &&
40429 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40430 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40431 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40432 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40433 // Bail if this was already a truncation or PACK node.
40434 // We sometimes fail to match PACK if we demand known undef elements.
40435 if (Depth == 0 &&
40436 (RootOpc == ISD::TRUNCATE || RootOpc == X86ISD::PACKSS ||
40437 RootOpc == X86ISD::PACKUS))
40438 return SDValue(); // Nothing to do!
40439 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40440 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40441 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40442 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40443 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40444 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40445 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40446 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40447 return DAG.getBitcast(RootVT, Res);
40448 }
40449 }
40450
40451 // Don't try to re-form single instruction chains under any circumstances now
40452 // that we've done encoding canonicalization for them.
40453 if (Depth < 1)
40454 return SDValue();
40455
40456 int NumVariableMasks = llvm::count_if(SrcNodes, [](const SDNode *N) {
40457 return isTargetShuffleVariableMask(N->getOpcode());
40458 });
40459 bool HasSlowVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
40460 return (N->getOpcode() == X86ISD::VPERMV3 ||
40461 N->getOpcode() == X86ISD::VPERMV);
40462 });
40463
40464 // Depth threshold above which we can efficiently use variable mask shuffles.
40465 int VariableCrossLaneShuffleDepth =
40466 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40467 int VariablePerLaneShuffleDepth =
40468 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40469 AllowVariableCrossLaneMask &=
40470 (Depth >= VariableCrossLaneShuffleDepth) || NumVariableMasks;
40471 AllowVariablePerLaneMask &=
40472 (Depth >= VariablePerLaneShuffleDepth) || NumVariableMasks;
40473 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40474 // higher depth before combining them.
40475 int BWIVPERMV3ShuffleDepth =
40476 VariableCrossLaneShuffleDepth + 2 - NumVariableMasks;
40477 bool AllowBWIVPERMV3 =
40478 (Depth >= BWIVPERMV3ShuffleDepth || HasSlowVariableMask);
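// For example, on a target with neither "fast variable" feature and with no
// variable-mask shuffles already in the chain, cross-lane/per-lane variable
// masks are only allowed once Depth >= 2, and the 16/8-bit VPERMV3 variants
// only once Depth >= 4 (unless a slow variable mask is already present).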
40479
40480 // If root was a VPERMV/VPERMV3 node, always allow a variable shuffle.
40481 if ((UnaryShuffle && RootOpc == X86ISD::VPERMV) || RootOpc == X86ISD::VPERMV3)
40482 AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
40483
40484 bool MaskContainsZeros = isAnyZero(Mask);
40485
40486 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40487 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40488 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40489 if (Subtarget.hasAVX2() &&
40490 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40491 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40492 Res = CanonicalizeShuffleInput(MaskVT, V1);
40493 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40494 return DAG.getBitcast(RootVT, Res);
40495 }
40496 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40497 if ((Subtarget.hasAVX512() &&
40498 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40499 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40500 (Subtarget.hasBWI() &&
40501 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40502 (Subtarget.hasVBMI() &&
40503 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40504 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40505 V2 = DAG.getUNDEF(MaskVT);
40506 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40507 return DAG.getBitcast(RootVT, Res);
40508 }
40509 }
40510
40511 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40512 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40513 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40514 ((Subtarget.hasAVX512() &&
40515 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40516 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40517 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40518 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40519 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40520 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40521 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40522 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40523 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40524 for (unsigned i = 0; i != NumMaskElts; ++i)
40525 if (Mask[i] == SM_SentinelZero)
40526 Mask[i] = NumMaskElts + i;
40527 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40528 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40529 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40530 return DAG.getBitcast(RootVT, Res);
40531 }
40532
40533 // If that failed and either input is extracted then try to combine as a
40534 // shuffle with the larger type.
40535 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40536 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40537 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40538 IsMaskedShuffle, DAG, DL, Subtarget))
40539 return WideShuffle;
40540
40541 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40542 // (non-VLX will pad to 512-bit shuffles).
40543 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40544 ((Subtarget.hasAVX512() &&
40545 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40546 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40547 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40548 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40549 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40550 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40551 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40552 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40553 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40554 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40555 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40556 return DAG.getBitcast(RootVT, Res);
40557 }
40558 return SDValue();
40559 }
40560
40561 // See if we can combine a single input shuffle with zeros to a bit-mask,
40562 // which is much simpler than any shuffle.
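// For example, a v4i32 mask {0, SM_SentinelZero, 2, SM_SentinelZero} becomes
// an AND with the constant {-1, 0, -1, 0}, clearing lanes 1 and 3 without
// needing any shuffle at all.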
40563 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40564 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40565 TLI.isTypeLegal(MaskVT)) {
40566 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40567 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40568 APInt UndefElts(NumMaskElts, 0);
40569 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40570 for (unsigned i = 0; i != NumMaskElts; ++i) {
40571 int M = Mask[i];
40572 if (M == SM_SentinelUndef) {
40573 UndefElts.setBit(i);
40574 continue;
40575 }
40576 if (M == SM_SentinelZero)
40577 continue;
40578 EltBits[i] = AllOnes;
40579 }
40580 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40581 Res = CanonicalizeShuffleInput(MaskVT, V1);
40582 unsigned AndOpcode =
40583 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40584 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40585 return DAG.getBitcast(RootVT, Res);
40586 }
40587
40588 // If we have a single input shuffle with different shuffle patterns in the
40589 // 128-bit lanes, lower to VPERMILPS with a variable mask.
40590 // TODO Combine other mask types at higher depths.
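// For example, a v8f32 mask {1,0,3,2,7,6,5,4} has different per-lane
// patterns, so immediate VPERMILPS cannot express it, but the variable
// control vector {1,0,3,2,3,2,1,0} (each index taken modulo 4) can.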
40591 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40592 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40593 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40594 SmallVector<SDValue, 16> VPermIdx;
40595 for (int M : Mask) {
40596 SDValue Idx =
40597 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40598 VPermIdx.push_back(Idx);
40599 }
40600 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40601 Res = CanonicalizeShuffleInput(MaskVT, V1);
40602 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40603 return DAG.getBitcast(RootVT, Res);
40604 }
40605
40606 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40607 // to VPERMIL2PD/VPERMIL2PS.
40608 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40609 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40610 MaskVT == MVT::v8f32)) {
40611 // VPERMIL2 Operation.
40612 // Bits[3] - Match Bit.
40613 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40614 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
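// For example (illustrative values), a v4f32 mask {0, 5, SM_SentinelZero, 3}
// becomes the selector vector {0, 5, 8, 3} with M2Z immediate 2: selectors
// 0-3 pick from V1, 4-7 from V2, and 8 sets the match bit so that element
// is forced to zero.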
40615 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40616 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40617 SmallVector<int, 8> VPerm2Idx;
40618 unsigned M2ZImm = 0;
40619 for (int M : Mask) {
40620 if (M == SM_SentinelUndef) {
40621 VPerm2Idx.push_back(-1);
40622 continue;
40623 }
40624 if (M == SM_SentinelZero) {
40625 M2ZImm = 2;
40626 VPerm2Idx.push_back(8);
40627 continue;
40628 }
40629 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40630 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40631 VPerm2Idx.push_back(Index);
40632 }
40633 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40634 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40635 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40636 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40637 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40638 return DAG.getBitcast(RootVT, Res);
40639 }
40640
40641 // If we have 3 or more shuffle instructions or a chain involving a variable
40642 // mask, we can replace them with a single PSHUFB instruction profitably.
40643 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40644 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40645 // more aggressive.
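// For example, a v8i16 mask {3, SM_SentinelZero, 0, 1, ...} expands to the
// byte selectors {6,7, 0x80,0x80, 0,1, 2,3, ...}: each word index M becomes
// bytes 2*M and 2*M+1, and 0x80 (top bit set) makes PSHUFB emit a zero byte.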
40646 if (UnaryShuffle && AllowVariablePerLaneMask &&
40647 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40648 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40649 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40650 SmallVector<SDValue, 16> PSHUFBMask;
40651 int NumBytes = RootVT.getSizeInBits() / 8;
40652 int Ratio = NumBytes / NumMaskElts;
40653 for (int i = 0; i < NumBytes; ++i) {
40654 int M = Mask[i / Ratio];
40655 if (M == SM_SentinelUndef) {
40656 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40657 continue;
40658 }
40659 if (M == SM_SentinelZero) {
40660 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40661 continue;
40662 }
40663 M = Ratio * M + i % Ratio;
40664 assert((M / 16) == (i / 16) && "Lane crossing detected");
40665 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40666 }
40667 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40668 Res = CanonicalizeShuffleInput(ByteVT, V1);
40669 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40670 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40671 return DAG.getBitcast(RootVT, Res);
40672 }
40673
40674 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40675 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40676 // slower than PSHUFB on targets that support both.
40677 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40678 Subtarget.hasXOP()) {
40679 // VPPERM Mask Operation
40680 // Bits[4:0] - Byte Index (0 - 31)
40681 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
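// For example, byte selectors {0, 16, 1, 17, ...} interleave the low bytes of
// the two sources, and 0x80 (operation 4 in bits[7:5]) writes a zero byte.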
40682 SmallVector<SDValue, 16> VPPERMMask;
40683 int NumBytes = 16;
40684 int Ratio = NumBytes / NumMaskElts;
40685 for (int i = 0; i < NumBytes; ++i) {
40686 int M = Mask[i / Ratio];
40687 if (M == SM_SentinelUndef) {
40688 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40689 continue;
40690 }
40691 if (M == SM_SentinelZero) {
40692 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40693 continue;
40694 }
40695 M = Ratio * M + i % Ratio;
40696 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40697 }
40698 MVT ByteVT = MVT::v16i8;
40699 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40700 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40701 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40702 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40703 return DAG.getBitcast(RootVT, Res);
40704 }
40705
40706 // If that failed and either input is extracted then try to combine as a
40707 // shuffle with the larger type.
40708 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40709 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40710 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
40711 DAG, DL, Subtarget))
40712 return WideShuffle;
40713
40714 // If we have a dual input shuffle then lower to VPERMV3,
40715 // (non-VLX will pad to 512-bit shuffles)
40716 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40717 ((Subtarget.hasAVX512() &&
40718 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40719 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40720 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40721 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40722 MaskVT == MVT::v16i32)) ||
40723 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40724 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40725 MaskVT == MVT::v32i16)) ||
40726 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40727 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40728 MaskVT == MVT::v64i8)))) {
40729 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40730 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40731 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40732 return DAG.getBitcast(RootVT, Res);
40733 }
40734
40735 // Failed to find any combines.
40736 return SDValue();
40737}
40738
40739// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40740// instruction if possible.
40741//
40742// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40743// type size to attempt to combine:
40744// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40745// -->
40746// extract_subvector(shuffle(x,y,m2),0)
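// For example (with illustrative types), a v4f32 shuffle of two subvectors
// extracted from the upper halves of v8f32 sources can often be rewritten as
// a single v8f32 shuffle of the original sources, from which the low 128 bits
// are then extracted.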
40747 static SDValue combineX86ShuffleChainWithExtract(
40748 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
40749 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
40750 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
40751 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
40752 const X86Subtarget &Subtarget) {
40753 unsigned NumMaskElts = BaseMask.size();
40754 unsigned NumInputs = Inputs.size();
40755 if (NumInputs == 0)
40756 return SDValue();
40757
40758 unsigned RootSizeInBits = RootVT.getSizeInBits();
40759 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40760 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40761
40762 // Peek through subvectors to find widest legal vector.
40763 // TODO: Handle ISD::TRUNCATE
40764 unsigned WideSizeInBits = RootSizeInBits;
40765 for (SDValue Input : Inputs) {
40766 Input = peekThroughBitcasts(Input);
40767 while (1) {
40768 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40769 Input = peekThroughBitcasts(Input.getOperand(0));
40770 continue;
40771 }
40772 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40773 Input.getOperand(0).isUndef() &&
40774 isNullConstant(Input.getOperand(2))) {
40775 Input = peekThroughBitcasts(Input.getOperand(1));
40776 continue;
40777 }
40778 break;
40779 }
40780 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40781 WideSizeInBits < Input.getValueSizeInBits())
40782 WideSizeInBits = Input.getValueSizeInBits();
40783 }
40784
40785 // Bail if we fail to find a source larger than the existing root.
40786 if (WideSizeInBits <= RootSizeInBits ||
40787 (WideSizeInBits % RootSizeInBits) != 0)
40788 return SDValue();
40789
40790 // Create new mask for larger type.
40791 SmallVector<int, 64> WideMask;
40792 growShuffleMask(BaseMask, WideMask, RootSizeInBits, WideSizeInBits);
40793
40794 // Attempt to peek through inputs and adjust mask when we extract from an
40795 // upper subvector.
40796 int AdjustedMasks = 0;
40797 SmallVector<SDValue, 4> WideInputs(Inputs);
40798 for (unsigned I = 0; I != NumInputs; ++I) {
40799 SDValue &Input = WideInputs[I];
40800 Input = peekThroughBitcasts(Input);
40801 while (1) {
40802 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40803 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40804 uint64_t Idx = Input.getConstantOperandVal(1);
40805 if (Idx != 0) {
40806 ++AdjustedMasks;
40807 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40808 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40809
40810 int lo = I * WideMask.size();
40811 int hi = (I + 1) * WideMask.size();
40812 for (int &M : WideMask)
40813 if (lo <= M && M < hi)
40814 M += Idx;
40815 }
40816 Input = peekThroughBitcasts(Input.getOperand(0));
40817 continue;
40818 }
40819 // TODO: Handle insertions into upper subvectors.
40820 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40821 Input.getOperand(0).isUndef() &&
40822 isNullConstant(Input.getOperand(2))) {
40823 Input = peekThroughBitcasts(Input.getOperand(1));
40824 continue;
40825 }
40826 break;
40827 }
40828 }
40829
40830 // Remove unused/repeated shuffle source ops.
40831 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40832 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40833
40834 // Bail if we're only ever extracting from the lowest subvectors (in which
40835 // case combineX86ShuffleChain should already match this at the current
40836 // width), or if the shuffle still references too many inputs.
40837 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40838 return SDValue();
40839
40840 // Minor canonicalization of the accumulated shuffle mask to make it easier
40841 // to match below. All this does is detect masks with sequential pairs of
40842 // elements, and shrink them to the half-width mask. It does this in a loop
40843 // so it will reduce the size of the mask to the minimal width mask which
40844 // performs an equivalent shuffle.
40845 while (WideMask.size() > 1) {
40846 SmallVector<int, 64> WidenedMask;
40847 if (!canWidenShuffleElements(WideMask, WidenedMask))
40848 break;
40849 WideMask = std::move(WidenedMask);
40850 }
40851
40852 // Canonicalization of binary shuffle masks to improve pattern matching by
40853 // commuting the inputs.
40854 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40855 ShuffleVectorSDNode::commuteMask(WideMask);
40856 std::swap(WideInputs[0], WideInputs[1]);
40857 }
40858
40859 // Increase depth for every upper subvector we've peeked through.
40860 Depth += AdjustedMasks;
40861
40862 // Attempt to combine wider chain.
40863 // TODO: Can we use a better Root?
40864 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40865 WideInputs.back().getValueSizeInBits()
40866 ? WideInputs.front()
40867 : WideInputs.back();
40868 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40869 "WideRootSize mismatch");
40870
40871 if (SDValue WideShuffle = combineX86ShuffleChain(
40872 WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
40873 Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40874 IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
40875 WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
40876 return DAG.getBitcast(RootVT, WideShuffle);
40877 }
40878
40879 return SDValue();
40880}
40881
40882// Canonicalize the combined shuffle mask chain with horizontal ops.
40883// NOTE: This may update the Ops and Mask.
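// Reminder of the hop semantics used below: e.g. for v4f32, HADD(x,y) is
// {x0+x1, x2+x3, y0+y1, y2+y3} (per 128-bit lane for wider types), so a
// shuffle such as {0,1,4,5} of HADD(x,y) and HADD(z,w) is simply HADD(x,z).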
40884 static SDValue canonicalizeShuffleMaskWithHorizOp(
40885 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40886 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40887 const X86Subtarget &Subtarget) {
40888 if (Mask.empty() || Ops.empty())
40889 return SDValue();
40890
40892 for (SDValue Op : Ops)
40893 BC.push_back(peekThroughBitcasts(Op));
40894
40895 // All ops must be the same horizop + type.
40896 SDValue BC0 = BC[0];
40897 EVT VT0 = BC0.getValueType();
40898 unsigned Opcode0 = BC0.getOpcode();
40899 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40900 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40901 }))
40902 return SDValue();
40903
40904 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40905 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40906 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40907 if (!isHoriz && !isPack)
40908 return SDValue();
40909
40910 // Do all ops have a single use?
40911 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40912 return Op.hasOneUse() &&
40913 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
40914 });
40915
40916 int NumElts = VT0.getVectorNumElements();
40917 int NumLanes = VT0.getSizeInBits() / 128;
40918 int NumEltsPerLane = NumElts / NumLanes;
40919 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40920 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40921 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40922
40923 if (NumEltsPerLane >= 4 &&
40924 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40925 SmallVector<int> LaneMask, ScaledMask;
40926 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40927 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40928 // See if we can remove the shuffle by resorting the HOP chain so that
40929 // the HOP args are pre-shuffled.
40930 // TODO: Generalize to any sized/depth chain.
40931 // TODO: Add support for PACKSS/PACKUS.
40932 if (isHoriz) {
40933 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40934 auto GetHOpSrc = [&](int M) {
40935 if (M == SM_SentinelUndef)
40936 return DAG.getUNDEF(VT0);
40937 if (M == SM_SentinelZero)
40938 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40939 SDValue Src0 = BC[M / 4];
40940 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40941 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40942 return Src1.getOperand(M % 2);
40943 return SDValue();
40944 };
40945 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40946 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40947 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40948 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40949 if (M0 && M1 && M2 && M3) {
40950 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40951 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40952 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40953 }
40954 }
40955 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40956 if (Ops.size() >= 2) {
40957 SDValue LHS, RHS;
40958 auto GetHOpSrc = [&](int M, int &OutM) {
40959 // TODO: Support SM_SentinelZero
40960 if (M < 0)
40961 return M == SM_SentinelUndef;
40962 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40963 if (!LHS || LHS == Src) {
40964 LHS = Src;
40965 OutM = (M % 2);
40966 return true;
40967 }
40968 if (!RHS || RHS == Src) {
40969 RHS = Src;
40970 OutM = (M % 2) + 2;
40971 return true;
40972 }
40973 return false;
40974 };
40975 int PostMask[4] = {-1, -1, -1, -1};
40976 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40977 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40978 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40979 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40980 LHS = DAG.getBitcast(SrcVT, LHS);
40981 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40982 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40983 // Use SHUFPS for the permute so this will work on SSE2 targets,
40984 // shuffle combining and domain handling will simplify this later on.
40985 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40986 Res = DAG.getBitcast(ShuffleVT, Res);
40987 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40988 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40989 }
40990 }
40991 }
40992 }
40993
40994 if (2 < Ops.size())
40995 return SDValue();
40996
40997 SDValue BC1 = BC[BC.size() - 1];
40998 if (Mask.size() == VT0.getVectorNumElements()) {
40999 // Canonicalize binary shuffles of horizontal ops that use the
41000 // same sources to a unary shuffle.
41001 // TODO: Try to perform this fold even if the shuffle remains.
41002 if (Ops.size() == 2) {
41003 auto ContainsOps = [](SDValue HOp, SDValue Op) {
41004 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
41005 };
41006 // Commute if all BC0's ops are contained in BC1.
41007 if (ContainsOps(BC1, BC0.getOperand(0)) &&
41008 ContainsOps(BC1, BC0.getOperand(1))) {
41009 ShuffleVectorSDNode::commuteMask(Mask);
41010 std::swap(Ops[0], Ops[1]);
41011 std::swap(BC0, BC1);
41012 }
41013
41014 // If BC1 can be represented by BC0, then convert to unary shuffle.
41015 if (ContainsOps(BC0, BC1.getOperand(0)) &&
41016 ContainsOps(BC0, BC1.getOperand(1))) {
41017 for (int &M : Mask) {
41018 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
41019 continue;
41020 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
41021 M -= NumElts + (SubLane * NumHalfEltsPerLane);
41022 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
41023 M += NumHalfEltsPerLane;
41024 }
41025 }
41026 }
41027
41028 // Canonicalize unary horizontal ops to only refer to lower halves.
41029 for (int i = 0; i != NumElts; ++i) {
41030 int &M = Mask[i];
41031 if (isUndefOrZero(M))
41032 continue;
41033 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
41034 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
41035 M -= NumHalfEltsPerLane;
41036 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
41037 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
41038 M -= NumHalfEltsPerLane;
41039 }
41040 }
41041
41042 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
41043 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
41044 // represents the LHS/RHS inputs for the lower/upper halves.
41045 SmallVector<int, 16> TargetMask128, WideMask128;
41046 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
41047 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
41048 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
41049 bool SingleOp = (Ops.size() == 1);
41050 if (isPack || OneUseOps ||
41051 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
41052 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
41053 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
41054 Lo = Lo.getOperand(WideMask128[0] & 1);
41055 Hi = Hi.getOperand(WideMask128[1] & 1);
41056 if (SingleOp) {
41057 SDValue Undef = DAG.getUNDEF(SrcVT);
41058 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
41059 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
41060 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
41061 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
41062 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
41063 }
41064 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
41065 }
41066 }
41067
41068 // If we are post-shuffling a 256-bit hop and not requiring the upper
41069 // elements, then try to narrow to a 128-bit hop directly.
41070 SmallVector<int, 16> WideMask64;
41071 if (Ops.size() == 1 && NumLanes == 2 &&
41072 scaleShuffleElements(Mask, 4, WideMask64) &&
41073 isUndefInRange(WideMask64, 2, 2)) {
41074 int M0 = WideMask64[0];
41075 int M1 = WideMask64[1];
41076 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
41077 MVT HalfVT = VT0.getSimpleVT().getHalfNumVectorElementsVT();
41078 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41079 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41080 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
41081 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
41082 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
41083 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
41084 }
41085 }
41086
41087 return SDValue();
41088}
41089
41090// Attempt to constant fold all of the constant source ops.
41091// Returns true if the entire shuffle is folded to a constant.
41092// TODO: Extend this to merge multiple constant Ops and update the mask.
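// For example, shuffling the constant vectors {0,1,2,3} and {4,5,6,7} with
// the mask {0,5,2,7} folds directly to the constant {0,5,2,7}; zeroable and
// undef mask entries become zero/undef elements of the new constant.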
41093 static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef<SDValue> Ops,
41094 ArrayRef<int> Mask,
41095 ArrayRef<const SDNode *> SrcNodes,
41096 SelectionDAG &DAG, const SDLoc &DL,
41097 const X86Subtarget &Subtarget) {
41098 unsigned SizeInBits = VT.getSizeInBits();
41099 unsigned NumMaskElts = Mask.size();
41100 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
41101 unsigned NumOps = Ops.size();
41102
41103 // Extract constant bits from each source op.
41104 SmallVector<APInt, 16> UndefEltsOps(NumOps);
41105 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
41106 for (unsigned I = 0; I != NumOps; ++I)
41107 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
41108 RawBitsOps[I],
41109 /*AllowWholeUndefs*/ true,
41110 /*AllowPartialUndefs*/ true))
41111 return SDValue();
41112
41113 // If we're optimizing for size, only fold if at least one of the constants is
41114 // only used once or the combined shuffle has included a variable mask
41115 // shuffle; this is to avoid constant pool bloat.
41116 bool IsOptimizingSize = DAG.shouldOptForSize();
41117 bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
41118 return isTargetShuffleVariableMask(N->getOpcode());
41119 });
41120 if (IsOptimizingSize && !HasVariableMask &&
41121 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
41122 return SDValue();
41123
41124 // Shuffle the constant bits according to the mask.
41125 APInt UndefElts(NumMaskElts, 0);
41126 APInt ZeroElts(NumMaskElts, 0);
41127 APInt ConstantElts(NumMaskElts, 0);
41128 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
41129 APInt::getZero(MaskSizeInBits));
41130 for (unsigned i = 0; i != NumMaskElts; ++i) {
41131 int M = Mask[i];
41132 if (M == SM_SentinelUndef) {
41133 UndefElts.setBit(i);
41134 continue;
41135 } else if (M == SM_SentinelZero) {
41136 ZeroElts.setBit(i);
41137 continue;
41138 }
41139 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
41140
41141 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
41142 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
41143
41144 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
41145 if (SrcUndefElts[SrcMaskIdx]) {
41146 UndefElts.setBit(i);
41147 continue;
41148 }
41149
41150 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
41151 APInt &Bits = SrcEltBits[SrcMaskIdx];
41152 if (!Bits) {
41153 ZeroElts.setBit(i);
41154 continue;
41155 }
41156
41157 ConstantElts.setBit(i);
41158 ConstantBitData[i] = Bits;
41159 }
41160 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
41161
41162 // Attempt to create a zero vector.
41163 if ((UndefElts | ZeroElts).isAllOnes())
41164 return getZeroVector(VT, Subtarget, DAG, DL);
41165
41166 // Create the constant data.
41167 MVT MaskSVT;
41168 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
41169 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
41170 else
41171 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
41172
41173 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
41174 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
41175 return SDValue();
41176
41177 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
41178 return DAG.getBitcast(VT, CstOp);
41179}
41180
41181namespace llvm {
41182 namespace X86 {
41183 enum {
41184 MaxShuffleCombineDepth = 8
41185 };
41186 } // namespace X86
41187} // namespace llvm
41188
41189/// Fully generic combining of x86 shuffle instructions.
41190///
41191/// This should be the last combine run over the x86 shuffle instructions. Once
41192/// they have been fully optimized, this will recursively consider all chains
41193/// of single-use shuffle instructions, build a generic model of the cumulative
41194/// shuffle operation, and check for simpler instructions which implement this
41195/// operation. We use this primarily for two purposes:
41196///
41197/// 1) Collapse generic shuffles to specialized single instructions when
41198/// equivalent. In most cases, this is just an encoding size win, but
41199/// sometimes we will collapse multiple generic shuffles into a single
41200/// special-purpose shuffle.
41201/// 2) Look for sequences of shuffle instructions with 3 or more total
41202/// instructions, and replace them with the slightly more expensive SSSE3
41203/// PSHUFB instruction if available. We do this as the last combining step
41204/// to ensure we avoid using PSHUFB if we can implement the shuffle with
41205/// a suitable short sequence of other instructions. The PSHUFB will either
41206/// use a register or have to read from memory and so is slightly (but only
41207/// slightly) more expensive than the other shuffle instructions.
41208///
41209/// Because this is inherently a quadratic operation (for each shuffle in
41210/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
41211/// This should never be an issue in practice as the shuffle lowering doesn't
41212/// produce sequences of more than 8 instructions.
41213///
41214/// FIXME: We will currently miss some cases where the redundant shuffling
41215/// would simplify under the threshold for PSHUFB formation because of
41216/// combine-ordering. To fix this, we should do the redundant instruction
41217/// combining in this recursive walk.
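/// As a small illustration of the mask merging performed below: a root
/// PSHUFD with mask {2,3,0,1} whose operand is another PSHUFD with mask
/// {1,0,3,2} accumulates to the single mask {3,2,1,0} (the operand mask is
/// applied first, then the root mask), which then matches one PSHUFD.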
41218 static SDValue combineX86ShufflesRecursively(
41219 ArrayRef<SDValue> SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT,
41220 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
41221 unsigned MaxDepth, bool AllowVariableCrossLaneMask,
41222 bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
41223 const SDLoc &DL, const X86Subtarget &Subtarget) {
41224 assert(!RootMask.empty() &&
41225 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
41226 "Illegal shuffle root mask");
41227 assert(RootVT.isVector() && "Shuffles operate on vector types!");
41228 unsigned RootSizeInBits = RootVT.getSizeInBits();
41229 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41230
41231 // Bound the depth of our recursive combine because this is ultimately
41232 // quadratic in nature.
41233 if (Depth >= MaxDepth)
41234 return SDValue();
41235
41236 // Directly rip through bitcasts to find the underlying operand.
41237 SDValue Op = SrcOps[SrcOpIndex];
41238 Op = peekThroughBitcasts(Op);
41239
41240 EVT VT = Op.getValueType();
41241 if (!VT.isVector() || !VT.isSimple())
41242 return SDValue(); // Bail if we hit a non-simple non-vector.
41243
41244 // FIXME: Just bail on f16 for now.
41245 if (VT.getVectorElementType() == MVT::f16)
41246 return SDValue();
41247
41248 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
41249 "Can only combine shuffles upto size of the root op.");
41250
41251 // Create a demanded elts mask from the referenced elements of Op.
41252 APInt OpDemandedElts = APInt::getZero(RootMask.size());
41253 for (int M : RootMask) {
41254 int BaseIdx = RootMask.size() * SrcOpIndex;
41255 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
41256 OpDemandedElts.setBit(M - BaseIdx);
41257 }
41258 if (RootSizeInBits != VT.getSizeInBits()) {
41259 // Op is smaller than Root - extract the demanded elts for the subvector.
41260 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
41261 unsigned NumOpMaskElts = RootMask.size() / Scale;
41262 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
41263 assert(OpDemandedElts
41264 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
41265 .isZero() &&
41266 "Out of range elements referenced in root mask");
41267 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
41268 }
41269 OpDemandedElts =
41270 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
41271
41272 // Extract target shuffle mask and resolve sentinels and inputs.
41273 SmallVector<int, 64> OpMask;
41274 SmallVector<SDValue, 2> OpInputs;
41275 APInt OpUndef, OpZero;
41276 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
41277 OpZero, DAG, Depth, false)) {
41278 // Shuffle inputs must not be larger than the shuffle result.
41279 // TODO: Relax this for single input faux shuffles (e.g. trunc).
41280 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
41281 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
41282 }))
41283 return SDValue();
41284 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41285 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41286 !isNullConstant(Op.getOperand(1))) {
41287 SDValue SrcVec = Op.getOperand(0);
41288 int ExtractIdx = Op.getConstantOperandVal(1);
41289 unsigned NumElts = VT.getVectorNumElements();
41290 OpInputs.assign({SrcVec});
41291 OpMask.assign(NumElts, SM_SentinelUndef);
41292 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
41293 OpZero = OpUndef = APInt::getZero(NumElts);
41294 } else {
41295 return SDValue();
41296 }
41297
41298 // If the shuffle result was smaller than the root, we need to adjust the
41299 // mask indices and pad the mask with undefs.
41300 if (RootSizeInBits > VT.getSizeInBits()) {
41301 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
41302 unsigned OpMaskSize = OpMask.size();
41303 if (OpInputs.size() > 1) {
41304 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
41305 for (int &M : OpMask) {
41306 if (M < 0)
41307 continue;
41308 int EltIdx = M % OpMaskSize;
41309 int OpIdx = M / OpMaskSize;
41310 M = (PaddedMaskSize * OpIdx) + EltIdx;
41311 }
41312 }
41313 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
41314 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
41315 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
41316 }
41317
41318 SmallVector<int, 64> Mask;
41319 SmallVector<SDValue, 16> Ops;
41320
41321 // We don't need to merge masks if the root is empty.
41322 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
41323 if (EmptyRoot) {
41324 // Only resolve zeros if it will remove an input, otherwise we might end
41325 // up in an infinite loop.
41326 bool ResolveKnownZeros = true;
41327 if (!OpZero.isZero()) {
41328 APInt UsedInputs = APInt::getZero(OpInputs.size());
41329 for (int i = 0, e = OpMask.size(); i != e; ++i) {
41330 int M = OpMask[i];
41331 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
41332 continue;
41333 UsedInputs.setBit(M / OpMask.size());
41334 if (UsedInputs.isAllOnes()) {
41335 ResolveKnownZeros = false;
41336 break;
41337 }
41338 }
41339 }
41340 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
41341 ResolveKnownZeros);
41342
41343 Mask = OpMask;
41344 Ops.append(OpInputs.begin(), OpInputs.end());
41345 } else {
41346 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
41347
41348 // Add the inputs to the Ops list, avoiding duplicates.
41349 Ops.append(SrcOps.begin(), SrcOps.end());
41350
41351 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
41352 // Attempt to find an existing match.
41353 SDValue InputBC = peekThroughBitcasts(Input);
41354 for (int i = 0, e = Ops.size(); i < e; ++i)
41355 if (InputBC == peekThroughBitcasts(Ops[i]))
41356 return i;
41357 // Match failed - should we replace an existing Op?
41358 if (InsertionPoint >= 0) {
41359 Ops[InsertionPoint] = Input;
41360 return InsertionPoint;
41361 }
41362 // Add to the end of the Ops list.
41363 Ops.push_back(Input);
41364 return Ops.size() - 1;
41365 };
41366
41367 SmallVector<int, 2> OpInputIdx;
41368 for (SDValue OpInput : OpInputs)
41369 OpInputIdx.push_back(
41370 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
41371
41372 assert(((RootMask.size() > OpMask.size() &&
41373 RootMask.size() % OpMask.size() == 0) ||
41374 (OpMask.size() > RootMask.size() &&
41375 OpMask.size() % RootMask.size() == 0) ||
41376 OpMask.size() == RootMask.size()) &&
41377 "The smaller number of elements must divide the larger.");
41378
41379 // This function can be performance-critical, so we rely on the power-of-2
41380 // knowledge that we have about the mask sizes to replace div/rem ops with
41381 // bit-masks and shifts.
41382 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
41383 "Non-power-of-2 shuffle mask sizes");
41384 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
41385 "Non-power-of-2 shuffle mask sizes");
41386 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
41387 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
41388
41389 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
41390 unsigned RootRatio =
41391 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
41392 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
41393 assert((RootRatio == 1 || OpRatio == 1) &&
41394 "Must not have a ratio for both incoming and op masks!");
41395
41396 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
41397 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
41398 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
41399 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
41400 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
41401
41402 Mask.resize(MaskWidth, SM_SentinelUndef);
41403
41404 // Merge this shuffle operation's mask into our accumulated mask. Note that
41405 // this shuffle's mask will be the first applied to the input, followed by
41406 // the root mask to get us all the way to the root value arrangement. The
41407 // reason for this order is that we are recursing up the operation chain.
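// For example, a root mask {1,0,3,2} at i32 granularity merged with an
// operand whose mask has i16 granularity is first scaled to the i16-level
// indices {2,3,0,1,6,7,4,5}, and each of those is then looked up through the
// operand's own mask.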
41408 for (unsigned i = 0; i < MaskWidth; ++i) {
41409 unsigned RootIdx = i >> RootRatioLog2;
41410 if (RootMask[RootIdx] < 0) {
41411 // This is a zero or undef lane, we're done.
41412 Mask[i] = RootMask[RootIdx];
41413 continue;
41414 }
41415
41416 unsigned RootMaskedIdx =
41417 RootRatio == 1
41418 ? RootMask[RootIdx]
41419 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
41420
41421 // Just insert the scaled root mask value if it references an input other
41422 // than the SrcOp we're currently inserting.
41423 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
41424 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
41425 Mask[i] = RootMaskedIdx;
41426 continue;
41427 }
41428
41429 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
41430 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
41431 if (OpMask[OpIdx] < 0) {
41432 // The incoming lanes are zero or undef, it doesn't matter which ones we
41433 // are using.
41434 Mask[i] = OpMask[OpIdx];
41435 continue;
41436 }
41437
41438 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
41439 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
41440 : (OpMask[OpIdx] << OpRatioLog2) +
41441 (RootMaskedIdx & (OpRatio - 1));
41442
41443 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
41444 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
41445 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
41446 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
41447
41448 Mask[i] = OpMaskedIdx;
41449 }
41450 }
41451
41452 // Peek through any free bitcasts to insert_subvector vector widenings or
41453 // extract_subvector nodes back to root size.
41454 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
41455 for (auto [I, Op] : enumerate(Ops)) {
41456 SDValue BC = Op;
41457 while (1) {
41458 if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse()) {
41459 BC = BC.getOperand(0);
41460 continue;
41461 }
41462 if (BC.getOpcode() == ISD::INSERT_SUBVECTOR &&
41463 BC.getOperand(0).isUndef() && isNullConstant(BC.getOperand(2))) {
41464 // Set out of bounds mask indices to undef.
41465 Op = BC = BC.getOperand(1);
41466 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
41467 int Lo = I * Mask.size();
41468 int Hi = (I + 1) * Mask.size();
41469 int NewHi = Lo + (Mask.size() / Scale);
41470 for (int &M : Mask) {
41471 if (Lo <= M && NewHi <= M && M < Hi)
41472 M = SM_SentinelUndef;
41473 }
41474 continue;
41475 }
41476 if (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41477 (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
41478 isNullConstant(BC.getOperand(1))) {
41479 Op = BC = BC.getOperand(0);
41480 continue;
41481 }
41482 break;
41483 }
41484 }
41485
41486 // Remove unused/repeated shuffle source ops.
41487 resolveTargetShuffleInputsAndMask(Ops, Mask);
41488
41489 // Handle the all undef/zero/ones cases early.
41490 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41491 return DAG.getUNDEF(RootVT);
41492 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41493 return getZeroVector(RootVT, Subtarget, DAG, DL);
41494 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41495 !llvm::is_contained(Mask, SM_SentinelZero))
41496 return getOnesVector(RootVT, DAG, DL);
41497
41498 assert(!Ops.empty() && "Shuffle with no inputs detected");
41499
41500 // Update the list of shuffle nodes that have been combined so far.
41501 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes);
41502 CombinedNodes.push_back(Op.getNode());
41503
41504 // See if we can recurse into each shuffle source op (if it's a target
41505 // shuffle). The source op should only be generally combined if it either has
41506 // a single use (i.e. the current Op) or all its users have already been combined;
41507 // if not, we can still combine but should prevent generation of variable
41508 // shuffles to avoid constant pool bloat.
41509 // Don't recurse if we already have more source ops than we can combine in
41510 // the remaining recursion depth.
41511 if (Ops.size() < (MaxDepth - Depth)) {
41512 for (int i = 0, e = Ops.size(); i < e; ++i) {
41513 // For empty roots, we need to resolve zeroable elements before combining
41514 // them with other shuffles.
41515 SmallVector<int, 64> ResolvedMask = Mask;
41516 if (EmptyRoot)
41517 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41518 bool AllowCrossLaneVar = false;
41519 bool AllowPerLaneVar = false;
41520 if (Ops[i].getNode()->hasOneUse() ||
41521 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41522 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41523 AllowPerLaneVar = AllowVariablePerLaneMask;
41524 }
41525 if (SDValue Res = combineX86ShufflesRecursively(
41526 Ops, i, RootOpc, RootVT, ResolvedMask, CombinedNodes, Depth + 1,
41527 MaxDepth, AllowCrossLaneVar, AllowPerLaneVar, IsMaskedShuffle,
41528 DAG, DL, Subtarget))
41529 return Res;
41530 }
41531 }
41532
41533 // Attempt to constant fold all of the constant source ops.
41534 if (SDValue Cst = combineX86ShufflesConstants(
41535 RootVT, Ops, Mask, CombinedNodes, DAG, DL, Subtarget))
41536 return Cst;
41537
41538 // If constant fold failed and we only have constants - then we have
41539 // multiple uses by a single non-variable shuffle - just bail.
41540 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41541 APInt UndefElts;
41542 SmallVector<APInt> RawBits;
41543 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41544 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41545 RawBits,
41546 /*AllowWholeUndefs*/ true,
41547 /*AllowPartialUndefs*/ true);
41548 })) {
41549 return SDValue();
41550 }
41551
41552 // Canonicalize the combined shuffle mask chain with horizontal ops.
41553 // NOTE: This will update the Ops and Mask.
41554 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41555 Ops, Mask, RootSizeInBits, DL, DAG, Subtarget))
41556 return DAG.getBitcast(RootVT, HOp);
41557
41558 // Try to refine our inputs given our knowledge of target shuffle mask.
41559 for (auto I : enumerate(Ops)) {
41560 int OpIdx = I.index();
41561 SDValue &Op = I.value();
41562
41563 // What range of shuffle mask element values results in picking from Op?
41564 int Lo = OpIdx * Mask.size();
41565 int Hi = Lo + Mask.size();
41566
41567 // Which elements of Op do we demand, given the mask's granularity?
41568 APInt OpDemandedElts(Mask.size(), 0);
41569 for (int MaskElt : Mask) {
41570 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41571 int OpEltIdx = MaskElt - Lo;
41572 OpDemandedElts.setBit(OpEltIdx);
41573 }
41574 }
41575
41576 // Is the shuffle result smaller than the root?
41577 if (Op.getValueSizeInBits() < RootSizeInBits) {
41578 // We padded the mask with undefs. But we now need to undo that.
41579 unsigned NumExpectedVectorElts = Mask.size();
41580 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41581 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41582 assert(!OpDemandedElts.extractBits(
41583 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41584 "Demanding the virtual undef widening padding?");
41585 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41586 }
41587
41588 // The Op itself may be of different VT, so we need to scale the mask.
41589 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41590 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41591
41592 // Can this operand be simplified any further, given its demanded elements?
41593 if (SDValue NewOp = TLI.SimplifyMultipleUseDemandedVectorElts(
41594 Op, OpScaledDemandedElts, DAG))
41595 Op = NewOp;
41596 }
41597 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41598
41599 // Widen any subvector shuffle inputs we've collected.
41600 // TODO: Remove this to avoid generating temporary nodes, we should only
41601 // widen once combineX86ShuffleChain has found a match.
41602 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41603 return Op.getValueSizeInBits() < RootSizeInBits;
41604 })) {
41605 for (SDValue &Op : Ops)
41606 if (Op.getValueSizeInBits() < RootSizeInBits)
41607 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41608 RootSizeInBits);
41609 // Reresolve - we might have repeated subvector sources.
41610 resolveTargetShuffleInputsAndMask(Ops, Mask);
41611 }
41612
41613 // Handle the all undef/zero/ones cases.
41614 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41615 return DAG.getUNDEF(RootVT);
41616 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41617 return getZeroVector(RootVT, Subtarget, DAG, DL);
41618 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41619 !llvm::is_contained(Mask, SM_SentinelZero))
41620 return getOnesVector(RootVT, DAG, DL);
41621
41622 assert(!Ops.empty() && "Shuffle with no inputs detected");
41623
41624 // We can only combine unary and binary shuffle mask cases.
41625 if (Ops.size() <= 2) {
41626 // Minor canonicalization of the accumulated shuffle mask to make it easier
41627 // to match below. All this does is detect masks with sequential pairs of
41628 // elements, and shrink them to the half-width mask. It does this in a loop
41629 // so it will reduce the size of the mask to the minimal width mask which
41630 // performs an equivalent shuffle.
41631 while (Mask.size() > 1) {
41632 SmallVector<int, 64> WidenedMask;
41633 if (!canWidenShuffleElements(Mask, WidenedMask))
41634 break;
41635 Mask = std::move(WidenedMask);
41636 }
41637
41638 // Canonicalization of binary shuffle masks to improve pattern matching by
41639 // commuting the inputs.
41640 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41641 ShuffleVectorSDNode::commuteMask(Mask);
41642 std::swap(Ops[0], Ops[1]);
41643 }
41644
41645 // Try to combine into a single shuffle instruction.
41646 if (SDValue Shuffle = combineX86ShuffleChain(
41647 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41648 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
41649 IsMaskedShuffle, DAG, DL, Subtarget))
41650 return Shuffle;
41651
41652 // If all the operands come from the same larger vector, fallthrough and try
41653 // to use combineX86ShuffleChainWithExtract.
41654 SDValue LHS = peekThroughBitcasts(Ops.front());
41655 SDValue RHS = peekThroughBitcasts(Ops.back());
41656 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41657 (RootSizeInBits / Mask.size()) != 64 ||
41658 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41659 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41660 LHS.getOperand(0) != RHS.getOperand(0))
41661 return SDValue();
41662 }
41663
41664 // If that failed and any input is extracted then try to combine as a
41665 // shuffle with the larger type.
41666 return combineX86ShuffleChainWithExtract(
41667 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41668 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
41669 DAG, DL, Subtarget);
41670}
41671
41672/// Helper entry wrapper to combineX86ShufflesRecursively.
41673 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41674 const X86Subtarget &Subtarget) {
41675 return combineX86ShufflesRecursively(
41676 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), {0}, {}, /*Depth=*/0,
41677 X86::MaxShuffleCombineDepth, /*AllowVariableCrossLaneMask=*/true,
41678 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget), DAG,
41679 SDLoc(Op), Subtarget);
41680}
41681
41682/// Get the PSHUF-style mask from PSHUF node.
41683///
41684 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41685/// PSHUF-style masks that can be reused with such instructions.
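/// For example, a PSHUFHW whose full word mask is {0,1,2,3,7,6,5,4} yields
/// the v4 mask {3,2,1,0} (the low-half identity entries are dropped and the
/// remaining indices are rebased by -4).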
41686 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41687 MVT VT = N.getSimpleValueType();
41688 SmallVector<int, 4> Mask;
41689 SmallVector<SDValue, 2> Ops;
41690 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
41691 (void)HaveMask;
41692 assert(HaveMask);
41693
41694 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41695 // matter. Check that the upper masks are repeats and remove them.
41696 if (VT.getSizeInBits() > 128) {
41697 int LaneElts = 128 / VT.getScalarSizeInBits();
41698#ifndef NDEBUG
41699 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41700 for (int j = 0; j < LaneElts; ++j)
41701 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41702 "Mask doesn't repeat in high 128-bit lanes!");
41703#endif
41704 Mask.resize(LaneElts);
41705 }
41706
41707 switch (N.getOpcode()) {
41708 case X86ISD::PSHUFD:
41709 return Mask;
41710 case X86ISD::PSHUFLW:
41711 Mask.resize(4);
41712 return Mask;
41713 case X86ISD::PSHUFHW:
41714 Mask.erase(Mask.begin(), Mask.begin() + 4);
41715 for (int &M : Mask)
41716 M -= 4;
41717 return Mask;
41718 default:
41719 llvm_unreachable("No valid shuffle instruction found!");
41720 }
41721}
41722
41723/// Get the expanded blend mask from a BLENDI node.
41724/// For v16i16 nodes, this will splat the repeated i8 mask.
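/// For example, a v16i16 BLENDI with the 8-bit immediate 0b00001111 expands
/// to the 16-bit mask 0x0F0F, since the immediate is applied per 128-bit lane.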
41725 static APInt getBLENDIBlendMask(SDValue V) {
41726 assert(V.getOpcode() == X86ISD::BLENDI && "Unknown blend shuffle");
41727 unsigned NumElts = V.getSimpleValueType().getVectorNumElements();
41728 APInt Mask = V.getConstantOperandAPInt(2);
41729 if (Mask.getBitWidth() > NumElts)
41730 Mask = Mask.trunc(NumElts);
41731 if (NumElts == 16) {
41732 assert(Mask.getBitWidth() == 8 && "Unexpected v16i16 blend mask width");
41733 Mask = APInt::getSplat(16, Mask);
41734 }
41735 assert(Mask.getBitWidth() == NumElts && "Unexpected blend mask width");
41736 return Mask;
41737}
41738
41739/// Search for a combinable shuffle across a chain ending in pshufd.
41740///
41741/// We walk up the chain and look for a combinable shuffle, skipping over
41742/// shuffles that we could hoist this shuffle's transformation past without
41743/// altering anything.
41744 static SDValue combineRedundantDWordShuffle(SDValue N,
41745 MutableArrayRef<int> Mask,
41746 const SDLoc &DL,
41747 SelectionDAG &DAG) {
41748 assert(N.getOpcode() == X86ISD::PSHUFD &&
41749 "Called with something other than an x86 128-bit half shuffle!");
41750
41751 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41752 // of the shuffles in the chain so that we can form a fresh chain to replace
41753 // this one.
41754 SmallVector<SDValue, 8> Chain;
41755 SDValue V = N.getOperand(0);
41756 for (; V.hasOneUse(); V = V.getOperand(0)) {
41757 switch (V.getOpcode()) {
41758 default:
41759 return SDValue(); // Nothing combined!
41760
41761 case ISD::BITCAST:
41762 // Skip bitcasts as we always know the type for the target specific
41763 // instructions.
41764 continue;
41765
41766 case X86ISD::PSHUFD:
41767 // Found another dword shuffle.
41768 break;
41769
41770 case X86ISD::PSHUFLW:
41771 // Check that the low words (being shuffled) are the identity in the
41772 // dword shuffle, and the high words are self-contained.
41773 if (Mask[0] != 0 || Mask[1] != 1 ||
41774 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41775 return SDValue();
41776
41777 Chain.push_back(V);
41778 continue;
41779
41780 case X86ISD::PSHUFHW:
41781 // Check that the high words (being shuffled) are the identity in the
41782 // dword shuffle, and the low words are self-contained.
41783 if (Mask[2] != 2 || Mask[3] != 3 ||
41784 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41785 return SDValue();
41786
41787 Chain.push_back(V);
41788 continue;
41789
41790 case X86ISD::UNPCKL:
41791 case X86ISD::UNPCKH:
41792 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41793 // shuffle into a preceding word shuffle.
41794 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41795 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41796 return SDValue();
41797
41798 // Search for a half-shuffle which we can combine with.
41799 unsigned CombineOp =
41800 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41801 if (V.getOperand(0) != V.getOperand(1) ||
41802 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41803 return SDValue();
41804 Chain.push_back(V);
41805 V = V.getOperand(0);
41806 do {
41807 switch (V.getOpcode()) {
41808 default:
41809 return SDValue(); // Nothing to combine.
41810
41811 case X86ISD::PSHUFLW:
41812 case X86ISD::PSHUFHW:
41813 if (V.getOpcode() == CombineOp)
41814 break;
41815
41816 Chain.push_back(V);
41817
41818 [[fallthrough]];
41819 case ISD::BITCAST:
41820 V = V.getOperand(0);
41821 continue;
41822 }
41823 break;
41824 } while (V.hasOneUse());
41825 break;
41826 }
41827 // Break out of the loop if we break out of the switch.
41828 break;
41829 }
41830
41831 if (!V.hasOneUse())
41832 // We fell out of the loop without finding a viable combining instruction.
41833 return SDValue();
41834
41835 // Merge this node's mask and our incoming mask.
41836 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41837 for (int &M : Mask)
41838 M = VMask[M];
41839 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41840 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41841
41842 // Rebuild the chain around this new shuffle.
41843 while (!Chain.empty()) {
41844 SDValue W = Chain.pop_back_val();
41845
41846 if (V.getValueType() != W.getOperand(0).getValueType())
41847 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41848
41849 switch (W.getOpcode()) {
41850 default:
41851 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
41852
41853 case X86ISD::UNPCKL:
41854 case X86ISD::UNPCKH:
41855 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41856 break;
41857
41858 case X86ISD::PSHUFD:
41859 case X86ISD::PSHUFLW:
41860 case X86ISD::PSHUFHW:
41861 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41862 break;
41863 }
41864 }
41865 if (V.getValueType() != N.getValueType())
41866 V = DAG.getBitcast(N.getValueType(), V);
41867
41868 // Return the new chain to replace N.
41869 return V;
41870}
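// For example: pshufd(mask <1,0,3,2>) of pshufd(mask <2,3,0,1>) composes to a
// single pshufd with mask <3,2,1,0> (new Mask[i] = VMask[Mask[i]] above), and
// any skipped PSHUFLW/PSHUFHW/UNPCK nodes are rebuilt on top of the merged
// dword shuffle.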
41871
41872// Attempt to commute shufps LHS loads:
41873// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
41874static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41875 SelectionDAG &DAG) {
41876 // TODO: Add vXf64 support.
41877 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41878 return SDValue();
41879
41880 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41881 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41882 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41883 return SDValue();
41884 SDValue N0 = V.getOperand(0);
41885 SDValue N1 = V.getOperand(1);
41886 unsigned Imm = V.getConstantOperandVal(2);
41887 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41888 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41889 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41890 return SDValue();
41891 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41892 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41893 DAG.getTargetConstant(Imm, DL, MVT::i8));
41894 };
41895
41896 switch (N.getOpcode()) {
41897 case X86ISD::VPERMILPI:
41898 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41899 unsigned Imm = N.getConstantOperandVal(1);
41900 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41901 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41902 }
41903 break;
41904 case X86ISD::SHUFP: {
41905 SDValue N0 = N.getOperand(0);
41906 SDValue N1 = N.getOperand(1);
41907 unsigned Imm = N.getConstantOperandVal(2);
41908 if (N0 == N1) {
41909 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41910 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41911 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41912 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41913 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41914 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41915 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41916 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41917 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41918 }
41919 break;
41920 }
41921 }
41922
41923 return SDValue();
41924}
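// SHUFP selects its low result half from the first operand (immediate bits
// 0-3) and its high half from the second (bits 4-7), so commuting the operands
// swaps the two nibbles, as done above. The outer node's immediate is then
// fixed up by XOR: 0xAA flips bit 1 of every 2-bit v4f32 index (the two halves
// of the commuted SHUFP result are exchanged), while 0x0A / 0xA0 only flip the
// indices that read the commuted operand.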
41925
41926// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
41927// iff we don't demand the same element index for both X and Y.
41928static SDValue
41929combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
41930 const APInt &DemandedElts, SelectionDAG &DAG,
41931 const X86Subtarget &Subtarget, const SDLoc &DL) {
41932 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
41933 if (!N0.hasOneUse() || !N1.hasOneUse())
41934 return SDValue();
41935
41936 unsigned NumElts = VT.getVectorNumElements();
41937 SDValue BC0 = peekThroughOneUseBitcasts(N0);
41938 SDValue BC1 = peekThroughOneUseBitcasts(N1);
41939
41940 // Check that both operands are shuffles and that we can scale the shuffle
41941 // masks to the same width as the blend mask.
41942 // TODO: Support SM_SentinelZero?
41943 SmallVector<SDValue, 2> Ops0, Ops1;
41944 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
41945 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
41946 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
41947 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
41948 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
41949 return SDValue();
41950
41951 // Determine the demanded elts from both permutes.
41952 APInt Demanded0, DemandedLHS0, DemandedRHS0;
41953 APInt Demanded1, DemandedLHS1, DemandedRHS1;
41954 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
41955 Demanded1,
41956 /*AllowUndefElts=*/true) ||
41957 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
41958 DemandedRHS0, /*AllowUndefElts=*/true) ||
41959 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
41960 DemandedRHS1, /*AllowUndefElts=*/true))
41961 return SDValue();
41962
41963 // Confirm that we only use a single operand from both permutes and that we
41964 // don't demand the same index from both.
41965 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
41966 DemandedLHS0.intersects(DemandedLHS1))
41967 return SDValue();
41968
41969 // Use the permute demanded elts masks as the new blend mask.
41970 // Create the new permute mask as a blend of the 2 original permute masks.
41971 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
41972 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
41973 for (unsigned I = 0; I != NumElts; ++I) {
41974 if (Demanded0[I]) {
41975 int M = ScaledMask0[I];
41976 if (0 <= M) {
41977 assert(isUndefOrEqual(NewBlendMask[M], M) &&
41978 "BlendMask demands LHS AND RHS");
41979 NewBlendMask[M] = M;
41980 NewPermuteMask[I] = M;
41981 }
41982 } else if (Demanded1[I]) {
41983 int M = ScaledMask1[I];
41984 if (0 <= M) {
41985 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
41986 "BlendMask demands LHS AND RHS");
41987 NewBlendMask[M] = M + NumElts;
41988 NewPermuteMask[I] = M;
41989 }
41990 }
41991 }
41992 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
41993 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
41994
41995 // v16i16 shuffles can explode in complexity very easily, so only accept them
41996 // if the blend mask is the same in the 128-bit subvectors (or can widen to
41997 // v8i32) and the permute can be widened as well.
41998 if (VT == MVT::v16i16) {
41999 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
42000 !canWidenShuffleElements(NewBlendMask))
42001 return SDValue();
42002 if (!canWidenShuffleElements(NewPermuteMask))
42003 return SDValue();
42004 }
42005
42006 // Don't introduce lane-crossing permutes without AVX2, unless it can be
42007 // widened to a lane permute (vperm2f128).
42008 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
42010 NewPermuteMask) &&
42011 !canScaleShuffleElements(NewPermuteMask, 2))
42012 return SDValue();
42013
42014 SDValue NewBlend =
42015 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
42016 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
42017 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
42018 NewPermuteMask);
42019}
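// For example (v4i32): blending elements <0,1> of permute(X,<2,3,0,1>) with
// elements <2,3> of permute(Y,<2,3,0,1>) only demands X[2],X[3] and Y[0],Y[1],
// which are disjoint, so it can be rewritten as shuffle(X,Y,<4,5,2,3>)
// followed by the permute <2,3,0,1>.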
42020
42021// TODO - move this to TLI like isBinOp?
42022static bool isUnaryOp(unsigned Opcode) {
42023 switch (Opcode) {
42024 case ISD::CTLZ:
42025 case ISD::CTTZ:
42026 case ISD::CTPOP:
42027 return true;
42028 }
42029 return false;
42030}
42031
42032// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
42033// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
42035 const SDLoc &DL) {
42036 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42037 EVT ShuffleVT = N.getValueType();
42038 unsigned Opc = N.getOpcode();
42039
42040 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true) {
42041 // AllZeros/AllOnes constants are freely shuffled and will peek through
42042 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
42043 // merge with target shuffles if it has one use so shuffle combining is
42044 // likely to kick in. Shuffles of splats are expected to be removed.
42045 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
42046 ISD::isBuildVectorAllZeros(Op.getNode()) ||
42050 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
42051 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
42052 (Op.getOpcode() == ISD::CONCAT_VECTORS && Op->hasOneUse()) ||
42053 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
42054 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
42055 };
42056 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
42057 // Ensure we only shuffle whole vector src elements, unless it's a logical
42058 // binop where we can more aggressively move shuffles from dst to src.
42059 return isLogicOp(BinOp) ||
42060 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
42061 };
42062
42063 switch (Opc) {
42064 // Unary and Unary+Permute Shuffles.
42065 case X86ISD::PSHUFB: {
42066 // Don't merge PSHUFB if it contains zero'd elements.
42067 SmallVector<int> Mask;
42068 SmallVector<SDValue> Ops;
42069 if (!getTargetShuffleMask(N, false, Ops, Mask))
42070 break;
42071 [[fallthrough]];
42072 }
42073 case X86ISD::VBROADCAST:
42074 case X86ISD::MOVDDUP:
42075 case X86ISD::PSHUFD:
42076 case X86ISD::PSHUFHW:
42077 case X86ISD::PSHUFLW:
42078 case X86ISD::VPERMV:
42079 case X86ISD::VPERMI:
42080 case X86ISD::VPERMILPI: {
42081 unsigned SrcIdx = Opc == X86ISD::VPERMV ? 1 : 0;
42082 if (N.getOperand(SrcIdx).getValueType() == ShuffleVT &&
42083 N->isOnlyUserOf(N.getOperand(SrcIdx).getNode())) {
42084 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(SrcIdx));
42085 unsigned SrcOpcode = N0.getOpcode();
42086 EVT OpVT = N0.getValueType();
42087 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
42088 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42089 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42090 bool FoldShuf = Opc != X86ISD::VPERMI && Opc != X86ISD::VPERMV;
42091 if (IsMergeableWithShuffle(Op00, FoldShuf) ||
42092 IsMergeableWithShuffle(Op01, FoldShuf)) {
42093 SDValue LHS, RHS;
42094 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42095 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42096 if (Opc == X86ISD::VPERMV) {
42097 LHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op00);
42098 RHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op01);
42099 } else if (N.getNumOperands() == 2) {
42100 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
42101 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
42102 } else {
42103 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
42104 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
42105 }
42106 return DAG.getBitcast(ShuffleVT,
42107 DAG.getNode(SrcOpcode, DL, OpVT,
42108 DAG.getBitcast(OpVT, LHS),
42109 DAG.getBitcast(OpVT, RHS)));
42110 }
42111 }
42112 if (SrcOpcode == ISD::SINT_TO_FP && IsSafeToMoveShuffle(N0, SrcOpcode) &&
42113 OpVT.getScalarSizeInBits() ==
42115 SDValue Res = DAG.getBitcast(ShuffleVT, N0.getOperand(0));
42116 if (Opc == X86ISD::VPERMV)
42117 Res = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Res);
42118 else if (N.getNumOperands() == 2)
42119 Res = DAG.getNode(Opc, DL, ShuffleVT, Res, N.getOperand(1));
42120 else
42121 Res = DAG.getNode(Opc, DL, ShuffleVT, Res);
42122 Res = DAG.getBitcast(N0.getOperand(0).getValueType(), Res);
42123 return DAG.getBitcast(ShuffleVT, DAG.getNode(SrcOpcode, DL, OpVT, Res));
42124 }
42125 }
42126 break;
42127 }
42128 // Binary and Binary+Permute Shuffles.
42129 case X86ISD::INSERTPS: {
42130 // Don't merge INSERTPS if it contains zero'd elements.
42131 unsigned InsertPSMask = N.getConstantOperandVal(2);
42132 unsigned ZeroMask = InsertPSMask & 0xF;
42133 if (ZeroMask != 0)
42134 break;
42135 [[fallthrough]];
42136 }
42137 case X86ISD::MOVSD:
42138 case X86ISD::MOVSS:
42139 case X86ISD::BLENDI:
42140 case X86ISD::SHUFP:
42141 case X86ISD::UNPCKH:
42142 case X86ISD::UNPCKL: {
42143 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
42144 N->isOnlyUserOf(N.getOperand(1).getNode())) {
42145 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
42146 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
42147 unsigned SrcOpcode = N0.getOpcode();
42148 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42149 N0.getValueType() == N1.getValueType() &&
42150 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42151 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42152 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42153 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42154 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42155 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
42156 // Ensure the total number of shuffles doesn't increase by folding this
42157 // shuffle through to the source ops.
42158 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
42159 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
42160 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
42161 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
42162 SDValue LHS, RHS;
42163 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42164 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42165 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42166 Op11 = DAG.getBitcast(ShuffleVT, Op11);
42167 if (N.getNumOperands() == 3) {
42168 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42169 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
42170 } else {
42171 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42172 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
42173 }
42174 EVT OpVT = N0.getValueType();
42175 return DAG.getBitcast(ShuffleVT,
42176 DAG.getNode(SrcOpcode, DL, OpVT,
42177 DAG.getBitcast(OpVT, LHS),
42178 DAG.getBitcast(OpVT, RHS)));
42179 }
42180 }
42181 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42182 N0.getValueType() == N1.getValueType() &&
42183 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42184 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42185 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42186 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42187 SDValue Res;
42188 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42189 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42190 if (N.getNumOperands() == 3) {
42191 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42192 } else {
42193 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42194 }
42195 EVT OpVT = N0.getValueType();
42196 return DAG.getBitcast(
42197 ShuffleVT,
42198 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
42199 }
42200 // TODO: We can generalize this for other shuffles/conversions.
42201 if (Opc == X86ISD::UNPCKL && SrcOpcode == X86ISD::CVTPH2PS &&
42202 N1.getOpcode() == SrcOpcode &&
42203 N0.getValueType() == N1.getValueType() &&
42204 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
42205 ShuffleVT.getScalarSizeInBits() == N0.getScalarValueSizeInBits() &&
42206 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42207 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42208 EVT OpSrcVT = N0.getOperand(0).getValueType();
42209 EVT OpDstVT = N0.getValueType();
42210 SDValue Res =
42211 DAG.getNode(Opc, DL, OpSrcVT, N0.getOperand(0), N1.getOperand(0));
42212 return DAG.getBitcast(ShuffleVT,
42213 DAG.getNode(SrcOpcode, DL, OpDstVT, Res));
42214 }
42215 }
42216 break;
42217 }
42218 }
42219 return SDValue();
42220}
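// For example: pshufd(xor(X, AllOnes)) becomes xor(pshufd(X), pshufd(AllOnes));
// the shuffle of the all-ones operand folds away, and the shuffle is pushed
// towards X where it may combine with other shuffles.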
42221
42222/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
42224 SelectionDAG &DAG,
42225 const SDLoc &DL) {
42226 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
42227
42228 MVT VT = V.getSimpleValueType();
42229 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
42230 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
42231 unsigned SrcOpc0 = Src0.getOpcode();
42232 unsigned SrcOpc1 = Src1.getOpcode();
42233 EVT SrcVT0 = Src0.getValueType();
42234 EVT SrcVT1 = Src1.getValueType();
42235
42236 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
42237 return SDValue();
42238
42239 switch (SrcOpc0) {
42240 case X86ISD::MOVDDUP: {
42241 SDValue LHS = Src0.getOperand(0);
42242 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42243 SDValue Res =
42244 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
42245 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
42246 return DAG.getBitcast(VT, Res);
42247 }
42248 case X86ISD::VPERMILPI:
42249 // TODO: Handle v4f64 permutes with different low/high lane masks.
42250 if (SrcVT0 == MVT::v4f64) {
42251 uint64_t Mask = Src0.getConstantOperandVal(1);
42252 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
42253 break;
42254 }
42255 [[fallthrough]];
42256 case X86ISD::VSHLI:
42257 case X86ISD::VSRLI:
42258 case X86ISD::VSRAI:
42259 case X86ISD::PSHUFD:
42260 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
42261 SDValue LHS = Src0.getOperand(0);
42262 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42263 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
42264 V.getOperand(2));
42265 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
42266 return DAG.getBitcast(VT, Res);
42267 }
42268 break;
42269 }
42270
42271 return SDValue();
42272}
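// For example: vperm2f128(movddup(X), movddup(Y), imm) is rewritten as
// movddup(vperm2f128(X, Y, imm)), since MOVDDUP acts identically on whichever
// 128-bit lanes the lane shuffle selects.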
42273
42274/// Try to combine x86 target specific shuffles.
42276 SelectionDAG &DAG,
42278 const X86Subtarget &Subtarget) {
42279 using namespace SDPatternMatch;
42280
42281 MVT VT = N.getSimpleValueType();
42282 unsigned NumElts = VT.getVectorNumElements();
42283 SmallVector<int, 4> Mask;
42284 unsigned Opcode = N.getOpcode();
42285 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42286
42287 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
42288 return R;
42289
42290 // Handle specific target shuffles.
42291 switch (Opcode) {
42292 case X86ISD::MOVDDUP: {
42293 SDValue Src = N.getOperand(0);
42294 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
42295 if (VT == MVT::v2f64 && Src.hasOneUse() &&
42296 ISD::isNormalLoad(Src.getNode())) {
42297 LoadSDNode *LN = cast<LoadSDNode>(Src);
42298 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
42299 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
42300 DCI.CombineTo(N.getNode(), Movddup);
42301 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42303 return N; // Return N so it doesn't get rechecked!
42304 }
42305 }
42306
42307 return SDValue();
42308 }
42309 case X86ISD::VBROADCAST: {
42310 SDValue Src = N.getOperand(0);
42311 SDValue BC = peekThroughBitcasts(Src);
42312 EVT SrcVT = Src.getValueType();
42313 EVT BCVT = BC.getValueType();
42314
42315 // If broadcasting from another shuffle, attempt to simplify it.
42316 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
42317 if (isTargetShuffle(BC.getOpcode()) &&
42318 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
42319 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
42320 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
42322 for (unsigned i = 0; i != Scale; ++i)
42323 DemandedMask[i] = i;
42325 {BC}, 0, BC.getOpcode(), BC.getSimpleValueType(), DemandedMask,
42326 {}, /*Depth=*/0, X86::MaxShuffleCombineDepth,
42327 /*AllowVariableCrossLaneMask=*/true,
42328 /*AllowVariablePerLaneMask=*/true,
42329 /*IsMaskedShuffle=*/false, DAG, DL, Subtarget))
42330 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42331 DAG.getBitcast(SrcVT, Res));
42332 }
42333
42334 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
42335 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
42336 if (Src.getOpcode() == ISD::BITCAST &&
42337 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
42338 TLI.isTypeLegal(BCVT) &&
42340 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
42341 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
42343 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42344 }
42345
42346 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
42347 // If we're re-broadcasting a smaller type then broadcast with that type and
42348 // bitcast.
42349 // TODO: Do this for any splat?
42350 if (Src.getOpcode() == ISD::BITCAST &&
42351 (BC.getOpcode() == X86ISD::VBROADCAST ||
42353 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
42354 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
42355 MVT NewVT =
42357 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
42358 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42359 }
42360
42361 // Reduce broadcast source vector to lowest 128-bits.
42362 if (SrcVT.getSizeInBits() > 128)
42363 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42364 extract128BitVector(Src, 0, DAG, DL));
42365
42366 // broadcast(scalar_to_vector(x)) -> broadcast(x).
42367 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42368 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
42369 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42370
42371 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
42372 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
42373 isNullConstant(Src.getOperand(1)) &&
42374 Src.getValueType() ==
42375 Src.getOperand(0).getValueType().getScalarType() &&
42376 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
42377 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42378
42379 // Share broadcast with the longest vector and extract low subvector (free).
42380 // Ensure the same SDValue from the SDNode use is being used.
42381 for (SDNode *User : Src->users())
42382 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
42383 Src == User->getOperand(0) &&
42384 User->getValueSizeInBits(0).getFixedValue() >
42385 VT.getFixedSizeInBits()) {
42386 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
42387 VT.getSizeInBits());
42388 }
42389
42390 // vbroadcast(scalarload X) -> vbroadcast_load X
42391 // For float loads, extract other uses of the scalar from the broadcast.
42392 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
42393 ISD::isNormalLoad(Src.getNode())) {
42394 LoadSDNode *LN = cast<LoadSDNode>(Src);
42395 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42396 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42397 SDValue BcastLd =
42399 LN->getMemoryVT(), LN->getMemOperand());
42400 // If the load value is used only by N, replace it via CombineTo N.
42401 bool NoReplaceExtract = Src.hasOneUse();
42402 DCI.CombineTo(N.getNode(), BcastLd);
42403 if (NoReplaceExtract) {
42404 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42406 } else {
42407 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
42408 DAG.getVectorIdxConstant(0, DL));
42409 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
42410 }
42411 return N; // Return N so it doesn't get rechecked!
42412 }
42413
42414 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
42415 // i16. So shrink it ourselves if we can make a broadcast_load.
42416 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
42417 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
42418 assert(Subtarget.hasAVX2() && "Expected AVX2");
42419 SDValue TruncIn = Src.getOperand(0);
42420
42421 // If this is a truncate of a non-extending load, we can just narrow it to
42422 // use a broadcast_load.
42423 if (ISD::isNormalLoad(TruncIn.getNode())) {
42424 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
42425 // Unless it's volatile or atomic.
42426 if (LN->isSimple()) {
42427 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42428 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42429 SDValue BcastLd = DAG.getMemIntrinsicNode(
42430 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42431 LN->getPointerInfo(), LN->getBaseAlign(),
42432 LN->getMemOperand()->getFlags());
42433 DCI.CombineTo(N.getNode(), BcastLd);
42434 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42435 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42436 return N; // Return N so it doesn't get rechecked!
42437 }
42438 }
42439
42440 // If this is a truncate of an i16 extload, we can directly replace it.
42441 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
42442 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
42443 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
42444 if (LN->getMemoryVT().getSizeInBits() == 16) {
42445 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42446 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42447 SDValue BcastLd =
42449 LN->getMemoryVT(), LN->getMemOperand());
42450 DCI.CombineTo(N.getNode(), BcastLd);
42451 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42452 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42453 return N; // Return N so it doesn't get rechecked!
42454 }
42455 }
42456
42457 // If this is a truncate of a load that has been shifted right, we can
42458 // offset the pointer and use a narrower load.
42459 if (TruncIn.getOpcode() == ISD::SRL &&
42460 TruncIn.getOperand(0).hasOneUse() &&
42461 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
42462 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
42463 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
42464 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
42465 // Make sure the shift amount and the load size are divisible by 16.
42466 // Don't do this if the load is volatile or atomic.
42467 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
42468 LN->isSimple()) {
42469 unsigned Offset = ShiftAmt / 8;
42470 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42473 SDValue Ops[] = { LN->getChain(), Ptr };
42474 SDValue BcastLd = DAG.getMemIntrinsicNode(
42475 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42477 LN->getMemOperand()->getFlags());
42478 DCI.CombineTo(N.getNode(), BcastLd);
42479 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42480 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42481 return N; // Return N so it doesn't get rechecked!
42482 }
42483 }
42484 }
42485
42486 // vbroadcast(vzload X) -> vbroadcast_load X
42487 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
42489 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
42490 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42491 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42492 SDValue BcastLd =
42494 LN->getMemoryVT(), LN->getMemOperand());
42495 DCI.CombineTo(N.getNode(), BcastLd);
42496 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42498 return N; // Return N so it doesn't get rechecked!
42499 }
42500 }
42501
42502 // vbroadcast(vector load X) -> vbroadcast_load
42503 if (Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
42504 LoadSDNode *LN = cast<LoadSDNode>(Src);
42505 // Unless the load is volatile or atomic.
42506 if (LN->isSimple()) {
42507 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42508 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42509 SDValue BcastLd = DAG.getMemIntrinsicNode(
42511 LN->getPointerInfo(), LN->getBaseAlign(),
42512 LN->getMemOperand()->getFlags());
42513 DCI.CombineTo(N.getNode(), BcastLd);
42514 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42516 return N; // Return N so it doesn't get rechecked!
42517 }
42518 }
42519
42520 return SDValue();
42521 }
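// A sketch of the vbroadcast(scalarload) fold in the VBROADCAST case above,
// for v4f32:
//   (X86ISD::VBROADCAST (f32 load %ptr)) --> (X86ISD::VBROADCAST_LOAD %ptr)
// with the old load's chain rewired to the new memory node; other scalar uses
// of a float load are served by extracting lane 0 of the broadcast.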
42522 case X86ISD::VZEXT_MOVL: {
42523 SDValue N0 = N.getOperand(0);
42524
42525 // Fold (vzmovl (shift x, y)) -> (shift (vzmovl x), y)
42526 // Zeroing out the upper elements means we're just shifting a zero value.
42527 // TODO: Try harder to move vzmovl upward towards SCALAR_TO_VECTOR nodes.
42528 // TODO: Move this to canonicalizeShuffleWithOp once we add zero handling.
42529 if (N0.getOpcode() == X86ISD::VSHL || N0.getOpcode() == X86ISD::VSHLI ||
42530 N0.getOpcode() == X86ISD::VSRL || N0.getOpcode() == X86ISD::VSRLI ||
42531 N0.getOpcode() == X86ISD::VSRA || N0.getOpcode() == X86ISD::VSRAI) {
42532 if (N0.hasOneUse())
42533 return DAG.getNode(
42534 N0.getOpcode(), DL, VT,
42535 DAG.getNode(X86ISD::VZEXT_MOVL, DL, VT, N0.getOperand(0)),
42536 N0.getOperand(1));
42537 }
42538
42539 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
42540 // the load is volatile.
42541 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42542 auto *LN = cast<LoadSDNode>(N0);
42543 if (SDValue VZLoad =
42544 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42545 DCI.CombineTo(N.getNode(), VZLoad);
42546 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42548 return N;
42549 }
42550 }
42551
42552 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42553 // and can just use a VZEXT_LOAD.
42554 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42555 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42556 auto *LN = cast<MemSDNode>(N0);
42557 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42558 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42559 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42560 SDValue VZLoad =
42562 LN->getMemoryVT(), LN->getMemOperand());
42563 DCI.CombineTo(N.getNode(), VZLoad);
42564 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42566 return N;
42567 }
42568 }
42569
42570 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42571 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42572 // if the upper bits of the i64 are zero.
42573 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42574 N0.getOperand(0).hasOneUse() &&
42575 N0.getOperand(0).getValueType() == MVT::i64) {
42576 SDValue In = N0.getOperand(0);
42577 APInt Mask = APInt::getHighBitsSet(64, 32);
42578 if (DAG.MaskedValueIsZero(In, Mask)) {
42579 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42580 MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
42581 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42582 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42583 return DAG.getBitcast(VT, Movl);
42584 }
42585 }
42586
42587 // Load a scalar integer constant directly to XMM instead of transferring an
42588 // immediate value from GPR.
42589 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42590 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42591 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42592 // Create a vector constant - scalar constant followed by zeros.
42593 EVT ScalarVT = N0.getOperand(0).getValueType();
42594 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42595 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42596 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42597 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42598
42599 // Load the vector constant from constant pool.
42600 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
42601 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42602 MachinePointerInfo MPI =
42604 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42605 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42607 }
42608 }
42609
42610 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42611 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42612 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42613 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42614 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42616
42617 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42618 isNullConstant(V.getOperand(2))) {
42619 SDValue In = V.getOperand(1);
42621 In.getValueSizeInBits() /
42622 VT.getScalarSizeInBits());
42623 In = DAG.getBitcast(SubVT, In);
42624 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42625 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42626 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42627 V.getOperand(2));
42628 }
42629 }
42630
42631 return SDValue();
42632 }
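// A note on the i64 narrowing in the VZEXT_MOVL case above: when the upper 32
// bits of X are known zero, (v2i64 vzext_movl (scalar_to_vector X)) only keeps
// bits [31:0] of X in element 0, so the equivalent v4i32 vzext_movl of
// (trunc X) is used and then bitcast back to v2i64.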
42633 case X86ISD::BLENDI: {
42634 SDValue N0 = N.getOperand(0);
42635 SDValue N1 = N.getOperand(1);
42636 unsigned EltBits = VT.getScalarSizeInBits();
42637
42638 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
42639 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42640 // TODO: Handle MVT::v16i16 repeated blend mask.
42641 if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42642 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42643 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42644 if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
42645 unsigned NewSize = SrcVT.getVectorNumElements();
42646 APInt BlendMask = getBLENDIBlendMask(N);
42647 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
42648 return DAG.getBitcast(
42649 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42650 N1.getOperand(0),
42651 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
42652 DL, MVT::i8)));
42653 }
42654 }
42655 // Share PSHUFB masks:
42656 // blend(pshufb(x,m1),pshufb(y,m2))
42657 // --> m3 = blend(m1,m2)
42658 // blend(pshufb(x,m3),pshufb(y,m3))
42659 if (N0.hasOneUse() && N1.hasOneUse()) {
42660 SmallVector<int> Mask, ByteMask;
42661 SmallVector<SDValue> Ops;
42662 SDValue LHS = peekThroughOneUseBitcasts(N0);
42663 SDValue RHS = peekThroughOneUseBitcasts(N1);
42664 if (LHS.getOpcode() == X86ISD::PSHUFB &&
42665 RHS.getOpcode() == X86ISD::PSHUFB &&
42666 LHS.getOperand(1) != RHS.getOperand(1) &&
42667 LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
42668 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42669 assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
42671 "BLENDI decode mismatch");
42672 MVT ShufVT = LHS.getSimpleValueType();
42673 SDValue MaskLHS = LHS.getOperand(1);
42674 SDValue MaskRHS = RHS.getOperand(1);
42675 llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
42677 ShufVT, {MaskLHS, MaskRHS}, ByteMask,
42678 {LHS.getNode(), RHS.getNode()}, DAG, DL, Subtarget)) {
42679 SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42680 LHS.getOperand(0), NewMask);
42681 SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42682 RHS.getOperand(0), NewMask);
42683 return DAG.getNode(X86ISD::BLENDI, DL, VT,
42684 DAG.getBitcast(VT, NewLHS),
42685 DAG.getBitcast(VT, NewRHS), N.getOperand(2));
42686 }
42687 }
42688 }
42689 }
42690 return SDValue();
42691 }
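// A blend narrowing example for the BLENDI case above:
//   (v4i64 blendi (bitcast v8i32 X), (bitcast v8i32 Y), 0b0101)
// becomes (bitcast (v8i32 blendi X, Y, 0b00110011)) - each wide mask bit is
// replicated for the two narrower elements it covers.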
42692 case X86ISD::SHUFP: {
42693 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42694 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42695 // TODO: Support types other than v4f32.
42696 if (VT == MVT::v4f32) {
42697 bool Updated = false;
42698 SmallVector<int> Mask;
42699 SmallVector<SDValue> Ops;
42700 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
42701 for (int i = 0; i != 2; ++i) {
42702 SmallVector<SDValue> SubOps;
42703 SmallVector<int> SubMask, SubScaledMask;
42705 // TODO: Scaling might be easier if we specify the demanded elts.
42706 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42707 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42708 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42709 int Ofs = i * 2;
42710 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42711 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42712 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42713 Updated = true;
42714 }
42715 }
42716 }
42717 if (Updated) {
42718 for (int &M : Mask)
42719 M %= 4;
42720 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42721 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42722 }
42723 }
42724 return SDValue();
42725 }
42726 case X86ISD::VPERMI: {
42727 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42728 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42729 SDValue N0 = N.getOperand(0);
42730 SDValue N1 = N.getOperand(1);
42731 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42732 if (N0.getOpcode() == ISD::BITCAST &&
42733 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42734 SDValue Src = N0.getOperand(0);
42735 EVT SrcVT = Src.getValueType();
42736 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42737 return DAG.getBitcast(VT, Res);
42738 }
42739 return SDValue();
42740 }
42741 case X86ISD::SHUF128: {
42742 // If we're permuting the upper 256-bit subvectors of a concatenation, then
42743 // see if we can peek through and access the subvector directly.
42744 if (VT.is512BitVector()) {
42745 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only
42746 // the upper subvector is used.
42747 SDValue LHS = peekThroughBitcasts(N->getOperand(0));
42748 SDValue RHS = peekThroughBitcasts(N->getOperand(1));
42749 uint64_t Mask = N->getConstantOperandVal(2);
42750 SmallVector<SDValue> LHSOps, RHSOps;
42751 SDValue NewLHS, NewRHS;
42752 if ((Mask & 0x0A) == 0x0A &&
42753 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
42754 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
42755 Mask &= ~0x0A;
42756 }
42757 if ((Mask & 0xA0) == 0xA0 &&
42758 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
42759 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
42760 Mask &= ~0xA0;
42761 }
42762 if (NewLHS || NewRHS)
42763 return DAG.getNode(X86ISD::SHUF128, DL, VT,
42764 DAG.getBitcast(VT, NewLHS ? NewLHS : LHS),
42765 DAG.getBitcast(VT, NewRHS ? NewRHS : RHS),
42766 DAG.getTargetConstant(Mask, DL, MVT::i8));
42767 }
42768 return SDValue();
42769 }
42770 case X86ISD::VPERM2X128: {
42771 SDValue LHS = N->getOperand(0);
42772 SDValue RHS = N->getOperand(1);
42773 unsigned Imm = N.getConstantOperandVal(2) & 255;
42774
42775 // Canonicalize unary/repeated operands to LHS.
42776 if (LHS.isUndef() && !RHS.isUndef())
42777 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, RHS, LHS,
42778 DAG.getTargetConstant(Imm ^ 0x22, DL, MVT::i8));
42779 if (LHS == RHS)
42780 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, DAG.getUNDEF(VT),
42781 DAG.getTargetConstant(Imm & ~0x22, DL, MVT::i8));
42782
42783 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42784 if (LHS.getOpcode() == ISD::BITCAST &&
42785 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42786 EVT SrcVT = LHS.getOperand(0).getValueType();
42787 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42788 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42789 DAG.getBitcast(SrcVT, LHS),
42790 DAG.getBitcast(SrcVT, RHS),
42791 N->getOperand(2)));
42792 }
42793 }
42794
42795 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42797 return Res;
42798
42799 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42800 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42801 auto FindSubVector128 = [&](unsigned Idx) {
42802 if (Idx > 3)
42803 return SDValue();
42804 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42805 SmallVector<SDValue> SubOps;
42806 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42807 return SubOps[Idx & 1];
42808 unsigned NumElts = Src.getValueType().getVectorNumElements();
42809 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42810 Src.getOperand(1).getValueSizeInBits() == 128 &&
42811 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42812 return Src.getOperand(1);
42813 }
42814 return SDValue();
42815 };
42816 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42817 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42818 MVT SubVT = VT.getHalfNumVectorElementsVT();
42819 SubLo = DAG.getBitcast(SubVT, SubLo);
42820 SubHi = DAG.getBitcast(SubVT, SubHi);
42821 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42822 }
42823 }
42824
42825 // Attempt to match VBROADCAST*128 subvector broadcast load.
42826 if (RHS.isUndef()) {
42828 DecodeVPERM2X128Mask(4, Imm, Mask);
42829 if (isUndefOrInRange(Mask, 0, 4)) {
42830 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, LHS);
42831 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, LHS);
42832 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() &&
42833 X86::mayFoldLoad(LHS, Subtarget, /*AssumeSingleUse=*/true)) {
42834 MVT MemVT = VT.getHalfNumVectorElementsVT();
42835 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
42837 cast<LoadSDNode>(LHS), Ofs, DAG);
42838 }
42839 }
42840 }
42841
42842 return SDValue();
42843 }
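// A subvector peek example for the VPERM2X128 case above: with Imm = 0x31,
// vperm2x128(concat(X,Y), concat(Z,W)) selects 128-bit subvectors 1 and 3,
// i.e. concat(Y, W), so the lane shuffle is dropped entirely.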
42844 case X86ISD::PSHUFD:
42845 case X86ISD::PSHUFLW:
42846 case X86ISD::PSHUFHW: {
42847 SDValue N0 = N.getOperand(0);
42848 SDValue N1 = N.getOperand(1);
42849 if (N0->hasOneUse()) {
42851 switch (V.getOpcode()) {
42852 case X86ISD::VSHL:
42853 case X86ISD::VSRL:
42854 case X86ISD::VSRA:
42855 case X86ISD::VSHLI:
42856 case X86ISD::VSRLI:
42857 case X86ISD::VSRAI:
42858 case X86ISD::VROTLI:
42859 case X86ISD::VROTRI: {
42860 MVT InnerVT = V.getSimpleValueType();
42861 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42862 SDValue Res = DAG.getNode(Opcode, DL, VT,
42863 DAG.getBitcast(VT, V.getOperand(0)), N1);
42864 Res = DAG.getBitcast(InnerVT, Res);
42865 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42866 return DAG.getBitcast(VT, Res);
42867 }
42868 break;
42869 }
42870 }
42871 }
42872
42873 Mask = getPSHUFShuffleMask(N);
42874 assert(Mask.size() == 4);
42875 break;
42876 }
42877 case X86ISD::MOVSD:
42878 case X86ISD::MOVSH:
42879 case X86ISD::MOVSS: {
42880 SDValue N0 = N.getOperand(0);
42881 SDValue N1 = N.getOperand(1);
42882
42883 // Canonicalize scalar FPOps:
42884 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42885 // If commutable, allow OP(N1[0], N0[0]).
42886 unsigned Opcode1 = N1.getOpcode();
42887 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42888 Opcode1 == ISD::FDIV) {
42889 SDValue N10 = N1.getOperand(0);
42890 SDValue N11 = N1.getOperand(1);
42891 if (N10 == N0 ||
42892 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42893 if (N10 != N0)
42894 std::swap(N10, N11);
42895 MVT SVT = VT.getVectorElementType();
42896 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
42897 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42898 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42899 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42900 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42901 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42902 }
42903 }
42904
42905 return SDValue();
42906 }
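// A scalar FPOp canonicalization example for the MOVS* case above:
//   (v4f32 movss N0, (fadd N0, N1))
// becomes (movss N0, (scalar_to_vector (fadd N0[0], N1[0]))), so only the one
// lane that MOVSS actually keeps is computed as a scalar op.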
42907 case X86ISD::INSERTPS: {
42908 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42909 SDValue Op0 = N.getOperand(0);
42910 SDValue Op1 = N.getOperand(1);
42911 unsigned InsertPSMask = N.getConstantOperandVal(2);
42912 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42913 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42914 unsigned ZeroMask = InsertPSMask & 0xF;
42915
42916 // If we zero out all elements from Op0 then we don't need to reference it.
42917 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42918 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42919 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42920
42921 // If we zero out the element from Op1 then we don't need to reference it.
42922 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42923 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42924 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42925
42926 // Attempt to merge insertps Op1 with an inner target shuffle node.
42927 SmallVector<int, 8> TargetMask1;
42929 APInt KnownUndef1, KnownZero1;
42930 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42931 KnownZero1)) {
42932 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42933 // Zero/UNDEF insertion - zero out element and remove dependency.
42934 InsertPSMask |= (1u << DstIdx);
42935 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42936 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42937 }
42938 // Update insertps mask srcidx and reference the source input directly.
42939 int M = TargetMask1[SrcIdx];
42940 assert(0 <= M && M < 8 && "Shuffle index out of range");
42941 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42942 Op1 = Ops1[M < 4 ? 0 : 1];
42943 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42944 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42945 }
42946
42947 // Attempt to merge insertps Op0 with an inner target shuffle node.
42948 SmallVector<int, 8> TargetMask0;
42950 APInt KnownUndef0, KnownZero0;
42951 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42952 KnownZero0)) {
42953 bool Updated = false;
42954 bool UseInput00 = false;
42955 bool UseInput01 = false;
42956 for (int i = 0; i != 4; ++i) {
42957 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42958 // No change if element is already zero or the inserted element.
42959 continue;
42960 }
42961
42962 if (KnownUndef0[i] || KnownZero0[i]) {
42963 // If the target mask is undef/zero then we must zero the element.
42964 InsertPSMask |= (1u << i);
42965 Updated = true;
42966 continue;
42967 }
42968
42969 // The input vector element must be inline.
42970 int M = TargetMask0[i];
42971 if (M != i && M != (i + 4))
42972 return SDValue();
42973
42974 // Determine which inputs of the target shuffle we're using.
42975 UseInput00 |= (0 <= M && M < 4);
42976 UseInput01 |= (4 <= M);
42977 }
42978
42979 // If we're not using both inputs of the target shuffle then use the
42980 // referenced input directly.
42981 if (UseInput00 && !UseInput01) {
42982 Updated = true;
42983 Op0 = Ops0[0];
42984 } else if (!UseInput00 && UseInput01) {
42985 Updated = true;
42986 Op0 = Ops0[1];
42987 }
42988
42989 if (Updated)
42990 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42991 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42992 }
42993
42994 // If we're inserting an element from a vbroadcast load, fold the
42995 // load into the X86insertps instruction. We need to convert the scalar
42996 // load to a vector and clear the source lane of the INSERTPS control.
42997 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42998 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42999 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
43000 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
43001 MemIntr->getBasePtr(),
43002 MemIntr->getMemOperand());
43003 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
43005 Load),
43006 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
43007 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
43008 return Insert;
43009 }
43010 }
43011
43012 return SDValue();
43013 }
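// INSERTPS immediate recap for the case above: bits [7:6] select the source
// lane of Op1, bits [5:4] the destination lane in Op0, and bits [3:0] zero out
// result lanes; e.g. if (ZeroMask | (1 << DstIdx)) == 0xF then every lane of
// Op0 is either zeroed or overwritten, so Op0 can be replaced with undef.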
43014 case X86ISD::VPERMV: {
43015 // Combine VPERMV to VPERMV3 if the source operand can be freely split.
43017 SmallVector<SDValue, 2> SrcOps, SubOps;
43018 SDValue Src = peekThroughBitcasts(N.getOperand(1));
43019 if ((Subtarget.hasVLX() || VT.is512BitVector()) &&
43020 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask) &&
43021 collectConcatOps(Src.getNode(), SubOps, DAG)) {
43022 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43023 assert(SrcOps.size() == 1 && "Unexpected shuffle ops");
43024 assert((SubOps.size() == 2 || SubOps.size() == 4) &&
43025 "Unexpected split ops");
43026 // Bail if we were permuting a widened vector.
43027 if (SubOps[1].isUndef() &&
43028 (SubOps.size() == 2 || (SubOps[2].isUndef() && SubOps[3].isUndef())))
43029 return SDValue();
43030 // Bail if any subops would have folded into the concat.
43031 if (any_of(SubOps, isShuffleFoldableLoad))
43032 return SDValue();
43033 // Concat 4x128 back to 2x256.
43034 if (SubOps.size() == 4) {
43035 SubOps[0] = concatSubVectors(SubOps[0], SubOps[1], DAG, DL);
43036 SubOps[1] = concatSubVectors(SubOps[2], SubOps[3], DAG, DL);
43037 }
43038 // Convert mask to 2 operand shuffle.
43039 int HalfElts = NumElts / 2;
43040 for (int &M : Mask)
43041 M += M >= HalfElts ? HalfElts : 0;
43042 SDValue Lo = widenSubVector(SubOps[0], false, Subtarget, DAG, DL,
43043 VT.getSizeInBits());
43044 SDValue Hi = widenSubVector(SubOps[1], false, Subtarget, DAG, DL,
43045 VT.getSizeInBits());
43046 return lowerShuffleWithPERMV(DL, VT, Mask, DAG.getBitcast(VT, Lo),
43047 DAG.getBitcast(VT, Hi), Subtarget, DAG);
43048 }
43049 return SDValue();
43050 }
43051 case X86ISD::VPERMV3: {
43052 MVT WideVT = VT.getDoubleNumVectorElementsVT();
43053 bool CanConcat = VT.is128BitVector() ||
43054 (VT.is256BitVector() && Subtarget.useAVX512Regs());
43057 if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) {
43058 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43059 SDValue V1 = peekThroughBitcasts(N.getOperand(0));
43060 SDValue V2 = peekThroughBitcasts(N.getOperand(2));
43061 // Canonicalize to VPERMV if both sources are the same.
43062 if (V1 == V2) {
43063 for (int &M : Mask)
43064 M = (M < 0 ? M : (M & (NumElts - 1)));
43065 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(0),
43066 DAG.getUNDEF(VT), Subtarget, DAG);
43067 }
43068 // If sources are half width, then concat and use VPERMV with adjusted
43069 // mask.
43070 SDValue Ops[2];
43071 MVT HalfVT = VT.getHalfNumVectorElementsVT();
43072 if (sd_match(V1,
43074 sd_match(V2,
43076 Ops[0].getValueType() == HalfVT && Ops[1].getValueType() == HalfVT) {
43077 if (SDValue ConcatSrc =
43078 combineConcatVectorOps(DL, VT, Ops, DAG, Subtarget)) {
43079 for (int &M : Mask)
43080 M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
43081 return lowerShuffleWithPERMV(DL, VT, Mask, ConcatSrc,
43082 DAG.getUNDEF(VT), Subtarget, DAG);
43083 }
43084 }
43085 // Commute foldable source to the RHS.
43086 if (isShuffleFoldableLoad(N.getOperand(0)) &&
43087 !isShuffleFoldableLoad(N.getOperand(2))) {
43089 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(2),
43090 N.getOperand(0), Subtarget, DAG);
43091 }
43092 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43093 // freely concatenated, with a commuted shuffle mask.
43094 if (CanConcat) {
43095 if (SDValue ConcatSrc = combineConcatVectorOps(
43096 DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG,
43097 Subtarget)) {
43099 Mask.append(NumElts, SM_SentinelUndef);
43100 SDValue Perm =
43101 lowerShuffleWithPERMV(DL, WideVT, Mask, ConcatSrc,
43102 DAG.getUNDEF(WideVT), Subtarget, DAG);
43103 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43104 DAG.getVectorIdxConstant(0, DL));
43105 }
43106 }
43107 }
43108 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43109 // freely concatenated.
43110 if (CanConcat) {
43111 if (SDValue ConcatSrc = combineConcatVectorOps(
43112 DL, WideVT, {N.getOperand(0), N.getOperand(2)}, DAG, Subtarget)) {
43113 SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
43114 DL, WideVT.getSizeInBits());
43115 SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
43116 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43117 DAG.getVectorIdxConstant(0, DL));
43118 }
43119 }
43120 return SDValue();
43121 }
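// A VPERMV3 widening example (for the CanConcat path above): when the two
// 256-bit sources concatenate freely and 512-bit registers are usable,
// (v8i32 vpermv3 A, M, B) becomes
//   extract_subvector(v16i32 vpermv(widen(M), concat(A,B)), 0)
// trading the two source operands for one wider single-source permute.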
43122 default:
43123 return SDValue();
43124 }
43125
43126 // Nuke no-op shuffles that show up after combining.
43127 if (isNoopShuffleMask(Mask))
43128 return N.getOperand(0);
43129
43130 // Look for simplifications involving one or two shuffle instructions.
43131 SDValue V = N.getOperand(0);
43132 switch (N.getOpcode()) {
43133 default:
43134 break;
43135 case X86ISD::PSHUFLW:
43136 case X86ISD::PSHUFHW:
43137 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
43138
43139 // See if this reduces to a PSHUFD which is no more expensive and can
43140 // combine with more operations. Note that it has to at least flip the
43141 // dwords as otherwise it would have been removed as a no-op.
43142 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
43143 int DMask[] = {0, 1, 2, 3};
43144 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
43145 DMask[DOffset + 0] = DOffset + 1;
43146 DMask[DOffset + 1] = DOffset + 0;
43147 MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
43148 V = DAG.getBitcast(DVT, V);
43149 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
43150 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
43151 return DAG.getBitcast(VT, V);
43152 }
43153
43154 // Look for shuffle patterns which can be implemented as a single unpack.
43155 // FIXME: This doesn't handle the location of the PSHUFD generically, and
43156 // only works when we have a PSHUFD followed by two half-shuffles.
43157 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
43158 (V.getOpcode() == X86ISD::PSHUFLW ||
43159 V.getOpcode() == X86ISD::PSHUFHW) &&
43160 V.getOpcode() != N.getOpcode() &&
43161 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
43162 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
43163 if (D.getOpcode() == X86ISD::PSHUFD) {
43166 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43167 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43168 int WordMask[8];
43169 for (int i = 0; i < 4; ++i) {
43170 WordMask[i + NOffset] = Mask[i] + NOffset;
43171 WordMask[i + VOffset] = VMask[i] + VOffset;
43172 }
43173 // Map the word mask through the DWord mask.
43174 int MappedMask[8];
43175 for (int i = 0; i < 8; ++i)
43176 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
43177 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
43178 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
43179 // We can replace all three shuffles with an unpack.
43180 V = DAG.getBitcast(VT, D.getOperand(0));
43181 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
43183 DL, VT, V, V);
43184 }
43185 }
43186 }
43187
43188 break;
43189
43190 case X86ISD::PSHUFD:
43191 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
43192 return NewN;
43193
43194 break;
43195 }
43196
43197 return SDValue();
43198}
43199
43200/// Checks if the shuffle mask takes subsequent elements
43201/// alternately from two vectors.
43202/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
43203static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
43204
43205 int ParitySrc[2] = {-1, -1};
43206 unsigned Size = Mask.size();
43207 for (unsigned i = 0; i != Size; ++i) {
43208 int M = Mask[i];
43209 if (M < 0)
43210 continue;
43211
43212 // Make sure we are using the matching element from the input.
43213 if ((M % Size) != i)
43214 return false;
43215
43216 // Make sure we use the same input for all elements of the same parity.
43217 int Src = M / Size;
43218 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
43219 return false;
43220 ParitySrc[i % 2] = Src;
43221 }
43222
43223 // Make sure each input is used.
43224 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
43225 return false;
43226
43227 Op0Even = ParitySrc[0] == 0;
43228 return true;
43229}
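// For example, <0,5,2,7> uses operand 0 for the even result elements and
// operand 1 for the odd ones (Op0Even == true), while <4,1,6,3> is the
// reverse (Op0Even == false).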
43230
43231/// Returns true iff the shuffle node \p N can be replaced with an
43232/// ADDSUB(SUBADD) operation. If true, the operands of the ADDSUB(SUBADD)
43233/// operation are written to the parameters \p Opnd0 and \p Opnd1.
43234///
43235/// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
43236/// shuffle nodes so they are easier to match generically. We also insert dummy
43237/// vector shuffle nodes for the operands which explicitly discard the lanes
43238/// that are unused by this operation, so that the rest of the combiner can see
43239/// that they're unused.
43240static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
43241 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
43242 bool &IsSubAdd, bool &HasAllowContract) {
43243
43244 EVT VT = N->getValueType(0);
43245 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43246 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
43248 return false;
43249
43250 // We only handle target-independent shuffles.
43251 // FIXME: It would be easy and harmless to use the target shuffle mask
43252 // extraction tool to support more.
43253 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43254 return false;
43255
43256 SDValue V1 = N->getOperand(0);
43257 SDValue V2 = N->getOperand(1);
43258
43259 // Make sure we have an FADD and an FSUB.
43260 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
43261 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
43262 V1.getOpcode() == V2.getOpcode())
43263 return false;
43264
43265 // If there are other uses of these operations we can't fold them.
43266 if (!V1->hasOneUse() || !V2->hasOneUse())
43267 return false;
43268
43269 // Ensure that both operations have the same operands. Note that we can
43270 // commute the FADD operands.
43271 SDValue LHS, RHS;
43272 if (V1.getOpcode() == ISD::FSUB) {
43273 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
43274 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
43275 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
43276 return false;
43277 } else {
43278 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
43279 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
43280 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
43281 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
43282 return false;
43283 }
43284
43285 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43286 bool Op0Even;
43287 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43288 return false;
43289
43290 // It's a subadd if the input feeding the even result lanes is the FADD.
43291 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
43292 : V2->getOpcode() == ISD::FADD;
43293 HasAllowContract =
43295
43296 Opnd0 = LHS;
43297 Opnd1 = RHS;
43298 return true;
43299}
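// A rough sketch of the pattern this matches: with
//   X = fsub a, b
//   Y = fadd a, b
// a shuffle <0, 5, 2, 7> of (X, Y) takes its even lanes from the FSUB and its
// odd lanes from the FADD, which matches the v4f32 ADDSUB(a, b) behaviour, so
// IsSubAdd stays false and Opnd0/Opnd1 are set to a/b.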
43300
43301/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
43303 const X86Subtarget &Subtarget,
43304 SelectionDAG &DAG) {
43305 // We only handle target-independent shuffles.
43306 // FIXME: It would be easy and harmless to use the target shuffle mask
43307 // extraction tool to support more.
43308 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43309 return SDValue();
43310
43311 MVT VT = N->getSimpleValueType(0);
43312 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43313 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
43314 return SDValue();
43315
43316 // We're trying to match (shuffle fma(a, b, c), X86ISD::FMSUB(a, b, c)).
43317 SDValue Op0 = N->getOperand(0);
43318 SDValue Op1 = N->getOperand(1);
43319 SDValue FMAdd = Op0, FMSub = Op1;
43320 if (FMSub.getOpcode() != X86ISD::FMSUB)
43321 std::swap(FMAdd, FMSub);
43322
43323 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
43324 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
43325 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
43326 FMAdd.getOperand(2) != FMSub.getOperand(2))
43327 return SDValue();
43328
43329 // Check for correct shuffle mask.
43330 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43331 bool Op0Even;
43332 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43333 return SDValue();
43334
43335 // It's FMAddSub if the even result lanes are taken from the FMSub node.
43336 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
43337 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43338 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
43339 FMAdd.getOperand(2));
43340}
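// Roughly: a shuffle that takes its even lanes from X86ISD::FMSUB(a, b, c) and
// its odd lanes from fma(a, b, c) computes a*b-c in the even lanes and a*b+c
// in the odd lanes, which is X86ISD::FMADDSUB(a, b, c); the mirrored case
// becomes X86ISD::FMSUBADD.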
43341
43342/// Try to combine a shuffle into a target-specific add-sub or
43343/// mul-add-sub node.
43345 const X86Subtarget &Subtarget,
43346 SelectionDAG &DAG) {
43347 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
43348 return V;
43349
43350 SDValue Opnd0, Opnd1;
43351 bool IsSubAdd;
43352 bool HasAllowContract;
43353 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd,
43354 HasAllowContract))
43355 return SDValue();
43356
43357 MVT VT = N->getSimpleValueType(0);
43358
43359 // Try to generate X86ISD::FMADDSUB node here.
43360 SDValue Opnd2;
43361 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2,
43362 HasAllowContract)) {
43363 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43364 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
43365 }
43366
43367 if (IsSubAdd)
43368 return SDValue();
43369
43370 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
43371 // the ADDSUB idiom has been successfully recognized. There are no known
43372 // X86 targets with 512-bit ADDSUB instructions!
43373 if (VT.is512BitVector())
43374 return SDValue();
43375
43376 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
43377 // the ADDSUB idiom has been successfully recognized. There are no known
43378 // X86 targets with FP16 ADDSUB instructions!
43379 if (VT.getVectorElementType() == MVT::f16)
43380 return SDValue();
43381
43382 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
43383}
43384
43385/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
43386/// low half of each source vector and does not set any high half elements in
43387/// the destination vector, narrow the shuffle to half its original size.
43389 EVT VT = Shuf->getValueType(0);
43390 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
43391 return SDValue();
43392 if (!VT.is256BitVector() && !VT.is512BitVector())
43393 return SDValue();
43394
43395 // See if we can ignore all of the high elements of the shuffle.
43396 ArrayRef<int> Mask = Shuf->getMask();
43397 if (!isUndefUpperHalf(Mask))
43398 return SDValue();
43399
43400 // Check if the shuffle mask accesses only the low half of each input vector
43401 // (half-index output is 0 or 2).
43402 int HalfIdx1, HalfIdx2;
43403 SmallVector<int, 8> HalfMask(Mask.size() / 2);
43404 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
43405 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
43406 return SDValue();
43407
43408 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
43409 // The trick is knowing that all of the insert/extract are actually free
43410 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
43411 // of narrow inputs into a narrow output, and that is always cheaper than
43412 // the wide shuffle that we started with.
43413 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
43414 Shuf->getOperand(1), HalfMask, HalfIdx1,
43415 HalfIdx2, false, DAG, /*UseConcat*/ true);
43416}
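// For instance, a v8f32 shuffle whose mask leaves the upper four result lanes
// undef and only indexes into the low v4f32 half of each source can (roughly)
// be performed as a v4f32 shuffle of the low subvectors and then widened back
// with an undef upper half.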
43417
43420 const X86Subtarget &Subtarget) {
43421 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
43422 if (SDValue V = narrowShuffle(Shuf, DAG))
43423 return V;
43424
43425 // If we have legalized the vector types, look for blends of FADD and FSUB
43426 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
43427 SDLoc dl(N);
43428 EVT VT = N->getValueType(0);
43429 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43430 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
43431 if (SDValue AddSub =
43432 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
43433 return AddSub;
43434
43435 // Attempt to combine into a vector load/broadcast.
43437 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
43438 return LD;
43439
43440 if (isTargetShuffle(N->getOpcode())) {
43441 SDValue Op(N, 0);
43442 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
43443 return Shuffle;
43444
43445 // Try recursively combining arbitrary sequences of x86 shuffle
43446 // instructions into higher-order shuffles. We do this after combining
43447 // specific PSHUF instruction sequences into their minimal form so that we
43448 // can evaluate how many specialized shuffle instructions are involved in
43449 // a particular chain.
43450 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43451 return Res;
43452
43453 // Simplify source operands based on shuffle mask.
43454 // TODO - merge this into combineX86ShufflesRecursively.
43455 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
43456 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
43457 return SDValue(N, 0);
43458
43459 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
43460 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
43461 // Perform this after other shuffle combines to allow inner shuffles to be
43462 // combined away first.
43463 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
43464 return BinOp;
43465 }
43466
43467 return SDValue();
43468}
43469
43470// Simplify variable target shuffle masks based on the demanded elements.
43471// TODO: Handle DemandedBits in mask indices as well?
43473 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
43474 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
43475 // If we're demanding all elements don't bother trying to simplify the mask.
43476 unsigned NumElts = DemandedElts.getBitWidth();
43477 if (DemandedElts.isAllOnes())
43478 return false;
43479
43480 SDValue Mask = Op.getOperand(MaskIndex);
43481 if (!Mask.hasOneUse())
43482 return false;
43483
43484 // Attempt to generically simplify the variable shuffle mask.
43485 APInt MaskUndef, MaskZero;
43486 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
43487 Depth + 1))
43488 return true;
43489
43490 // Attempt to extract+simplify a (constant pool load) shuffle mask.
43491 // TODO: Support other types from getTargetShuffleMaskIndices?
43493 EVT BCVT = BC.getValueType();
43494 auto *Load = dyn_cast<LoadSDNode>(BC);
43495 if (!Load || !Load->getBasePtr().hasOneUse())
43496 return false;
43497
43498 const Constant *C = getTargetConstantFromNode(Load);
43499 if (!C)
43500 return false;
43501
43502 Type *CTy = C->getType();
43503 if (!CTy->isVectorTy() ||
43504 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
43505 return false;
43506
43507 // Handle scaling for i64 elements on 32-bit targets.
43508 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
43509 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
43510 return false;
43511 unsigned Scale = NumCstElts / NumElts;
43512
43513 // Simplify mask if we have an undemanded element that is not undef.
43514 bool Simplified = false;
43515 SmallVector<Constant *, 32> ConstVecOps;
43516 for (unsigned i = 0; i != NumCstElts; ++i) {
43517 Constant *Elt = C->getAggregateElement(i);
43518 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
43519 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
43520 Simplified = true;
43521 continue;
43522 }
43523 ConstVecOps.push_back(Elt);
43524 }
43525 if (!Simplified)
43526 return false;
43527
43528 // Generate new constant pool entry + legalize immediately for the load.
43529 SDLoc DL(Op);
43530 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
43531 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
43532 SDValue NewMask = TLO.DAG.getLoad(
43533 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
43535 Load->getAlign());
43536 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
43537}
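// For instance (taking PSHUFB as one user of this helper): if only some result
// lanes are demanded, the bytes of a constant-pool shuffle mask that feed the
// undemanded lanes can be rewritten to undef and re-emitted as a fresh,
// already-legalized constant-pool load, which may unlock further folds.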
43538
43540 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
43541 TargetLoweringOpt &TLO, unsigned Depth) const {
43542 int NumElts = DemandedElts.getBitWidth();
43543 unsigned Opc = Op.getOpcode();
43544 EVT VT = Op.getValueType();
43545
43546 // Handle special case opcodes.
43547 switch (Opc) {
43548 case X86ISD::PMULDQ:
43549 case X86ISD::PMULUDQ: {
43550 APInt LHSUndef, LHSZero;
43551 APInt RHSUndef, RHSZero;
43552 SDValue LHS = Op.getOperand(0);
43553 SDValue RHS = Op.getOperand(1);
43554 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43555 Depth + 1))
43556 return true;
43557 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43558 Depth + 1))
43559 return true;
43560 // Multiply by zero.
43561 KnownZero = LHSZero | RHSZero;
43562 break;
43563 }
43564 case X86ISD::VPMADDUBSW:
43565 case X86ISD::VPMADDWD: {
43566 APInt LHSUndef, LHSZero;
43567 APInt RHSUndef, RHSZero;
43568 SDValue LHS = Op.getOperand(0);
43569 SDValue RHS = Op.getOperand(1);
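    // Each result element of these ops is formed from a pair of adjacent
    // source elements, so (roughly) demanding result element i demands source
    // elements 2*i and 2*i+1 - hence the 2x scaling of the demanded-elt mask
    // below.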
43570 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
43571
43572 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
43573 Depth + 1))
43574 return true;
43575 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
43576 Depth + 1))
43577 return true;
43578
43579 // TODO: Multiply by zero.
43580
43581 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
43582 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
43583 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
43584 Depth + 1))
43585 return true;
43586 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
43587 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
43588 Depth + 1))
43589 return true;
43590 break;
43591 }
43592 case X86ISD::PSADBW: {
43593 SDValue LHS = Op.getOperand(0);
43594 SDValue RHS = Op.getOperand(1);
43595 assert(VT.getScalarType() == MVT::i64 &&
43596 LHS.getValueType() == RHS.getValueType() &&
43597 LHS.getValueType().getScalarType() == MVT::i8 &&
43598 "Unexpected PSADBW types");
43599
43600 // Aggressively peek through ops to get at the demanded elts.
43601 if (!DemandedElts.isAllOnes()) {
43602 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
43603 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
43605 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43607 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43608 if (NewLHS || NewRHS) {
43609 NewLHS = NewLHS ? NewLHS : LHS;
43610 NewRHS = NewRHS ? NewRHS : RHS;
43611 return TLO.CombineTo(
43612 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43613 }
43614 }
43615 break;
43616 }
43617 case X86ISD::VSHL:
43618 case X86ISD::VSRL:
43619 case X86ISD::VSRA: {
43620 // We only need the bottom 64-bits of the (128-bit) shift amount.
43621 SDValue Amt = Op.getOperand(1);
43622 MVT AmtVT = Amt.getSimpleValueType();
43623 assert(AmtVT.is128BitVector() && "Unexpected value type");
43624
43625 // If we reuse the shift amount just for SSE shift amounts then we know that
43626 // only the bottom 64-bits are ever used.
43627 bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) {
43628 unsigned UseOpc = Use->getOpcode();
43629 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43630 UseOpc == X86ISD::VSRA) &&
43631 Use->getOperand(0) != Amt;
43632 });
43633
43634 APInt AmtUndef, AmtZero;
43635 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43636 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43637 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43638 Depth + 1, AssumeSingleUse))
43639 return true;
43640 [[fallthrough]];
43641 }
43642 case X86ISD::VSHLI:
43643 case X86ISD::VSRLI:
43644 case X86ISD::VSRAI: {
43645 SDValue Src = Op.getOperand(0);
43646 APInt SrcUndef;
43647 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43648 Depth + 1))
43649 return true;
43650
43651 // Fold shift(0,x) -> 0
43652 if (DemandedElts.isSubsetOf(KnownZero))
43653 return TLO.CombineTo(
43654 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43655
43656 // Aggressively peek through ops to get at the demanded elts.
43657 if (!DemandedElts.isAllOnes())
43659 Src, DemandedElts, TLO.DAG, Depth + 1))
43660 return TLO.CombineTo(
43661 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43662 break;
43663 }
43664 case X86ISD::VPSHA:
43665 case X86ISD::VPSHL:
43666 case X86ISD::VSHLV:
43667 case X86ISD::VSRLV:
43668 case X86ISD::VSRAV: {
43669 APInt LHSUndef, LHSZero;
43670 APInt RHSUndef, RHSZero;
43671 SDValue LHS = Op.getOperand(0);
43672 SDValue RHS = Op.getOperand(1);
43673 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43674 Depth + 1))
43675 return true;
43676
43677 // Fold shift(0,x) -> 0
43678 if (DemandedElts.isSubsetOf(LHSZero))
43679 return TLO.CombineTo(
43680 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43681
43682 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43683 Depth + 1))
43684 return true;
43685
43686 KnownZero = LHSZero;
43687 break;
43688 }
43689 case X86ISD::CMPM:
43690 case X86ISD::CMPP: {
43691 // Scalarize packed fp comparison if we only require element 0.
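    // e.g. if only lane 0 of a v4f32 CMPP result is demanded, compare just the
    // two lane-0 scalars: CMPP becomes a scalar FSETCC wrapped back up with
    // SCALAR_TO_VECTOR, and CMPM becomes an FSETCCM producing a single mask
    // bit inserted into an undef mask vector.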
43692 if (DemandedElts == 1) {
43693 SDLoc dl(Op);
43694 MVT VT = Op.getSimpleValueType();
43695 MVT OpSVT = Op.getOperand(0).getSimpleValueType().getScalarType();
43696 SDValue LHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(0), 0);
43697 SDValue RHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(1), 0);
43698 SDValue CC = Op.getOperand(2);
43699 if (Opc == X86ISD::CMPM) {
43700 SDValue Cmp =
43701 TLO.DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, CC);
43702 return TLO.CombineTo(
43703 Op, TLO.DAG.getInsertSubvector(dl, TLO.DAG.getUNDEF(VT), Cmp, 0));
43704 }
43705 SDValue Cmp = TLO.DAG.getNode(X86ISD::FSETCC, dl, OpSVT, LHS, RHS, CC);
43706 return TLO.CombineTo(Op,
43707 TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Cmp));
43708 }
43709 break;
43710 }
43711 case X86ISD::PCMPEQ:
43712 case X86ISD::PCMPGT: {
43713 APInt LHSUndef, LHSZero;
43714 APInt RHSUndef, RHSZero;
43715 SDValue LHS = Op.getOperand(0);
43716 SDValue RHS = Op.getOperand(1);
43717 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43718 Depth + 1))
43719 return true;
43720 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43721 Depth + 1))
43722 return true;
43723 break;
43724 }
43725 case X86ISD::KSHIFTL: {
43726 SDValue Src = Op.getOperand(0);
43727 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43728 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43729 unsigned ShiftAmt = Amt->getZExtValue();
43730
43731 if (ShiftAmt == 0)
43732 return TLO.CombineTo(Op, Src);
43733
43734 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43735 // single shift. We can do this if the bottom bits (which are shifted
43736 // out) are never demanded.
43737 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43738 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43739 unsigned C1 = Src.getConstantOperandVal(1);
43740 unsigned NewOpc = X86ISD::KSHIFTL;
43741 int Diff = ShiftAmt - C1;
43742 if (Diff < 0) {
43743 Diff = -Diff;
43744 NewOpc = X86ISD::KSHIFTR;
43745 }
43746
43747 SDLoc dl(Op);
43748 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43749 return TLO.CombineTo(
43750 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43751 }
43752 }
43753
43754 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43755 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43756 Depth + 1))
43757 return true;
43758
43759 KnownUndef <<= ShiftAmt;
43760 KnownZero <<= ShiftAmt;
43761 KnownZero.setLowBits(ShiftAmt);
43762 break;
43763 }
43764 case X86ISD::KSHIFTR: {
43765 SDValue Src = Op.getOperand(0);
43766 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43767 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43768 unsigned ShiftAmt = Amt->getZExtValue();
43769
43770 if (ShiftAmt == 0)
43771 return TLO.CombineTo(Op, Src);
43772
43773 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43774 // single shift. We can do this if the top bits (which are shifted
43775 // out) are never demanded.
43776 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43777 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43778 unsigned C1 = Src.getConstantOperandVal(1);
43779 unsigned NewOpc = X86ISD::KSHIFTR;
43780 int Diff = ShiftAmt - C1;
43781 if (Diff < 0) {
43782 Diff = -Diff;
43783 NewOpc = X86ISD::KSHIFTL;
43784 }
43785
43786 SDLoc dl(Op);
43787 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43788 return TLO.CombineTo(
43789 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43790 }
43791 }
43792
43793 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43794 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43795 Depth + 1))
43796 return true;
43797
43798 KnownUndef.lshrInPlace(ShiftAmt);
43799 KnownZero.lshrInPlace(ShiftAmt);
43800 KnownZero.setHighBits(ShiftAmt);
43801 break;
43802 }
43803 case X86ISD::ANDNP: {
43804 // ANDNP = (~LHS & RHS);
43805 SDValue LHS = Op.getOperand(0);
43806 SDValue RHS = Op.getOperand(1);
43807
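    // Sketch of the idea below: for ~LHS & RHS, if one operand is a constant
    // vector we can narrow what we demand from the other one - LHS only
    // matters in the lanes/bits where RHS is known nonzero, and RHS only
    // matters where ~LHS is known nonzero (undef constant lanes are treated
    // conservatively).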
43808 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43809 APInt UndefElts;
43810 SmallVector<APInt> EltBits;
43811 int NumElts = VT.getVectorNumElements();
43812 int EltSizeInBits = VT.getScalarSizeInBits();
43813 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43814 APInt OpElts = DemandedElts;
43815 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43816 EltBits)) {
43817 OpBits.clearAllBits();
43818 OpElts.clearAllBits();
43819 for (int I = 0; I != NumElts; ++I) {
43820 if (!DemandedElts[I])
43821 continue;
43822 if (UndefElts[I]) {
43823 // We can't assume an undef src element gives an undef dst - the
43824 // other src might be zero.
43825 OpBits.setAllBits();
43826 OpElts.setBit(I);
43827 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43828 (!Invert && !EltBits[I].isZero())) {
43829 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43830 OpElts.setBit(I);
43831 }
43832 }
43833 }
43834 return std::make_pair(OpBits, OpElts);
43835 };
43836 APInt BitsLHS, EltsLHS;
43837 APInt BitsRHS, EltsRHS;
43838 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43839 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43840
43841 APInt LHSUndef, LHSZero;
43842 APInt RHSUndef, RHSZero;
43843 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43844 Depth + 1))
43845 return true;
43846 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43847 Depth + 1))
43848 return true;
43849
43850 if (!DemandedElts.isAllOnes()) {
43851 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43852 TLO.DAG, Depth + 1);
43853 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43854 TLO.DAG, Depth + 1);
43855 if (NewLHS || NewRHS) {
43856 NewLHS = NewLHS ? NewLHS : LHS;
43857 NewRHS = NewRHS ? NewRHS : RHS;
43858 return TLO.CombineTo(
43859 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43860 }
43861 }
43862 break;
43863 }
43864 case X86ISD::CVTSI2P:
43865 case X86ISD::CVTUI2P:
43866 case X86ISD::CVTPH2PS:
43867 case X86ISD::CVTPS2PH: {
43868 SDValue Src = Op.getOperand(0);
43869 EVT SrcVT = Src.getValueType();
43870 APInt SrcUndef, SrcZero;
43871 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43872 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43873 Depth + 1))
43874 return true;
43875 break;
43876 }
43877 case X86ISD::PACKSS:
43878 case X86ISD::PACKUS: {
43879 SDValue N0 = Op.getOperand(0);
43880 SDValue N1 = Op.getOperand(1);
43881
43882 APInt DemandedLHS, DemandedRHS;
43883 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43884
43885 APInt LHSUndef, LHSZero;
43886 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43887 Depth + 1))
43888 return true;
43889 APInt RHSUndef, RHSZero;
43890 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43891 Depth + 1))
43892 return true;
43893
43894 // TODO - pass on known zero/undef.
43895
43896 // Aggressively peek through ops to get at the demanded elts.
43897 // TODO - we should do this for all target/faux shuffles ops.
43898 if (!DemandedElts.isAllOnes()) {
43899 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43900 TLO.DAG, Depth + 1);
43901 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43902 TLO.DAG, Depth + 1);
43903 if (NewN0 || NewN1) {
43904 NewN0 = NewN0 ? NewN0 : N0;
43905 NewN1 = NewN1 ? NewN1 : N1;
43906 return TLO.CombineTo(Op,
43907 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43908 }
43909 }
43910 break;
43911 }
43912 case X86ISD::HADD:
43913 case X86ISD::HSUB:
43914 case X86ISD::FHADD:
43915 case X86ISD::FHSUB: {
43916 SDValue N0 = Op.getOperand(0);
43917 SDValue N1 = Op.getOperand(1);
43918
43919 APInt DemandedLHS, DemandedRHS;
43920 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43921
43922 APInt LHSUndef, LHSZero;
43923 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43924 Depth + 1))
43925 return true;
43926 APInt RHSUndef, RHSZero;
43927 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43928 Depth + 1))
43929 return true;
43930
43931 // TODO - pass on known zero/undef.
43932
43933 // Aggressively peek through ops to get at the demanded elts.
43934 // TODO: Handle repeated operands.
43935 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43936 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43937 TLO.DAG, Depth + 1);
43938 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43939 TLO.DAG, Depth + 1);
43940 if (NewN0 || NewN1) {
43941 NewN0 = NewN0 ? NewN0 : N0;
43942 NewN1 = NewN1 ? NewN1 : N1;
43943 return TLO.CombineTo(Op,
43944 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43945 }
43946 }
43947 break;
43948 }
43949 case X86ISD::VTRUNC:
43950 case X86ISD::VTRUNCS:
43951 case X86ISD::VTRUNCUS: {
43952 SDValue Src = Op.getOperand(0);
43953 MVT SrcVT = Src.getSimpleValueType();
43954 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43955 APInt SrcUndef, SrcZero;
43956 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43957 Depth + 1))
43958 return true;
43959 KnownZero = SrcZero.zextOrTrunc(NumElts);
43960 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43961 break;
43962 }
43963 case X86ISD::BLENDI: {
43964 SmallVector<int, 16> BlendMask;
43965 DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
43967 VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
43968 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
43969 return TLO.CombineTo(Op, R);
43970 break;
43971 }
43972 case X86ISD::BLENDV: {
43973 APInt SelUndef, SelZero;
43974 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43975 SelZero, TLO, Depth + 1))
43976 return true;
43977
43978 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43979 APInt LHSUndef, LHSZero;
43980 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43981 LHSZero, TLO, Depth + 1))
43982 return true;
43983
43984 APInt RHSUndef, RHSZero;
43985 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43986 RHSZero, TLO, Depth + 1))
43987 return true;
43988
43989 KnownZero = LHSZero & RHSZero;
43990 KnownUndef = LHSUndef & RHSUndef;
43991 break;
43992 }
43993 case X86ISD::VZEXT_MOVL: {
43994 // If upper demanded elements are already zero then we have nothing to do.
43995 SDValue Src = Op.getOperand(0);
43996 APInt DemandedUpperElts = DemandedElts;
43997 DemandedUpperElts.clearLowBits(1);
43998 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43999 return TLO.CombineTo(Op, Src);
44000 break;
44001 }
44002 case X86ISD::VZEXT_LOAD: {
44003 // If the upper elements are not demanded then simplify to a
44004 // scalar_to_vector(load()).
44006 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
44007 SDLoc DL(Op);
44008 auto *Mem = cast<MemSDNode>(Op);
44009 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
44010 Mem->getMemOperand());
44011 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
44012 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
44013 }
44014 break;
44015 }
44016 case X86ISD::VBROADCAST: {
44017 SDValue Src = Op.getOperand(0);
44018 MVT SrcVT = Src.getSimpleValueType();
44019 // Don't bother broadcasting if we just need the 0'th element.
44020 if (DemandedElts == 1) {
44021 if (!SrcVT.isVector())
44022 Src = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op), VT, Src);
44023 else if (Src.getValueType() != VT)
44024 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
44025 SDLoc(Op));
44026 return TLO.CombineTo(Op, Src);
44027 }
44028 if (!SrcVT.isVector())
44029 break;
44030 APInt SrcUndef, SrcZero;
44031 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
44032 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
44033 Depth + 1))
44034 return true;
44035 // Aggressively peek through src to get at the demanded elt.
44036 // TODO - we should do this for all target/faux shuffles ops.
44038 Src, SrcElts, TLO.DAG, Depth + 1))
44039 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44040 break;
44041 }
44042 case X86ISD::VPERMV:
44043 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
44044 Depth))
44045 return true;
44046 break;
44047 case X86ISD::PSHUFB:
44048 case X86ISD::VPERMV3:
44049 case X86ISD::VPERMILPV:
44050 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
44051 Depth))
44052 return true;
44053 break;
44054 case X86ISD::VPPERM:
44055 case X86ISD::VPERMIL2:
44056 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
44057 Depth))
44058 return true;
44059 break;
44060 }
44061
44062 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
44063 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
44064 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
44065 if ((VT.is256BitVector() || VT.is512BitVector()) &&
44066 DemandedElts.lshr(NumElts / 2) == 0) {
44067 unsigned SizeInBits = VT.getSizeInBits();
44068 unsigned ExtSizeInBits = SizeInBits / 2;
44069
44070 // See if 512-bit ops only use the bottom 128-bits.
44071 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
44072 ExtSizeInBits = SizeInBits / 4;
44073
44074 switch (Opc) {
44075 // Scalar broadcast.
44076 case X86ISD::VBROADCAST: {
44077 SDLoc DL(Op);
44078 SDValue Src = Op.getOperand(0);
44079 if (Src.getValueSizeInBits() > ExtSizeInBits)
44080 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
44081 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44082 ExtSizeInBits / VT.getScalarSizeInBits());
44083 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
44084 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44085 TLO.DAG, DL, ExtSizeInBits));
44086 }
44088 SDLoc DL(Op);
44089 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44090 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44091 ExtSizeInBits / VT.getScalarSizeInBits());
44092 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
44093 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
44094 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
44095 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
44096 MemIntr->getMemOperand());
44098 Bcst.getValue(1));
44099 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44100 TLO.DAG, DL, ExtSizeInBits));
44101 }
44102 // Subvector broadcast.
44104 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44105 EVT MemVT = MemIntr->getMemoryVT();
44106 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
44107 SDLoc DL(Op);
44108 SDValue Ld =
44109 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
44110 MemIntr->getBasePtr(), MemIntr->getMemOperand());
44112 Ld.getValue(1));
44113 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
44114 TLO.DAG, DL, ExtSizeInBits));
44115 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
44116 SDLoc DL(Op);
44117 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44118 ExtSizeInBits / VT.getScalarSizeInBits());
44119 if (SDValue BcstLd =
44120 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
44121 return TLO.CombineTo(Op,
44122 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
44123 TLO.DAG, DL, ExtSizeInBits));
44124 }
44125 break;
44126 }
44127 // Byte shifts by immediate.
44128 case X86ISD::VSHLDQ:
44129 case X86ISD::VSRLDQ:
44130 // Shift by uniform.
44131 case X86ISD::VSHL:
44132 case X86ISD::VSRL:
44133 case X86ISD::VSRA:
44134 // Shift by immediate.
44135 case X86ISD::VSHLI:
44136 case X86ISD::VSRLI:
44137 case X86ISD::VSRAI: {
44138 SDLoc DL(Op);
44139 SDValue Ext0 =
44140 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
44141 SDValue ExtOp =
44142 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
44143 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44144 SDValue Insert =
44145 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44146 return TLO.CombineTo(Op, Insert);
44147 }
44148 case X86ISD::VPERMI: {
44149 // Simplify 256-bit PERMPD/PERMQ to extract_subvector.
44150 // TODO: This should be done in shuffle combining.
44151 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
44153 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
44154 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
44155 SDLoc DL(Op);
44156 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
44157 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44158 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
44159 return TLO.CombineTo(Op, Insert);
44160 }
44161 }
44162 // Simplify 512-bit PERMPD/PERMQ to 256-bit variant on lower half.
44163 if (VT == MVT::v8f64 || VT == MVT::v8i64) {
44164 SDLoc DL(Op);
44165 SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, 256);
44166 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0,
44167 Op.getOperand(1));
44168 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44169 SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, 256);
44170 return TLO.CombineTo(Op, Insert);
44171 }
44172 break;
44173 }
44174 case X86ISD::VPERMV: {
44177 // We can always split v16i32/v16f32 AVX512 to v8i32/v8f32 AVX2 variants.
44178 if ((VT.is256BitVector() || Subtarget.hasVLX() || VT == MVT::v16i32 ||
44179 VT == MVT::v16f32) &&
44180 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44181 // For lane-crossing shuffles, only split in half in case we're still
44182 // referencing higher elements.
44183 unsigned HalfElts = NumElts / 2;
44184 unsigned HalfSize = SizeInBits / 2;
44185 Mask.resize(HalfElts);
44186 if (all_of(Mask,
44187 [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
44189 SDLoc DL(Op);
44190 SDValue Ext;
44191 SDValue M =
44192 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
44193 SDValue V =
44194 extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
44195 // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
44196 if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
44197 Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
44198 else {
44200 MVT ShufVT = HalfVT.changeVectorElementType(ShufSVT);
44201 Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, ShufVT,
44202 TLO.DAG.getBitcast(ShufVT, V), M);
44203 Ext = TLO.DAG.getBitcast(HalfVT, Ext);
44204 }
44205 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44206 Subtarget, TLO.DAG, DL, SizeInBits);
44207 return TLO.CombineTo(Op, Insert);
44208 }
44209 }
44210 break;
44211 }
44212 case X86ISD::VPERMV3: {
44215 if (Subtarget.hasVLX() &&
44216 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44217 // For lane-crossing shuffles, only split in half in case we're still
44218 // referencing higher elements.
44219 unsigned HalfElts = NumElts / 2;
44220 unsigned HalfSize = SizeInBits / 2;
44221 Mask.resize(HalfElts);
44222 if (all_of(Mask, [&](int M) {
44223 return isUndefOrInRange(M, 0, HalfElts) ||
44224 isUndefOrInRange(M, NumElts, NumElts + HalfElts);
44225 })) {
44226 // Adjust mask elements for 2nd operand to point to half width.
44227 for (int &M : Mask)
44228 M = (M < NumElts) ? M : (M - HalfElts);
44230 MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
44231 SDLoc DL(Op);
44232 SDValue Ext = TLO.DAG.getNode(
44233 Opc, DL, HalfVT,
44234 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
44235 getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
44236 extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
44237 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44238 Subtarget, TLO.DAG, DL, SizeInBits);
44239 return TLO.CombineTo(Op, Insert);
44240 }
44241 }
44242 break;
44243 }
44244 case X86ISD::VPERM2X128: {
44245 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
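      // Only the low half of the result is demanded here, so decode just the
      // low nibble of the immediate: bit 3 zeroes that half, bit 0 selects the
      // 128-bit lane within the source, and bit 1 selects which source operand
      // is used.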
44246 SDLoc DL(Op);
44247 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
44248 if (LoMask & 0x8)
44249 return TLO.CombineTo(
44250 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
44251 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
44252 unsigned SrcIdx = (LoMask & 0x2) >> 1;
44253 SDValue ExtOp =
44254 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
44255 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44256 SDValue Insert =
44257 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44258 return TLO.CombineTo(Op, Insert);
44259 }
44260 // Conversions.
44261 // TODO: Add more CVT opcodes when we have test coverage.
44262 case X86ISD::CVTTP2UI: {
44263 if (!Subtarget.hasVLX())
44264 break;
44265 [[fallthrough]];
44266 }
44267 case X86ISD::CVTTP2SI: {
44268 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 &&
44269 !Subtarget.hasVLX())
44270 break;
44271 [[fallthrough]];
44272 }
44273 case X86ISD::CVTPH2PS: {
44274 SDLoc DL(Op);
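      // The conversion source is narrowed by the same factor as the result,
      // but never below 128 bits, the narrowest legal vector width for the
      // source here.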
44275 unsigned Scale = SizeInBits / ExtSizeInBits;
44276 SDValue SrcOp = Op.getOperand(0);
44277 MVT SrcVT = SrcOp.getSimpleValueType();
44278 unsigned SrcExtSize =
44279 std::max<unsigned>(SrcVT.getSizeInBits() / Scale, 128);
44281 ExtSizeInBits / VT.getScalarSizeInBits());
44282 SDValue ExtOp = TLO.DAG.getNode(
44283 Opc, DL, ExtVT, extractSubVector(SrcOp, 0, TLO.DAG, DL, SrcExtSize));
44284 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44285 SDValue Insert =
44286 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44287 return TLO.CombineTo(Op, Insert);
44288 }
44289 // Zero upper elements.
44290 case X86ISD::VZEXT_MOVL:
44291 // Variable blend.
44292 case X86ISD::BLENDV:
44293 // Target unary shuffles:
44294 case X86ISD::MOVDDUP:
44295 // Target unary shuffles by immediate:
44296 case X86ISD::PSHUFD:
44297 case X86ISD::PSHUFLW:
44298 case X86ISD::PSHUFHW:
44299 case X86ISD::VPERMILPI:
44300 // (Non-Lane Crossing) Target Shuffles.
44301 case X86ISD::VPERMILPV:
44302 case X86ISD::VPERMIL2:
44303 case X86ISD::PSHUFB:
44304 case X86ISD::UNPCKL:
44305 case X86ISD::UNPCKH:
44306 case X86ISD::BLENDI:
44307 // Integer ops.
44308 case X86ISD::PACKSS:
44309 case X86ISD::PACKUS:
44310 case X86ISD::PCMPEQ:
44311 case X86ISD::PCMPGT:
44312 case X86ISD::PMULUDQ:
44313 case X86ISD::PMULDQ:
44314 case X86ISD::VSHLV:
44315 case X86ISD::VSRLV:
44316 case X86ISD::VSRAV:
44317 // Float ops.
44318 case X86ISD::FMAX:
44319 case X86ISD::FMIN:
44320 case X86ISD::FMAXC:
44321 case X86ISD::FMINC:
44322 case X86ISD::FRSQRT:
44323 case X86ISD::FRCP:
44324 // Horizontal Ops.
44325 case X86ISD::HADD:
44326 case X86ISD::HSUB:
44327 case X86ISD::FHADD:
44328 case X86ISD::FHSUB: {
44329 SDLoc DL(Op);
44331 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
44332 SDValue SrcOp = Op.getOperand(i);
44333 EVT SrcVT = SrcOp.getValueType();
44334 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
44335 "Unsupported vector size");
44336 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
44337 ExtSizeInBits)
44338 : SrcOp);
44339 }
44340 MVT ExtVT = VT.getSimpleVT();
44341 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
44342 ExtSizeInBits / ExtVT.getScalarSizeInBits());
44343 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
44344 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44345 SDValue Insert =
44346 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44347 return TLO.CombineTo(Op, Insert);
44348 }
44349 }
44350 }
44351
44352 // For splats, unless we *only* demand the 0'th element, stop attempting
44353 // simplification here: we aren't going to improve things, and keeping the
44354 // splat is better than any potential shuffle.
44355 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
44356 return false;
44357
44358 // Get target/faux shuffle mask.
44359 APInt OpUndef, OpZero;
44360 SmallVector<int, 64> OpMask;
44361 SmallVector<SDValue, 2> OpInputs;
44362 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
44363 OpZero, TLO.DAG, Depth, false))
44364 return false;
44365
44366 // Shuffle inputs must be the same size as the result.
44367 if (OpMask.size() != (unsigned)NumElts ||
44368 llvm::any_of(OpInputs, [VT](SDValue V) {
44369 return VT.getSizeInBits() != V.getValueSizeInBits() ||
44370 !V.getValueType().isVector();
44371 }))
44372 return false;
44373
44374 KnownZero = OpZero;
44375 KnownUndef = OpUndef;
44376
44377 // Check if shuffle mask can be simplified to undef/zero/identity.
44378 int NumSrcs = OpInputs.size();
44379 for (int i = 0; i != NumElts; ++i)
44380 if (!DemandedElts[i])
44381 OpMask[i] = SM_SentinelUndef;
44382
44383 if (isUndefInRange(OpMask, 0, NumElts)) {
44384 KnownUndef.setAllBits();
44385 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
44386 }
44387 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
44388 KnownZero.setAllBits();
44389 return TLO.CombineTo(
44390 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
44391 }
44392 for (int Src = 0; Src != NumSrcs; ++Src)
44393 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
44394 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
44395
44396 // Attempt to simplify inputs.
44397 for (int Src = 0; Src != NumSrcs; ++Src) {
44398 // TODO: Support inputs of different types.
44399 if (OpInputs[Src].getValueType() != VT)
44400 continue;
44401
44402 int Lo = Src * NumElts;
44403 APInt SrcElts = APInt::getZero(NumElts);
44404 for (int i = 0; i != NumElts; ++i)
44405 if (DemandedElts[i]) {
44406 int M = OpMask[i] - Lo;
44407 if (0 <= M && M < NumElts)
44408 SrcElts.setBit(M);
44409 }
44410
44411 // TODO - Propagate input undef/zero elts.
44412 APInt SrcUndef, SrcZero;
44413 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
44414 TLO, Depth + 1))
44415 return true;
44416 }
44417
44418 // If we don't demand all elements, then attempt to combine to a simpler
44419 // shuffle.
44420 // We need to convert the depth to something combineX86ShufflesRecursively
44421 // can handle - so pretend its Depth == 0 again, and reduce the max depth
44422 // to match. This prevents combineX86ShuffleChain from returning a
44423 // combined shuffle that's the same as the original root, causing an
44424 // infinite loop.
44425 if (!DemandedElts.isAllOnes()) {
44426 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
44427
44428 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
44429 for (int i = 0; i != NumElts; ++i)
44430 if (DemandedElts[i])
44431 DemandedMask[i] = i;
44432
44434 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), DemandedMask, {}, 0,
44436 /*AllowVariableCrossLaneMask=*/true,
44437 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget),
44438 TLO.DAG, SDLoc(Op), Subtarget);
44439 if (NewShuffle)
44440 return TLO.CombineTo(Op, NewShuffle);
44441 }
44442
44443 return false;
44444}
44445
44447 SDValue Op, const APInt &OriginalDemandedBits,
44448 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
44449 unsigned Depth) const {
44450 EVT VT = Op.getValueType();
44451 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
44452 unsigned Opc = Op.getOpcode();
44453 switch (Opc) {
44454 case X86ISD::VTRUNC: {
44455 KnownBits KnownOp;
44456 SDValue Src = Op.getOperand(0);
44457 MVT SrcVT = Src.getSimpleValueType();
44458
44459 // Simplify the input, using demanded bit information.
44460 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
44461 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
44462 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
44463 return true;
44464 break;
44465 }
44466 case X86ISD::PMULDQ:
44467 case X86ISD::PMULUDQ: {
44468 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
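    // (Each 64-bit result lane is the product of the low 32 bits of the
    // corresponding input lanes, so the upper 32 bits of each input never
    // affect the result.)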
44469 KnownBits KnownLHS, KnownRHS;
44470 SDValue LHS = Op.getOperand(0);
44471 SDValue RHS = Op.getOperand(1);
44472
44473 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
44474 // FIXME: Can we bound this better?
44475 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
44476 APInt DemandedMaskLHS = APInt::getAllOnes(64);
44477 APInt DemandedMaskRHS = APInt::getAllOnes(64);
44478
44479 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
44480 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
44481 DemandedMaskLHS = DemandedMask;
44482 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
44483 DemandedMaskRHS = DemandedMask;
44484
44485 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
44486 KnownLHS, TLO, Depth + 1))
44487 return true;
44488 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
44489 KnownRHS, TLO, Depth + 1))
44490 return true;
44491
44492 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
44493 KnownRHS = KnownRHS.trunc(32);
44494 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
44495 KnownRHS.getConstant().isOne()) {
44496 SDLoc DL(Op);
44497 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
44498 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
44499 }
44500
44501 // Aggressively peek through ops to get at the demanded low bits.
44503 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44505 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44506 if (DemandedLHS || DemandedRHS) {
44507 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
44508 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
44509 return TLO.CombineTo(
44510 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
44511 }
44512 break;
44513 }
44514 case X86ISD::ANDNP: {
44515 KnownBits Known2;
44516 SDValue Op0 = Op.getOperand(0);
44517 SDValue Op1 = Op.getOperand(1);
44518
44519 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
44520 Known, TLO, Depth + 1))
44521 return true;
44522
44523 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
44524 OriginalDemandedElts, Known2, TLO, Depth + 1))
44525 return true;
44526
44527 // If the RHS is a constant, see if we can simplify it.
44528 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
44529 OriginalDemandedElts, TLO))
44530 return true;
44531
44532 // ANDNP = (~Op0 & Op1);
44533 Known.One &= Known2.Zero;
44534 Known.Zero |= Known2.One;
44535 break;
44536 }
44537 case X86ISD::VSHLI: {
44538 SDValue Op0 = Op.getOperand(0);
44539 SDValue Op1 = Op.getOperand(1);
44540
44541 unsigned ShAmt = Op1->getAsZExtVal();
44542 if (ShAmt >= BitWidth)
44543 break;
44544
44545 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
44546
44547 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
44548 // single shift. We can do this if the bottom bits (which are shifted
44549 // out) are never demanded.
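    // e.g. if only bits 5 and above of ((X >>u 3) << 5) are demanded, the pair
    // of shifts can be replaced by a single (X << 2).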
44550 if (Op0.getOpcode() == X86ISD::VSRLI &&
44551 OriginalDemandedBits.countr_zero() >= ShAmt) {
44552 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
44553 if (Shift2Amt < BitWidth) {
44554 int Diff = ShAmt - Shift2Amt;
44555 if (Diff == 0)
44556 return TLO.CombineTo(Op, Op0.getOperand(0));
44557
44558 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
44559 SDValue NewShift = TLO.DAG.getNode(
44560 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
44561 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
44562 return TLO.CombineTo(Op, NewShift);
44563 }
44564 }
44565
44566 // If we are only demanding sign bits then we can use the shift source directly.
44567 unsigned NumSignBits =
44568 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
44569 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
44570 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44571 return TLO.CombineTo(Op, Op0);
44572
44573 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44574 TLO, Depth + 1))
44575 return true;
44576
44577 Known <<= ShAmt;
44578
44579 // Low bits known zero.
44580 Known.Zero.setLowBits(ShAmt);
44581
44582 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44583 // Attempt to avoid multi-use ops if we don't need anything from them.
44584 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44585 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44586 SDValue NewOp =
44587 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44588 return TLO.CombineTo(Op, NewOp);
44589 }
44590 }
44591 return false;
44592 }
44593 case X86ISD::VSRLI: {
44594 SDValue Op0 = Op.getOperand(0);
44595 SDValue Op1 = Op.getOperand(1);
44596
44597 unsigned ShAmt = Op1->getAsZExtVal();
44598 if (ShAmt >= BitWidth)
44599 break;
44600
44601 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44602
44603 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44604 TLO, Depth + 1))
44605 return true;
44606
44607 Known >>= ShAmt;
44608
44609 // High bits known zero.
44610 Known.Zero.setHighBits(ShAmt);
44611
44612 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44613 // Attempt to avoid multi-use ops if we don't need anything from them.
44614 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44615 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44616 SDValue NewOp =
44617 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44618 return TLO.CombineTo(Op, NewOp);
44619 }
44620 }
44621 return false;
44622 }
44623 case X86ISD::VSRAI: {
44624 SDValue Op0 = Op.getOperand(0);
44625 SDValue Op1 = Op.getOperand(1);
44626
44627 unsigned ShAmt = Op1->getAsZExtVal();
44628 if (ShAmt >= BitWidth)
44629 break;
44630
44631 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44632
44633 // If we just want the sign bit then we don't need to shift it.
44634 if (OriginalDemandedBits.isSignMask())
44635 return TLO.CombineTo(Op, Op0);
44636
44637 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
44638 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
44639 SDValue Op00 = Op0.getOperand(0);
44640 unsigned NumSignBits =
44641 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
44642 if (ShAmt < NumSignBits)
44643 return TLO.CombineTo(Op, Op00);
44644 }
44645
44646 // If any of the demanded bits are produced by the sign extension, we also
44647 // demand the input sign bit.
44648 if (OriginalDemandedBits.countl_zero() < ShAmt)
44649 DemandedMask.setSignBit();
44650
44651 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44652 TLO, Depth + 1))
44653 return true;
44654
44655 Known >>= ShAmt;
44656
44657 // If the input sign bit is known to be zero, or if none of the top bits
44658 // are demanded, turn this into an unsigned shift right.
44659 if (Known.Zero[BitWidth - ShAmt - 1] ||
44660 OriginalDemandedBits.countl_zero() >= ShAmt)
44661 return TLO.CombineTo(
44662 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
44663
44664 // High bits are known one.
44665 if (Known.One[BitWidth - ShAmt - 1])
44666 Known.One.setHighBits(ShAmt);
44667
44668 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44669 // Attempt to avoid multi-use ops if we don't need anything from them.
44670 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44671 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44672 SDValue NewOp =
44673 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44674 return TLO.CombineTo(Op, NewOp);
44675 }
44676 }
44677 return false;
44678 }
44679 case X86ISD::BLENDI: {
44680 SDValue LHS = Op.getOperand(0);
44681 SDValue RHS = Op.getOperand(1);
44682 APInt Mask = getBLENDIBlendMask(Op);
44683
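    // Lanes whose blend-mask bit is set come from RHS, so LHS only needs the
    // demanded lanes where the bit is clear and RHS only those where it is
    // set.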
44684 APInt DemandedEltsLHS = OriginalDemandedElts & ~Mask;
44685 if (SimplifyDemandedBits(LHS, OriginalDemandedBits, DemandedEltsLHS, Known,
44686 TLO, Depth + 1))
44687 return true;
44688
44689 APInt DemandedEltsRHS = OriginalDemandedElts & Mask;
44690 if (SimplifyDemandedBits(RHS, OriginalDemandedBits, DemandedEltsRHS, Known,
44691 TLO, Depth + 1))
44692 return true;
44693
44694 // Attempt to avoid multi-use ops if we don't need anything from them.
44696 LHS, OriginalDemandedBits, DemandedEltsLHS, TLO.DAG, Depth + 1);
44698 RHS, OriginalDemandedBits, DemandedEltsRHS, TLO.DAG, Depth + 1);
44699 if (NewLHS || NewRHS) {
44700 NewLHS = NewLHS ? NewLHS : LHS;
44701 NewRHS = NewRHS ? NewRHS : RHS;
44702 return TLO.CombineTo(Op,
44703 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT,
44704 NewLHS, NewRHS, Op.getOperand(2)));
44705 }
44706 break;
44707 }
44708 case X86ISD::BLENDV: {
44709 SDValue Sel = Op.getOperand(0);
44710 SDValue LHS = Op.getOperand(1);
44711 SDValue RHS = Op.getOperand(2);
44712
44713 APInt SignMask = APInt::getSignMask(BitWidth);
44715 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44717 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44719 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44720
44721 if (NewSel || NewLHS || NewRHS) {
44722 NewSel = NewSel ? NewSel : Sel;
44723 NewLHS = NewLHS ? NewLHS : LHS;
44724 NewRHS = NewRHS ? NewRHS : RHS;
44725 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
44726 NewSel, NewLHS, NewRHS));
44727 }
44728 break;
44729 }
44730 case X86ISD::PEXTRB:
44731 case X86ISD::PEXTRW: {
44732 SDValue Vec = Op.getOperand(0);
44733 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
44734 MVT VecVT = Vec.getSimpleValueType();
44735 unsigned NumVecElts = VecVT.getVectorNumElements();
44736
44737 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44738 unsigned Idx = CIdx->getZExtValue();
44739 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
44740
44741 // If we demand no bits from the vector then we must have demanded
44742 // bits from the implicit zext - simplify to zero.
44743 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
44744 if (DemandedVecBits == 0)
44745 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44746
44747 APInt KnownUndef, KnownZero;
44748 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
44749 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
44750 KnownZero, TLO, Depth + 1))
44751 return true;
44752
44753 KnownBits KnownVec;
44754 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
44755 KnownVec, TLO, Depth + 1))
44756 return true;
44757
44759 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44760 return TLO.CombineTo(
44761 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
44762
44763 Known = KnownVec.zext(BitWidth);
44764 return false;
44765 }
44766 break;
44767 }
44768 case X86ISD::PINSRB:
44769 case X86ISD::PINSRW: {
44770 SDValue Vec = Op.getOperand(0);
44771 SDValue Scl = Op.getOperand(1);
44772 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44773 MVT VecVT = Vec.getSimpleValueType();
44774
44775 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44776 unsigned Idx = CIdx->getZExtValue();
44777 if (!OriginalDemandedElts[Idx])
44778 return TLO.CombineTo(Op, Vec);
44779
44780 KnownBits KnownVec;
44781 APInt DemandedVecElts(OriginalDemandedElts);
44782 DemandedVecElts.clearBit(Idx);
44783 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
44784 KnownVec, TLO, Depth + 1))
44785 return true;
44786
44787 KnownBits KnownScl;
44788 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
44789 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
44790 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44791 return true;
44792
44793 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
44794 Known = KnownVec.intersectWith(KnownScl);
44795 return false;
44796 }
44797 break;
44798 }
44799 case X86ISD::PACKSS:
44800 // PACKSS saturates to MIN/MAX integer values. So if we just want the
44801 // sign bit then we can just ask for the sign bit of the source operands.
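    // (Saturation preserves the sign, so the sign bit of each narrow result
    // element equals the sign bit of the corresponding wide source element -
    // hence the SignMask below is built at twice the result bit width.)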
44802 // TODO - add known bits handling.
44803 if (OriginalDemandedBits.isSignMask()) {
44804 APInt DemandedLHS, DemandedRHS;
44805 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
44806
44807 KnownBits KnownLHS, KnownRHS;
44808 APInt SignMask = APInt::getSignMask(BitWidth * 2);
44809 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
44810 KnownLHS, TLO, Depth + 1))
44811 return true;
44812 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
44813 KnownRHS, TLO, Depth + 1))
44814 return true;
44815
44816 // Attempt to avoid multi-use ops if we don't need anything from them.
44818 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44820 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44821 if (DemandedOp0 || DemandedOp1) {
44822 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
44823 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
44824 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
44825 }
44826 }
44827 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44828 break;
44829 case X86ISD::VBROADCAST: {
44830 SDValue Src = Op.getOperand(0);
44831 MVT SrcVT = Src.getSimpleValueType();
44832 APInt DemandedElts = APInt::getOneBitSet(
44833 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
44834 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
44835 TLO, Depth + 1))
44836 return true;
44837 // If we don't need the upper bits, attempt to narrow the broadcast source.
44838 // Don't attempt this on AVX512 as it might affect broadcast folding.
44839 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
44840 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
44841 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
44842 Src->hasOneUse()) {
44843 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
44844 SDValue NewSrc =
44845 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
44846 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
44847 SDValue NewBcst =
44848 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
44849 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
44850 }
44851 break;
44852 }
44853 case X86ISD::PCMPGT:
44854 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44855 // iff we only need the sign bit then we can use R directly.
44856 if (OriginalDemandedBits.isSignMask() &&
44857 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44858 return TLO.CombineTo(Op, Op.getOperand(1));
44859 break;
44860 case X86ISD::MOVMSK: {
44861 SDValue Src = Op.getOperand(0);
44862 MVT SrcVT = Src.getSimpleValueType();
44863 unsigned SrcBits = SrcVT.getScalarSizeInBits();
44864 unsigned NumElts = SrcVT.getVectorNumElements();
44865
44866 // If we don't need the sign bits at all just return zero.
44867 if (OriginalDemandedBits.countr_zero() >= NumElts)
44868 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44869
44870 // See if we only demand bits from the lower 128-bit vector.
44871 if (SrcVT.is256BitVector() &&
44872 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
44873 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
44874 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44875 }
44876
44877 // Only demand the vector elements of the sign bits we need.
44878 APInt KnownUndef, KnownZero;
44879 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
44880 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
44881 TLO, Depth + 1))
44882 return true;
44883
44884 Known.Zero = KnownZero.zext(BitWidth);
44885 Known.Zero.setHighBits(BitWidth - NumElts);
44886
44887 // MOVMSK only uses the MSB from each vector element.
44888 KnownBits KnownSrc;
44889 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44890 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44891 Depth + 1))
44892 return true;
44893
44894 if (KnownSrc.One[SrcBits - 1])
44895 Known.One.setLowBits(NumElts);
44896 else if (KnownSrc.Zero[SrcBits - 1])
44897 Known.Zero.setLowBits(NumElts);
44898
44899 // Attempt to avoid multi-use ops if we don't need anything from it.
44900 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
44901 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44902 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44903 return false;
44904 }
44905 case X86ISD::TESTP: {
44906 SDValue Op0 = Op.getOperand(0);
44907 SDValue Op1 = Op.getOperand(1);
44908 MVT OpVT = Op0.getSimpleValueType();
44909 assert((OpVT.getVectorElementType() == MVT::f32 ||
44910 OpVT.getVectorElementType() == MVT::f64) &&
44911 "Illegal vector type for X86ISD::TESTP");
44912
44913 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
44914 KnownBits KnownSrc;
44915 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44916 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44917 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44918 AssumeSingleUse) ||
44919 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44920 AssumeSingleUse);
44921 }
44922 case X86ISD::CMOV: {
44923 KnownBits Known2;
44924 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
44925 OriginalDemandedElts, Known2, TLO, Depth + 1))
44926 return true;
44927 if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
44928 OriginalDemandedElts, Known, TLO, Depth + 1))
44929 return true;
44930
44931 // Only known if known in both the LHS and RHS.
44932 Known = Known.intersectWith(Known2);
44933 return false;
44934 }
44935 case X86ISD::BEXTR:
44936 case X86ISD::BEXTRI: {
44937 SDValue Op0 = Op.getOperand(0);
44938 SDValue Op1 = Op.getOperand(1);
44939
44940 // Only bottom 16-bits of the control bits are required.
44941 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44942 // NOTE: SimplifyDemandedBits won't do this for constants.
44943 uint64_t Val1 = Cst1->getZExtValue();
44944 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44945 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44946 SDLoc DL(Op);
44947 return TLO.CombineTo(
44948 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44949 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44950 }
44951
44952 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44953 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44954
44955 // If the length is 0, the result is 0.
44956 if (Length == 0) {
44957 Known.setAllZero();
44958 return false;
44959 }
44960
44961 if ((Shift + Length) <= BitWidth) {
44962 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44963 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44964 return true;
44965
44966 Known = Known.extractBits(Length, Shift);
44967 Known = Known.zextOrTrunc(BitWidth);
44968 return false;
44969 }
44970 } else {
44971 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44972 KnownBits Known1;
44973 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44974 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44975 return true;
44976
44977 // If the length is 0, replace with 0.
44978 KnownBits LengthBits = Known1.extractBits(8, 8);
44979 if (LengthBits.isZero())
44980 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44981 }
44982
44983 break;
44984 }
44985 case X86ISD::PDEP: {
44986 SDValue Op0 = Op.getOperand(0);
44987 SDValue Op1 = Op.getOperand(1);
44988
44989 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44990 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44991
44992 // If the demanded bits have leading zeroes, we don't demand those from the
44993 // mask.
44994 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44995 return true;
44996
44997 // The number of possible 1s in the mask determines the number of LSBs of
44998 // operand 0 used. Undemanded bits from the mask don't matter so filter
44999 // them before counting.
45000 KnownBits Known2;
45001 uint64_t Count = (~Known.Zero & LoMask).popcount();
45002 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
45003 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
45004 return true;
45005
45006 // Zeroes are retained from the mask, but not ones.
45007 Known.One.clearAllBits();
45008 // The result will have at least as many trailing zeros as the non-mask
45009 // operand since bits can only map to the same or higher bit position.
45010 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
45011 return false;
45012 }
45013 case X86ISD::VPMADD52L:
45014 case X86ISD::VPMADD52H: {
45015 KnownBits KnownOp0, KnownOp1, KnownOp2;
45016 SDValue Op0 = Op.getOperand(0);
45017 SDValue Op1 = Op.getOperand(1);
45018 SDValue Op2 = Op.getOperand(2);
45019 // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of
45020 // operand 2).
45021 APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52);
45022 if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0,
45023 TLO, Depth + 1))
45024 return true;
45025
45026 if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1,
45027 TLO, Depth + 1))
45028 return true;
45029
45030 if (SimplifyDemandedBits(Op2, APInt::getAllOnes(64), OriginalDemandedElts,
45031 KnownOp2, TLO, Depth + 1))
45032 return true;
45033
45034 KnownBits KnownMul;
45035 KnownOp0 = KnownOp0.trunc(52);
45036 KnownOp1 = KnownOp1.trunc(52);
45037 KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1)
45038 : KnownBits::mulhu(KnownOp0, KnownOp1);
45039 KnownMul = KnownMul.zext(64);
45040
45041 // lo/hi(X * Y) + Z --> C + Z
45042 if (KnownMul.isConstant()) {
45043 SDLoc DL(Op);
45044 SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT);
45045 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2));
45046 }
45047
45048 Known = KnownBits::add(KnownMul, KnownOp2);
45049 return false;
45050 }
45051 }
45052
45053 return TargetLowering::SimplifyDemandedBitsForTargetNode(
45054 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
45055}
45056
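// Attempt to return a simplified value for Op when only DemandedBits /
// DemandedElts are required, without creating any new nodes: e.g. bypass
// PINSR nodes whose inserted element is not demanded, use shift/compare
// sources directly when only sign bits are needed, resolve BLENDV/ANDNP from
// known condition bits, and look through identity target shuffles.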
45057SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45058 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
45059 SelectionDAG &DAG, unsigned Depth) const {
45060 int NumElts = DemandedElts.getBitWidth();
45061 unsigned Opc = Op.getOpcode();
45062 EVT VT = Op.getValueType();
45063
45064 switch (Opc) {
45065 case X86ISD::PINSRB:
45066 case X86ISD::PINSRW: {
45067 // If we don't demand the inserted element, return the base vector.
45068 SDValue Vec = Op.getOperand(0);
45069 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
45070 MVT VecVT = Vec.getSimpleValueType();
45071 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
45072 !DemandedElts[CIdx->getZExtValue()])
45073 return Vec;
45074 break;
45075 }
45076 case X86ISD::VSHLI: {
45077 // If we are only demanding sign bits then we can use the shift source
45078 // directly.
45079 SDValue Op0 = Op.getOperand(0);
45080 unsigned ShAmt = Op.getConstantOperandVal(1);
45081 unsigned BitWidth = DemandedBits.getBitWidth();
45082 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
45083 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
45084 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
45085 return Op0;
45086 break;
45087 }
45088 case X86ISD::VSRAI:
45089 // iff we only need the sign bit then we can use the source directly.
45090 // TODO: generalize where we only demand extended signbits.
45091 if (DemandedBits.isSignMask())
45092 return Op.getOperand(0);
45093 break;
45094 case X86ISD::PCMPGT:
45095 // icmp sgt(0, R) == ashr(R, BitWidth-1).
45096 // iff we only need the sign bit then we can use R directly.
45097 if (DemandedBits.isSignMask() &&
45098 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
45099 return Op.getOperand(1);
45100 break;
45101 case X86ISD::BLENDV: {
45102 // BLENDV: Cond (MSB) ? LHS : RHS
45103 SDValue Cond = Op.getOperand(0);
45104 SDValue LHS = Op.getOperand(1);
45105 SDValue RHS = Op.getOperand(2);
45106
45107 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
45108 if (CondKnown.isNegative())
45109 return LHS;
45110 if (CondKnown.isNonNegative())
45111 return RHS;
45112 break;
45113 }
45114 case X86ISD::ANDNP: {
45115 // ANDNP = (~LHS & RHS);
45116 SDValue LHS = Op.getOperand(0);
45117 SDValue RHS = Op.getOperand(1);
45118
45119 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
45120 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
45121
45122 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
45123 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
45124 // this context, so return RHS.
45125 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
45126 return RHS;
45127 break;
45128 }
45129 }
45130
45131 APInt ShuffleUndef, ShuffleZero;
45132 SmallVector<int, 16> ShuffleMask;
45133 SmallVector<SDValue, 16> ShuffleOps;
45134 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
45135 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
45136 // If all the demanded elts are from one operand and are inline,
45137 // then we can use the operand directly.
45138 int NumOps = ShuffleOps.size();
45139 if (ShuffleMask.size() == (unsigned)NumElts &&
45140 llvm::all_of(ShuffleOps, [VT](SDValue V) {
45141 return VT.getSizeInBits() == V.getValueSizeInBits();
45142 })) {
45143
45144 if (DemandedElts.isSubsetOf(ShuffleUndef))
45145 return DAG.getUNDEF(VT);
45146 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
45147 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
45148
45149 // Bitmask that indicates which ops have only been accessed 'inline'.
45150 APInt IdentityOp = APInt::getAllOnes(NumOps);
45151 for (int i = 0; i != NumElts; ++i) {
45152 int M = ShuffleMask[i];
45153 if (!DemandedElts[i] || ShuffleUndef[i])
45154 continue;
45155 int OpIdx = M / NumElts;
45156 int EltIdx = M % NumElts;
45157 if (M < 0 || EltIdx != i) {
45158 IdentityOp.clearAllBits();
45159 break;
45160 }
45161 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
45162 if (IdentityOp == 0)
45163 break;
45164 }
45165 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
45166 "Multiple identity shuffles detected");
45167
45168 if (IdentityOp != 0)
45169 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
45170 }
45171 }
45172
45173 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45174 Op, DemandedBits, DemandedElts, DAG, Depth);
45175}
45176
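// Report target nodes whose results are guaranteed not to be undef or poison:
// address wrapper nodes always produce a well-defined value, and the listed
// target shuffles are safe as long as every demanded source element is itself
// guaranteed not to be undef or poison.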
45177bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45178 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45179 bool PoisonOnly, unsigned Depth) const {
45180 unsigned NumElts = DemandedElts.getBitWidth();
45181
45182 switch (Op.getOpcode()) {
45183 case X86ISD::GlobalBaseReg:
45184 case X86ISD::Wrapper:
45185 case X86ISD::WrapperRIP:
45186 return true;
45187 case X86ISD::BLENDI:
45188 case X86ISD::PSHUFD:
45189 case X86ISD::UNPCKL:
45190 case X86ISD::UNPCKH:
45191 case X86ISD::VPERMILPI:
45192 case X86ISD::VPERMV3: {
45193 SmallVector<SDValue, 2> Ops;
45194 SmallVector<int, 8> Mask;
45195 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
45196 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
45197 APInt::getZero(NumElts));
45198 for (auto M : enumerate(Mask)) {
45199 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
45200 continue;
45201 if (M.value() == SM_SentinelUndef)
45202 return false;
45203 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
45204 "Shuffle mask index out of range");
45205 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
45206 }
45207 for (auto Op : enumerate(Ops))
45208 if (!DemandedSrcElts[Op.index()].isZero() &&
45209 !DAG.isGuaranteedNotToBeUndefOrPoison(
45210 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
45211 return false;
45212 return true;
45213 }
45214 break;
45215 }
45216 }
45217 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45218 Op, DemandedElts, DAG, PoisonOnly, Depth);
45219}
45220
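// Report whether a target node can create undef or poison from well-defined
// inputs. The opcodes listed below are known safe: SSE logic ops, modulo-index
// inserts/extracts, saturating multiplies, shifts that tolerate out-of-bounds
// amounts, blends, target shuffles, compares, MOVMSK and GFNI ops, plus the
// VPERMI2VAR/PMADD intrinsic forms.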
45221bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
45222 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45223 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
45224
45225 switch (Op.getOpcode()) {
45226 // SSE bit logic.
45227 case X86ISD::FAND:
45228 case X86ISD::FOR:
45229 case X86ISD::FXOR:
45230 case X86ISD::FANDN:
45231 case X86ISD::ANDNP:
45232 case X86ISD::VPTERNLOG:
45233 return false;
45234 // SSE vector insert/extracts use modulo indices.
45235 case X86ISD::PINSRB:
45236 case X86ISD::PINSRW:
45237 case X86ISD::PEXTRB:
45238 case X86ISD::PEXTRW:
45239 return false;
45240 // SSE vector multiplies are either inbounds or saturate.
45241 case X86ISD::VPMADDUBSW:
45242 case X86ISD::VPMADDWD:
45243 return false;
45244 // SSE vector shifts handle out of bounds shift amounts.
45245 case X86ISD::VSHLI:
45246 case X86ISD::VSRLI:
45247 case X86ISD::VSRAI:
45248 return false;
45249 // SSE blends.
45250 case X86ISD::BLENDI:
45251 case X86ISD::BLENDV:
45252 return false;
45253 // SSE target shuffles.
45254 case X86ISD::PSHUFD:
45255 case X86ISD::UNPCKL:
45256 case X86ISD::UNPCKH:
45257 case X86ISD::VPERMILPI:
45258 case X86ISD::VPERMV3:
45259 return false;
45260 // SSE comparisons handle all icmp/fcmp cases.
45261 // TODO: Add CMPM/MM with test coverage.
45262 case X86ISD::CMPP:
45263 case X86ISD::PCMPEQ:
45264 case X86ISD::PCMPGT:
45265 return false;
45266 // SSE signbit extraction.
45267 case X86ISD::MOVMSK:
45268 return false;
45269 // GFNI instructions.
45270 case X86ISD::GF2P8AFFINEINVQB:
45271 case X86ISD::GF2P8AFFINEQB:
45272 case X86ISD::GF2P8MULB:
45273 return false;
45274 case ISD::INTRINSIC_WO_CHAIN:
45275 switch (Op->getConstantOperandVal(0)) {
45276 case Intrinsic::x86_sse2_pmadd_wd:
45277 case Intrinsic::x86_avx2_pmadd_wd:
45278 case Intrinsic::x86_avx512_pmaddw_d_512:
45279 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
45280 case Intrinsic::x86_avx2_pmadd_ub_sw:
45281 case Intrinsic::x86_avx512_pmaddubs_w_512:
45282 return false;
45283 case Intrinsic::x86_avx512_vpermi2var_d_128:
45284 case Intrinsic::x86_avx512_vpermi2var_d_256:
45285 case Intrinsic::x86_avx512_vpermi2var_d_512:
45286 case Intrinsic::x86_avx512_vpermi2var_hi_128:
45287 case Intrinsic::x86_avx512_vpermi2var_hi_256:
45288 case Intrinsic::x86_avx512_vpermi2var_hi_512:
45289 case Intrinsic::x86_avx512_vpermi2var_pd_128:
45290 case Intrinsic::x86_avx512_vpermi2var_pd_256:
45291 case Intrinsic::x86_avx512_vpermi2var_pd_512:
45292 case Intrinsic::x86_avx512_vpermi2var_ps_128:
45293 case Intrinsic::x86_avx512_vpermi2var_ps_256:
45294 case Intrinsic::x86_avx512_vpermi2var_ps_512:
45295 case Intrinsic::x86_avx512_vpermi2var_q_128:
45296 case Intrinsic::x86_avx512_vpermi2var_q_256:
45297 case Intrinsic::x86_avx512_vpermi2var_q_512:
45298 case Intrinsic::x86_avx512_vpermi2var_qi_128:
45299 case Intrinsic::x86_avx512_vpermi2var_qi_256:
45300 case Intrinsic::x86_avx512_vpermi2var_qi_512:
45301 return false;
45302 }
45303 }
45304 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
45305 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
45306}
45307
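// Broadcasts splat a single (possibly loaded) scalar to every vector element,
// so all demanded elements are the same value and no element is undef.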
45308bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
45309 const APInt &DemandedElts,
45310 APInt &UndefElts,
45311 const SelectionDAG &DAG,
45312 unsigned Depth) const {
45313 unsigned NumElts = DemandedElts.getBitWidth();
45314 unsigned Opc = Op.getOpcode();
45315
45316 switch (Opc) {
45317 case X86ISD::VBROADCAST:
45318 case X86ISD::VBROADCAST_LOAD:
45319 UndefElts = APInt::getZero(NumElts);
45320 return true;
45321 }
45322
45323 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
45324 DAG, Depth);
45325}
45326
45327// Helper to peek through bitops/trunc/setcc to determine size of source vector.
45328// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
45329static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
45330 bool AllowTruncate, unsigned Depth) {
45331 // Limit recursion.
45332 if (Depth >= SelectionDAG::MaxRecursionDepth)
45333 return false;
45334 switch (Src.getOpcode()) {
45335 case ISD::TRUNCATE:
45336 if (!AllowTruncate)
45337 return false;
45338 [[fallthrough]];
45339 case ISD::SETCC:
45340 return Src.getOperand(0).getValueSizeInBits() == Size;
45341 case ISD::FREEZE:
45342 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45343 Depth + 1);
45344 case ISD::AND:
45345 case ISD::XOR:
45346 case ISD::OR:
45347 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45348 Depth + 1) &&
45349 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45350 Depth + 1);
45351 case ISD::SELECT:
45352 case ISD::VSELECT:
45353 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
45354 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45355 Depth + 1) &&
45356 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate,
45357 Depth + 1);
45358 case ISD::BUILD_VECTOR:
45359 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
45360 ISD::isBuildVectorAllOnes(Src.getNode());
45361 }
45362 return false;
45363}
45364
45365// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
45366static unsigned getAltBitOpcode(unsigned Opcode) {
45367 switch(Opcode) {
45368 // clang-format off
45369 case ISD::AND: return X86ISD::FAND;
45370 case ISD::OR: return X86ISD::FOR;
45371 case ISD::XOR: return X86ISD::FXOR;
45372 case X86ISD::ANDNP: return X86ISD::FANDN;
45373 // clang-format on
45374 }
45375 llvm_unreachable("Unknown bitwise opcode");
45376}
45377
45378// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
45379static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
45380 const SDLoc &DL) {
45381 EVT SrcVT = Src.getValueType();
45382 if (SrcVT != MVT::v4i1)
45383 return SDValue();
45384
45385 switch (Src.getOpcode()) {
45386 case ISD::SETCC:
45387 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
45388 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
45389 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
45390 SDValue Op0 = Src.getOperand(0);
45391 if (ISD::isNormalLoad(Op0.getNode()))
45392 return DAG.getBitcast(MVT::v4f32, Op0);
45393 if (Op0.getOpcode() == ISD::BITCAST &&
45394 Op0.getOperand(0).getValueType() == MVT::v4f32)
45395 return Op0.getOperand(0);
45396 }
45397 break;
45398 case ISD::AND:
45399 case ISD::XOR:
45400 case ISD::OR: {
45401 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
45402 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
45403 if (Op0 && Op1)
45404 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
45405 Op1);
45406 break;
45407 }
45408 }
45409 return SDValue();
45410}
45411
45412// Helper to push sign extension of vXi1 SETCC result through bitops.
45413static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
45414 SDValue Src, const SDLoc &DL) {
45415 switch (Src.getOpcode()) {
45416 case ISD::SETCC:
45417 case ISD::FREEZE:
45418 case ISD::TRUNCATE:
45419 case ISD::BUILD_VECTOR:
45420 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45421 case ISD::AND:
45422 case ISD::XOR:
45423 case ISD::OR:
45424 return DAG.getNode(
45425 Src.getOpcode(), DL, SExtVT,
45426 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
45427 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
45428 case ISD::SELECT:
45429 case ISD::VSELECT:
45430 return DAG.getSelect(
45431 DL, SExtVT, Src.getOperand(0),
45432 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
45433 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
45434 }
45435 llvm_unreachable("Unexpected node type for vXi1 sign extension");
45436}
45437
45438// Try to match patterns such as
45439// (i16 bitcast (v16i1 x))
45440// ->
45441// (i16 movmsk (v16i8 sext (v16i1 x)))
45442// before the illegal vector is scalarized on subtargets that don't have legal
45443// vxi1 types.
45444static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
45445 const SDLoc &DL,
45446 const X86Subtarget &Subtarget) {
45447 EVT SrcVT = Src.getValueType();
45448 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
45449 return SDValue();
45450
45451 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
45452 // legalization destroys the v4i32 type.
45453 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
45454 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
45455 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
45456 DAG.getBitcast(MVT::v4f32, V));
45457 return DAG.getZExtOrTrunc(V, DL, VT);
45458 }
45459 }
45460
45461 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
45462 // movmskb even with avx512. This will be better than truncating to vXi1 and
45463 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
45464 // vpcmpeqb/vpcmpgtb.
45465 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
45466 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
45467 Src.getOperand(0).getValueType() == MVT::v32i8 ||
45468 Src.getOperand(0).getValueType() == MVT::v64i8);
45469
45470 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
45471 // directly with vpmovmskb/vmovmskps/vmovmskpd.
45472 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
45473 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
45474 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
45475 EVT CmpVT = Src.getOperand(0).getValueType();
45476 EVT EltVT = CmpVT.getVectorElementType();
45477 if (CmpVT.getSizeInBits() <= 256 &&
45478 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
45479 PreferMovMsk = true;
45480 }
45481
45482 // With AVX512 vxi1 types are legal and we prefer using k-regs.
45483 // MOVMSK is supported in SSE2 or later.
45484 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
45485 return SDValue();
45486
45487 // If the upper ops of a concatenation are undef, then try to bitcast the
45488 // lower op and extend.
45489 SmallVector<SDValue, 4> SubSrcOps;
45490 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
45491 SubSrcOps.size() >= 2) {
45492 SDValue LowerOp = SubSrcOps[0];
45493 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
45494 if (LowerOp.getOpcode() == ISD::SETCC &&
45495 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
45496 EVT SubVT = VT.getIntegerVT(
45497 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
45498 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
45499 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
45500 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
45501 }
45502 }
45503 }
45504
45505 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
45506 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
45507 // v8i16 and v16i16.
45508 // For these two cases, we can shuffle the upper element bytes to a
45509 // consecutive sequence at the start of the vector and treat the results as
45510 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
45511 // for v16i16 this is not the case, because the shuffle is expensive, so we
45512 // avoid sign-extending to this type entirely.
45513 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
45514 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
45515 MVT SExtVT;
45516 bool PropagateSExt = false;
45517 switch (SrcVT.getSimpleVT().SimpleTy) {
45518 default:
45519 return SDValue();
45520 case MVT::v2i1:
45521 SExtVT = MVT::v2i64;
45522 break;
45523 case MVT::v4i1:
45524 SExtVT = MVT::v4i32;
45525 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
45526 // sign-extend to a 256-bit operation to avoid truncation.
45527 if (Subtarget.hasAVX() &&
45528 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2(), 0)) {
45529 SExtVT = MVT::v4i64;
45530 PropagateSExt = true;
45531 }
45532 break;
45533 case MVT::v8i1:
45534 SExtVT = MVT::v8i16;
45535 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
45536 // sign-extend to a 256-bit operation to match the compare.
45537 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
45538 // 256-bit because the shuffle is cheaper than sign extending the result of
45539 // the compare.
45540 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true, 0) ||
45541 checkBitcastSrcVectorSize(Src, 512, true, 0))) {
45542 SExtVT = MVT::v8i32;
45543 PropagateSExt = true;
45544 }
45545 break;
45546 case MVT::v16i1:
45547 SExtVT = MVT::v16i8;
45548 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
45549 // it is not profitable to sign-extend to 256-bit because this will
45550 // require an extra cross-lane shuffle which is more expensive than
45551 // truncating the result of the compare to 128-bits.
45552 break;
45553 case MVT::v32i1:
45554 SExtVT = MVT::v32i8;
45555 break;
45556 case MVT::v64i1:
45557 // If we have AVX512F but not AVX512BW, and the input was truncated from
45558 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
45559 if (Subtarget.hasAVX512()) {
45560 if (Subtarget.hasBWI())
45561 return SDValue();
45562 SExtVT = MVT::v64i8;
45563 break;
45564 }
45565 // Split if this is a <64 x i8> comparison result.
45566 if (checkBitcastSrcVectorSize(Src, 512, false, 0)) {
45567 SExtVT = MVT::v64i8;
45568 break;
45569 }
45570 return SDValue();
45571 };
45572
45573 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
45574 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45575
45576 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
45577 V = getPMOVMSKB(DL, V, DAG, Subtarget);
45578 } else {
45579 if (SExtVT == MVT::v8i16) {
45580 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
45581 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
45582 }
45583 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
45584 }
45585
45586 EVT IntVT =
45587 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
45588 V = DAG.getZExtOrTrunc(V, DL, IntVT);
45589 return DAG.getBitcast(VT, V);
45590}
45591
45592// Convert a vXi1 constant build vector to the same width scalar integer.
45593static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
45594 EVT SrcVT = Op.getValueType();
45595 assert(SrcVT.getVectorElementType() == MVT::i1 &&
45596 "Expected a vXi1 vector");
45598 "Expected a constant build vector");
45599
45600 APInt Imm(SrcVT.getVectorNumElements(), 0);
45601 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
45602 SDValue In = Op.getOperand(Idx);
45603 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
45604 Imm.setBit(Idx);
45605 }
45606 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
45607 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
45608}
45609
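// Look for a bitcast between a scalar integer and a vXi1 mask that wraps a
// bitwise logic op, and perform the logic in the destination type instead so
// that at least one of the bitcasts folds away. On AVX512 this avoids
// unnecessary GPR <-> k-register transfers.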
45610static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
45611 TargetLowering::DAGCombinerInfo &DCI,
45612 const X86Subtarget &Subtarget) {
45613 using namespace SDPatternMatch;
45614 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
45615
45616 if (!DCI.isBeforeLegalizeOps())
45617 return SDValue();
45618
45619 // Only do this if we have k-registers.
45620 if (!Subtarget.hasAVX512())
45621 return SDValue();
45622
45623 EVT DstVT = N->getValueType(0);
45624 SDValue Op = N->getOperand(0);
45625 EVT SrcVT = Op.getValueType();
45626
45627 // Make sure we have a bitcast between mask registers and a scalar type.
45628 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45629 DstVT.isScalarInteger()) &&
45630 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
45631 SrcVT.isScalarInteger()))
45632 return SDValue();
45633
45634 SDValue LHS, RHS;
45635
45636 // Look for logic ops.
45638 return SDValue();
45639
45640 // If either operand was bitcast from DstVT, then perform logic with DstVT (at
45641 // least one of the getBitcast() will fold away).
45642 if (sd_match(LHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))) ||
45643 sd_match(RHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))))
45644 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45645 DAG.getBitcast(DstVT, LHS), DAG.getBitcast(DstVT, RHS));
45646
45647 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
45648 // Most of these have to move a constant from the scalar domain anyway.
45649 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
45650 RHS = combinevXi1ConstantToInteger(RHS, DAG);
45651 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45652 DAG.getBitcast(DstVT, LHS), RHS);
45653 }
45654
45655 return SDValue();
45656}
45657
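// Lower a 64-bit build vector to x86mmx: each element is moved into an MMX
// register (MMX_MOVW2D for integers, MOVDQ2Q for SSE floats) and the elements
// are then merged with a tree of PUNPCKL intrinsics; splats instead use
// (PUNPCKLBW+)PSHUFW to broadcast the lowest element when SSE1 is available.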
45658static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
45659 const X86Subtarget &Subtarget) {
45660 SDLoc DL(BV);
45661 unsigned NumElts = BV->getNumOperands();
45662 SDValue Splat = BV->getSplatValue();
45663
45664 // Build MMX element from integer GPR or SSE float values.
45665 auto CreateMMXElement = [&](SDValue V) {
45666 if (V.isUndef())
45667 return DAG.getUNDEF(MVT::x86mmx);
45668 if (V.getValueType().isFloatingPoint()) {
45669 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
45670 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
45671 V = DAG.getBitcast(MVT::v2i64, V);
45672 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
45673 }
45674 V = DAG.getBitcast(MVT::i32, V);
45675 } else {
45676 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
45677 }
45678 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
45679 };
45680
45681 // Convert build vector ops to MMX data in the bottom elements.
45682 SmallVector<SDValue, 8> Ops;
45683
45684 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45685
45686 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
45687 if (Splat) {
45688 if (Splat.isUndef())
45689 return DAG.getUNDEF(MVT::x86mmx);
45690
45691 Splat = CreateMMXElement(Splat);
45692
45693 if (Subtarget.hasSSE1()) {
45694 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
45695 if (NumElts == 8)
45696 Splat = DAG.getNode(
45697 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45698 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
45699 TLI.getPointerTy(DAG.getDataLayout())),
45700 Splat, Splat);
45701
45702 // Use PSHUFW to repeat 16-bit elements.
45703 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
45704 return DAG.getNode(
45705 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45706 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
45707 TLI.getPointerTy(DAG.getDataLayout())),
45708 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
45709 }
45710 Ops.append(NumElts, Splat);
45711 } else {
45712 for (unsigned i = 0; i != NumElts; ++i)
45713 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
45714 }
45715
45716 // Use tree of PUNPCKLs to build up general MMX vector.
45717 while (Ops.size() > 1) {
45718 unsigned NumOps = Ops.size();
45719 unsigned IntrinOp =
45720 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
45721 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
45722 : Intrinsic::x86_mmx_punpcklbw));
45723 SDValue Intrin = DAG.getTargetConstant(
45724 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
45725 for (unsigned i = 0; i != NumOps; i += 2)
45726 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
45727 Ops[i], Ops[i + 1]);
45728 Ops.resize(NumOps / 2);
45729 }
45730
45731 return Ops[0];
45732}
45733
45734// Recursive function that attempts to find if a bool vector node was originally
45735// a vector/float/double that got truncated/extended/bitcast to/from a scalar
45736// integer. If so, replace the scalar ops with bool vector equivalents back down
45737// the chain.
45738static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
45739 SelectionDAG &DAG,
45740 const X86Subtarget &Subtarget,
45741 unsigned Depth = 0) {
45742 if (Depth >= SelectionDAG::MaxRecursionDepth)
45743 return SDValue(); // Limit search depth.
45744
45745 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45746 unsigned Opc = V.getOpcode();
45747 switch (Opc) {
45748 case ISD::BITCAST: {
45749 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
45750 SDValue Src = V.getOperand(0);
45751 EVT SrcVT = Src.getValueType();
45752 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
45753 return DAG.getBitcast(VT, Src);
45754 break;
45755 }
45756 case ISD::Constant: {
45757 auto *C = cast<ConstantSDNode>(V);
45758 if (C->isZero())
45759 return DAG.getConstant(0, DL, VT);
45760 if (C->isAllOnes())
45761 return DAG.getAllOnesConstant(DL, VT);
45762 break;
45763 }
45764 case ISD::TRUNCATE: {
45765 // If we find a suitable source, a truncated scalar becomes a subvector.
45766 SDValue Src = V.getOperand(0);
45767 EVT NewSrcVT =
45768 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
45769 if (TLI.isTypeLegal(NewSrcVT))
45770 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45771 Subtarget, Depth + 1))
45772 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
45773 DAG.getVectorIdxConstant(0, DL));
45774 break;
45775 }
45776 case ISD::ANY_EXTEND:
45777 case ISD::ZERO_EXTEND: {
45778 // If we find a suitable source, an extended scalar becomes a subvector.
45779 SDValue Src = V.getOperand(0);
45780 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
45781 Src.getScalarValueSizeInBits());
45782 if (TLI.isTypeLegal(NewSrcVT))
45783 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45784 Subtarget, Depth + 1))
45785 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
45786 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
45787 : DAG.getConstant(0, DL, VT),
45788 N0, DAG.getVectorIdxConstant(0, DL));
45789 break;
45790 }
45791 case ISD::OR:
45792 case ISD::XOR: {
45793 // If we find suitable sources, we can just move the op to the vector
45794 // domain.
45795 if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
45796 Subtarget, Depth + 1))
45797 if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
45798 Subtarget, Depth + 1))
45799 return DAG.getNode(Opc, DL, VT, N0, N1);
45800 break;
45801 }
45802 case ISD::SHL: {
45803 // If we find a suitable source, a SHL becomes a KSHIFTL.
45804 SDValue Src0 = V.getOperand(0);
45805 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
45806 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
45807 break;
45808
45809 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
45810 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
45811 Depth + 1))
45812 return DAG.getNode(
45813 X86ISD::KSHIFTL, DL, VT, N0,
45814 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
45815 break;
45816 }
45817 }
45818
45819 // Does the inner bitcast already exist?
45820 if (Depth > 0)
45821 if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
45822 return SDValue(Alt, 0);
45823
45824 return SDValue();
45825}
45826
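// Combine BITCAST nodes: turn vXi1 <-> scalar mask casts into MOVMSK or
// k-register sign-compares, widen illegal v2i1/v4i1 casts, handle x86mmx
// build-vector/extract/constant special cases, and fold integer logic with a
// bitcast floating-point operand into the corresponding FP logic node.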
45827static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
45828 TargetLowering::DAGCombinerInfo &DCI,
45829 const X86Subtarget &Subtarget) {
45830 SDValue N0 = N->getOperand(0);
45831 EVT VT = N->getValueType(0);
45832 EVT SrcVT = N0.getValueType();
45833 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45834
45835 // Try to match patterns such as
45836 // (i16 bitcast (v16i1 x))
45837 // ->
45838 // (i16 movmsk (v16i8 sext (v16i1 x)))
45839 // before the setcc result is scalarized on subtargets that don't have legal
45840 // vxi1 types.
45841 if (DCI.isBeforeLegalize()) {
45842 SDLoc dl(N);
45843 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
45844 return V;
45845
45846 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45847 // type, widen both sides to avoid a trip through memory.
45848 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
45849 Subtarget.hasAVX512()) {
45850 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
45851 N0 = DAG.getBitcast(MVT::v8i1, N0);
45852 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
45853 DAG.getVectorIdxConstant(0, dl));
45854 }
45855
45856 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45857 // type, widen both sides to avoid a trip through memory.
45858 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
45859 Subtarget.hasAVX512()) {
45860 // Use zeros for the widening if we already have some zeroes. This can
45861 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
45862 // stream of this.
45863 // FIXME: It might make sense to detect a concat_vectors with a mix of
45864 // zeroes and undef and turn it into insert_subvector for i1 vectors as
45865 // a separate combine. What we can't do is canonicalize the operands of
45866 // such a concat or we'll get into a loop with SimplifyDemandedBits.
45867 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
45868 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
45869 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
45870 SrcVT = LastOp.getValueType();
45871 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45872 SmallVector<SDValue, 4> Ops(N0->ops());
45873 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
45874 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45875 N0 = DAG.getBitcast(MVT::i8, N0);
45876 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45877 }
45878 }
45879
45880 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45881 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
45882 Ops[0] = N0;
45883 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45884 N0 = DAG.getBitcast(MVT::i8, N0);
45885 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45886 }
45887 } else if (DCI.isAfterLegalizeDAG()) {
45888 // If we're bitcasting from iX to vXi1, see if the integer originally
45889 // began as a vXi1 and whether we can remove the bitcast entirely.
45890 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
45891 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
45892 if (SDValue V =
45893 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
45894 return V;
45895 }
45896 }
45897
45898 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
45899 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
45900 // due to insert_subvector legalization on KNL. By promoting the copy to i16
45901 // we can help with known bits propagation from the vXi1 domain to the
45902 // scalar domain.
45903 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
45904 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45905 N0.getOperand(0).getValueType() == MVT::v16i1 &&
45906 isNullConstant(N0.getOperand(1)))
45907 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
45908 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
45909
45910 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
45911 // and the vbroadcast_load are both integer or both fp. In some cases this
45912 // will remove the bitcast entirely.
45913 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
45914 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
45915 auto *BCast = cast<MemIntrinsicSDNode>(N0);
45916 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
45917 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45918 // Don't swap i8/i16 since don't have fp types that size.
45919 if (MemSize >= 32) {
45920 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
45921 : MVT::getIntegerVT(MemSize);
45922 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
45923 : MVT::getIntegerVT(SrcVTSize);
45924 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
45925
45926 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
45927 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45928 SDValue ResNode =
45929 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
45930 MemVT, BCast->getMemOperand());
45931 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
45932 return DAG.getBitcast(VT, ResNode);
45933 }
45934 }
45935
45936 // Attempt to peek through f16 bitcasted extractions hidden by truncation.
45937 if (VT == MVT::f16 && SrcVT == MVT::i16) {
45938 SDValue Src = peekThroughTruncates(N0);
45939 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45940 Src.getOperand(0).getValueSizeInBits() == 128 &&
45941 isNullConstant(Src.getOperand(1))) {
45942 SDLoc DL(N);
45943 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45944 DAG.getBitcast(MVT::v8f16, Src.getOperand(0)),
45945 DAG.getVectorIdxConstant(0, DL));
45946 }
45947 }
45948
45949 // Since MMX types are special and don't usually play with other vector types,
45950 // it's better to handle them early to be sure we emit efficient code by
45951 // avoiding store-load conversions.
45952 if (VT == MVT::x86mmx) {
45953 // Detect MMX constant vectors.
45954 APInt UndefElts;
45955 SmallVector<APInt, 1> EltBits;
45956 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
45957 /*AllowWholeUndefs*/ true,
45958 /*AllowPartialUndefs*/ true)) {
45959 SDLoc DL(N0);
45960 // Handle zero-extension of i32 with MOVD.
45961 if (EltBits[0].countl_zero() >= 32)
45962 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
45963 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
45964 // Else, bitcast to a double.
45965 // TODO - investigate supporting sext 32-bit immediates on x86_64.
45966 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
45967 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
45968 }
45969
45970 // Detect bitcasts to x86mmx low word.
45971 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45972 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
45973 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
45974 bool LowUndef = true, AllUndefOrZero = true;
45975 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
45976 SDValue Op = N0.getOperand(i);
45977 LowUndef &= Op.isUndef() || (i >= e/2);
45978 AllUndefOrZero &= isNullConstantOrUndef(Op);
45979 }
45980 if (AllUndefOrZero) {
45981 SDValue N00 = N0.getOperand(0);
45982 SDLoc dl(N00);
45983 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
45984 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
45985 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
45986 }
45987 }
45988
45989 // Detect bitcasts of 64-bit build vectors and convert to a
45990 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
45991 // lowest element.
45992 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45993 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
45994 SrcVT == MVT::v8i8))
45995 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
45996
45997 // Detect bitcasts between element or subvector extraction to x86mmx.
45998 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
45999 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
46000 isNullConstant(N0.getOperand(1))) {
46001 SDValue N00 = N0.getOperand(0);
46002 if (N00.getValueType().is128BitVector())
46003 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
46004 DAG.getBitcast(MVT::v2i64, N00));
46005 }
46006
46007 // Detect bitcasts from FP_TO_SINT to x86mmx.
46008 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
46009 SDLoc DL(N0);
46010 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
46011 DAG.getUNDEF(MVT::v2i32));
46012 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
46013 DAG.getBitcast(MVT::v2i64, Res));
46014 }
46015 }
46016
46017 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
46018 // most of these to scalar anyway.
46019 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
46020 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
46021 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
46022 return combinevXi1ConstantToInteger(N0, DAG);
46023 }
46024
46025 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
46026 VT.getVectorElementType() == MVT::i1) {
46027 if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
46028 if (C->isAllOnes())
46029 return DAG.getConstant(1, SDLoc(N0), VT);
46030 if (C->isZero())
46031 return DAG.getConstant(0, SDLoc(N0), VT);
46032 }
46033 }
46034
46035 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
46036 // Turn it into a sign bit compare that produces a k-register. This avoids
46037 // a trip through a GPR.
46038 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
46039 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
46040 isPowerOf2_32(VT.getVectorNumElements())) {
46041 unsigned NumElts = VT.getVectorNumElements();
46042 SDValue Src = N0;
46043
46044 // Peek through truncate.
46045 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
46046 Src = N0.getOperand(0);
46047
46048 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
46049 SDValue MovmskIn = Src.getOperand(0);
46050 MVT MovmskVT = MovmskIn.getSimpleValueType();
46051 unsigned MovMskElts = MovmskVT.getVectorNumElements();
46052
46053 // We allow extra bits of the movmsk to be used since they are known zero.
46054 // We can't convert a VPMOVMSKB without avx512bw.
46055 if (MovMskElts <= NumElts &&
46056 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
46057 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
46058 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
46059 SDLoc dl(N);
46060 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
46061 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
46062 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
46063 if (EVT(CmpVT) == VT)
46064 return Cmp;
46065
46066 // Pad with zeroes up to original VT to replace the zeroes that were
46067 // being used from the MOVMSK.
46068 unsigned NumConcats = NumElts / MovMskElts;
46069 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
46070 Ops[0] = Cmp;
46071 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
46072 }
46073 }
46074 }
46075
46076 // Try to remove bitcasts from input and output of mask arithmetic to
46077 // remove GPR<->K-register crossings.
46078 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
46079 return V;
46080
46081 // bitcast(v1Ty insert_vector_elt(X, Y, 0)) --> Y
46082 if (N0.getOpcode() == ISD::INSERT_VECTOR_ELT && SrcVT.getScalarType() == VT &&
46083 SrcVT.getVectorNumElements() == 1)
46084 return N0.getOperand(1);
46085
46086 // Convert a bitcasted integer logic operation that has one bitcasted
46087 // floating-point operand into a floating-point logic operation. This may
46088 // create a load of a constant, but that is cheaper than materializing the
46089 // constant in an integer register and transferring it to an SSE register or
46090 // transferring the SSE operand to integer register and back.
46091 unsigned FPOpcode;
46092 switch (N0.getOpcode()) {
46093 // clang-format off
46094 case ISD::AND: FPOpcode = X86ISD::FAND; break;
46095 case ISD::OR: FPOpcode = X86ISD::FOR; break;
46096 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
46097 default: return SDValue();
46098 // clang-format on
46099 }
46100
46101 // Check if we have a bitcast from another integer type as well.
46102 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
46103 (Subtarget.hasSSE2() && VT == MVT::f64) ||
46104 (Subtarget.hasFP16() && VT == MVT::f16) ||
46105 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
46106 TLI.isTypeLegal(VT))))
46107 return SDValue();
46108
46109 SDValue LogicOp0 = N0.getOperand(0);
46110 SDValue LogicOp1 = N0.getOperand(1);
46111 SDLoc DL0(N0);
46112
46113 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
46114 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
46115 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
46116 LogicOp0.getOperand(0).getValueType() == VT &&
46117 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
46118 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
46119 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46120 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
46121 }
46122 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
46123 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
46124 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
46125 LogicOp1.getOperand(0).getValueType() == VT &&
46126 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
46127 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
46128 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46129 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
46130 }
46131
46132 return SDValue();
46133}
46134
46135// (mul (zext a), (sext b))
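// Match a multiply of a zero-extended (unsigned, <= 8 significant bits) value
// with a sign-extended (signed, <= 8 significant bits) value, as accepted by
// VPDPBUSD. On success, Op0 holds the unsigned operand and Op1 the signed one.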
46136static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
46137 SDValue &Op1) {
46138 Op0 = Mul.getOperand(0);
46139 Op1 = Mul.getOperand(1);
46140
46141 // Operand 1 should be the sign-extended value.
46142 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
46143 std::swap(Op0, Op1);
46144
46145 auto IsFreeTruncation = [](SDValue &Op) -> bool {
46146 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
46147 Op.getOpcode() == ISD::SIGN_EXTEND) &&
46148 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
46149 return true;
46150
46151 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
46152 return (BV && BV->isConstant());
46153 };
46154
46155 // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
46156 // value, we need to check that Op0 is a zero-extended value. Op1 should be a
46157 // signed value, so we just check its significant (sign) bits.
46158 if ((IsFreeTruncation(Op0) &&
46159 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
46160 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
46161 return true;
46162
46163 return false;
46164}
46165
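// Build a VPDPBUSD dot product from an unsigned LHS and a signed RHS, both
// representable as vXi8. The sources are widened to the required VNNI register
// size by concatenating zero vectors, and LogBias is set to 2 because each
// VPDPBUSD lane already accumulates four i8 products into an i32 lane.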
46166static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
46167 unsigned &LogBias, const SDLoc &DL,
46168 const X86Subtarget &Subtarget) {
46169 // Extend or truncate to MVT::i8 first.
46170 MVT Vi8VT =
46171 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
46172 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
46173 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
46174
46175 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
46176 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
46177 // The src A, B element type is i8, but the dst C element type is i32.
46178 // When we calculate the number of reduction stages we use the vXi8 source
46179 // vector type, so we need a LogBias of 2 to avoid 2 extra stages.
46180 LogBias = 2;
46181
46182 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
46183 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
46184 RegSize = std::max(512u, RegSize);
46185
46186 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
46187 // fill in the missing vector elements with 0.
46188 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
46189 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
46190 Ops[0] = LHS;
46191 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46192 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46193 Ops[0] = RHS;
46194 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46195
46196 // Actually build the DotProduct, split as 256/512 bits for
46197 // AVXVNNI/AVX512VNNI.
46198 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46199 ArrayRef<SDValue> Ops) {
46200 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
46201 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
46202 };
46203 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
46204 SDValue Zero = DAG.getConstant(0, DL, DpVT);
46205
46206 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
46207 DpBuilder, false);
46208}
46209
46210// Create a PSADBW given two sources representable as zexts of vXi8.
46211static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1,
46212 const SDLoc &DL, const X86Subtarget &Subtarget) {
46213 // Find the appropriate width for the PSADBW.
46214 EVT DstVT = N0.getValueType();
46215 EVT SrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8,
46216 DstVT.getVectorElementCount());
46217 unsigned RegSize = std::max(128u, (unsigned)SrcVT.getSizeInBits());
46218
46219 // Widen the vXi8 vectors, padding with zero vector elements.
46220 unsigned NumConcat = RegSize / SrcVT.getSizeInBits();
46221 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, SrcVT));
46222 Ops[0] = DAG.getZExtOrTrunc(N0, DL, SrcVT);
46223 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46224 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46225 Ops[0] = DAG.getZExtOrTrunc(N1, DL, SrcVT);
46226 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46227
46228 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
46229 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46230 ArrayRef<SDValue> Ops) {
46231 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
46232 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
46233 };
46234 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
46235 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {SadOp0, SadOp1},
46236 PSADBWBuilder);
46237}
46238
46239// Attempt to replace an min/max v8i16/v16i8 horizontal reduction with
46240// PHMINPOSUW.
46241static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
46242 const X86Subtarget &Subtarget) {
46243 // Bail without SSE41.
46244 if (!Subtarget.hasSSE41())
46245 return SDValue();
46246
46247 EVT ExtractVT = Extract->getValueType(0);
46248 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
46249 return SDValue();
46250
46251 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
46252 ISD::NodeType BinOp;
46253 SDValue Src = DAG.matchBinOpReduction(
46254 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
46255 if (!Src)
46256 return SDValue();
46257
46258 EVT SrcVT = Src.getValueType();
46259 EVT SrcSVT = SrcVT.getScalarType();
46260 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
46261 return SDValue();
46262
46263 SDLoc DL(Extract);
46264 SDValue MinPos = Src;
46265
46266 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
46267 while (SrcVT.getSizeInBits() > 128) {
46268 SDValue Lo, Hi;
46269 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
46270 SrcVT = Lo.getValueType();
46271 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
46272 }
46273 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
46274 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
46275 "Unexpected value type");
46276
46277 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
46278 // to flip the value accordingly.
46279 SDValue Mask;
46280 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
46281 if (BinOp == ISD::SMAX)
46282 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
46283 else if (BinOp == ISD::SMIN)
46284 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
46285 else if (BinOp == ISD::UMAX)
46286 Mask = DAG.getAllOnesConstant(DL, SrcVT);
46287
46288 if (Mask)
46289 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46290
46291 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
46292 // shuffling each upper element down and inserting zeros. This means that the
46293 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
46294 // ready for the PHMINPOS.
46295 if (ExtractVT == MVT::i8) {
46296 SDValue Upper = DAG.getVectorShuffle(
46297 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
46298 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
46299 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
46300 }
46301
46302 // Perform the PHMINPOS on a v8i16 vector.
46303 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
46304 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
46305 MinPos = DAG.getBitcast(SrcVT, MinPos);
46306
46307 if (Mask)
46308 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46309
46310 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
46311 DAG.getVectorIdxConstant(0, DL));
46312}
46313
46314// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
46315 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
46316 const X86Subtarget &Subtarget) {
46317 // Bail without SSE2.
46318 if (!Subtarget.hasSSE2())
46319 return SDValue();
46320
46321 EVT ExtractVT = Extract->getValueType(0);
46322 unsigned BitWidth = ExtractVT.getSizeInBits();
46323 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
46324 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
46325 return SDValue();
46326
46327 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
46328 ISD::NodeType BinOp;
46329 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
46330 if (!Match && ExtractVT == MVT::i1)
46331 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
46332 if (!Match)
46333 return SDValue();
46334
46335 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
46336 // which we can't support here for now.
46337 if (Match.getScalarValueSizeInBits() != BitWidth)
46338 return SDValue();
46339
46340 SDValue Movmsk;
46341 SDLoc DL(Extract);
46342 EVT MatchVT = Match.getValueType();
46343 unsigned NumElts = MatchVT.getVectorNumElements();
46344 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
46345 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46346 LLVMContext &Ctx = *DAG.getContext();
46347
46348 if (ExtractVT == MVT::i1) {
46349 // Special case for (pre-legalization) vXi1 reductions.
46350 if (NumElts > 64 || !isPowerOf2_32(NumElts))
46351 return SDValue();
46352 if (Match.getOpcode() == ISD::SETCC) {
46353 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
46354 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
46355 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
46356 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
46357 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
46358 X86::CondCode X86CC;
46359 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
46360 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
46361 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
46362 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
46363 DAG, X86CC))
46364 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
46365 getSETCC(X86CC, V, DL, DAG));
46366 }
46367 }
46368 if (TLI.isTypeLegal(MatchVT)) {
46369 // If this is a legal AVX512 predicate type then we can just bitcast.
46370 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46371 Movmsk = DAG.getBitcast(MovmskVT, Match);
46372 } else {
46373 // Use combineBitcastvxi1 to create the MOVMSK.
46374 while (NumElts > MaxElts) {
46375 SDValue Lo, Hi;
46376 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46377 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46378 NumElts /= 2;
46379 }
46380 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46381 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
46382 }
46383 if (!Movmsk)
46384 return SDValue();
46385 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
46386 } else {
46387 // FIXME: Better handling of k-registers or 512-bit vectors?
46388 unsigned MatchSizeInBits = Match.getValueSizeInBits();
46389 if (!(MatchSizeInBits == 128 ||
46390 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
46391 return SDValue();
46392
46393 // Make sure this isn't a vector of 1 element. The perf win from using
46394 // MOVMSK diminishes with fewer elements in the reduction, but it is
46395 // generally better to get the comparison over to the GPRs as soon as
46396 // possible to reduce the number of vector ops.
46397 if (Match.getValueType().getVectorNumElements() < 2)
46398 return SDValue();
46399
46400 // Check that we are extracting a reduction of all sign bits.
46401 if (DAG.ComputeNumSignBits(Match) != BitWidth)
46402 return SDValue();
46403
46404 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
46405 SDValue Lo, Hi;
46406 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46407 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46408 MatchSizeInBits = Match.getValueSizeInBits();
46409 }
46410
46411 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
46412 MVT MaskSrcVT;
46413 if (64 == BitWidth || 32 == BitWidth)
46414 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
46415 MatchSizeInBits / BitWidth);
46416 else
46417 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
46418
46419 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
46420 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
46421 NumElts = MaskSrcVT.getVectorNumElements();
46422 }
46423 assert((NumElts <= 32 || NumElts == 64) &&
46424 "Not expecting more than 64 elements");
46425
46426 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
46427 if (BinOp == ISD::XOR) {
46428 // parity -> (PARITY(MOVMSK X))
46429 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
46430 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
46431 }
46432
46433 SDValue CmpC;
46434 ISD::CondCode CondCode;
46435 if (BinOp == ISD::OR) {
46436 // any_of -> MOVMSK != 0
46437 CmpC = DAG.getConstant(0, DL, CmpVT);
46438 CondCode = ISD::CondCode::SETNE;
46439 } else {
46440 // all_of -> MOVMSK == ((1 << NumElts) - 1)
46441 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
46442 DL, CmpVT);
46443 CondCode = ISD::CondCode::SETEQ;
46444 }
46445
46446 // The setcc produces an i8 of 0/1, so extend that to the result width and
46447 // negate to get the final 0/-1 mask value.
46448 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
46449 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
46450 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
46451 return DAG.getNegative(Zext, DL, ExtractVT);
46452}
46453
46454 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
46455 const X86Subtarget &Subtarget) {
46456 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
46457 return SDValue();
46458
46459 EVT ExtractVT = Extract->getValueType(0);
46460 // Verify the type we're extracting is i32, as the output element type of
46461 // vpdpbusd is i32.
46462 if (ExtractVT != MVT::i32)
46463 return SDValue();
46464
46465 EVT VT = Extract->getOperand(0).getValueType();
46466 if (!isPowerOf2_32(VT.getVectorNumElements()))
46467 return SDValue();
46468
46469 // Match shuffle + add pyramid.
46470 ISD::NodeType BinOp;
46471 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46472
46473 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
46474 // done by vpdpbusd computes a signed 16-bit product that will be sign extended
46475 // before adding into the accumulator.
46476 // TODO:
46477 // We also need to verify that the multiply has at least 2x the number of bits
46478 // of the input. We shouldn't match
46479 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
46480 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
46481 // Root = Root.getOperand(0);
46482
46483 // If there was a match, we want Root to be a mul.
46484 if (!Root || Root.getOpcode() != ISD::MUL)
46485 return SDValue();
46486
46487 // Check whether we have an extend and mul pattern
46488 SDValue LHS, RHS;
46489 if (!detectExtMul(DAG, Root, LHS, RHS))
46490 return SDValue();
46491
46492 // Create the dot product instruction.
46493 SDLoc DL(Extract);
46494 unsigned StageBias;
46495 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
46496
46497 // If the original vector was wider than 4 elements, sum over the results
46498 // in the DP vector.
46499 unsigned Stages = Log2_32(VT.getVectorNumElements());
46500 EVT DpVT = DP.getValueType();
46501
46502 if (Stages > StageBias) {
46503 unsigned DpElems = DpVT.getVectorNumElements();
46504
46505 for (unsigned i = Stages - StageBias; i > 0; --i) {
46506 SmallVector<int, 16> Mask(DpElems, -1);
46507 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46508 Mask[j] = MaskEnd + j;
46509
46510 SDValue Shuffle =
46511 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
46512 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
46513 }
46514 }
46515
46516 // Return the lowest ExtractSizeInBits bits.
46517 EVT ResVT =
46518 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46519 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
46520 DP = DAG.getBitcast(ResVT, DP);
46521 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
46522 Extract->getOperand(1));
46523}
46524
46525 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
46526 const X86Subtarget &Subtarget) {
46527 using namespace SDPatternMatch;
46528
46529 // PSADBW is only supported on SSE2 and up.
46530 if (!Subtarget.hasSSE2())
46531 return SDValue();
46532
46533 EVT ExtractVT = Extract->getValueType(0);
46534 if (ExtractVT != MVT::i8 && ExtractVT != MVT::i16 && ExtractVT != MVT::i32 &&
46535 ExtractVT != MVT::i64)
46536 return SDValue();
46537
46538 EVT VT = Extract->getOperand(0).getValueType();
46539 if (!isPowerOf2_32(VT.getVectorNumElements()))
46540 return SDValue();
46541
46542 // Match shuffle + add pyramid.
46543 ISD::NodeType BinOp;
46544 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46545 if (!Root)
46546 return SDValue();
46547
46548 // The operand is expected to be zero extended from i8.
46549 // In order to convert to i64 and above, an additional any/zero/sign
46550 // extend is expected.
46551 // The zero extend from 32 bits has no mathematical effect on the result.
46552 // A sign extend is also effectively a zero extend here
46553 // (it extends the sign bit, which is zero).
46554 // So it is correct to skip the sign/zero extend instruction.
46555 if (Root.getOpcode() == ISD::SIGN_EXTEND ||
46556 Root.getOpcode() == ISD::ZERO_EXTEND ||
46557 Root.getOpcode() == ISD::ANY_EXTEND)
46558 Root = Root.getOperand(0);
46559
46560 // Check whether we have a vXi8 abdu pattern.
46561 // TODO: Just match ISD::ABDU once the DAG is topological sorted.
46562 SDValue Src0, Src1;
46563 if (!sd_match(
46564 Root,
46565 m_AnyOf(
46566 m_SpecificVectorElementVT(
46567 MVT::i8, m_c_BinOp(ISD::ABDU, m_Value(Src0), m_Value(Src1))),
46568 m_SpecificVectorElementVT(
46569 MVT::i8, m_Sub(m_UMax(m_Value(Src0), m_Value(Src1)),
46570 m_UMin(m_Deferred(Src0), m_Deferred(Src1)))),
46571 m_Abs(
46572 m_Sub(m_AllOf(m_Value(Src0),
46573 m_ZExt(m_SpecificVectorElementVT(MVT::i8))),
46574 m_AllOf(m_Value(Src1),
46575 m_ZExt(m_SpecificVectorElementVT(MVT::i8))))))))
46576 return SDValue();
46577
46578 // Create the SAD instruction.
46579 SDLoc DL(Extract);
46580 SDValue SAD = createPSADBW(DAG, Src0, Src1, DL, Subtarget);
46581
46582 // If the original vector was wider than 8 elements, sum over the results
46583 // in the SAD vector.
46584 unsigned Stages = Log2_32(VT.getVectorNumElements());
46585 EVT SadVT = SAD.getValueType();
46586 if (Stages > 3) {
46587 unsigned SadElems = SadVT.getVectorNumElements();
46588
46589 for (unsigned i = Stages - 3; i > 0; --i) {
46590 SmallVector<int, 16> Mask(SadElems, -1);
46591 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46592 Mask[j] = MaskEnd + j;
46593
46594 SDValue Shuffle =
46595 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
46596 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
46597 }
46598 }
46599
46600 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
46601 // Return the lowest ExtractSizeInBits bits.
46602 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46603 SadVT.getSizeInBits() / ExtractSizeInBits);
46604 SAD = DAG.getBitcast(ResVT, SAD);
46605 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
46606 Extract->getOperand(1));
46607}
46608
46609// If this extract is from a loaded vector value and will be used as an
46610// integer, that requires a potentially expensive XMM -> GPR transfer.
46611// Additionally, if we can convert to a scalar integer load, that will likely
46612// be folded into a subsequent integer op.
46613// Note: SrcVec might not have a VecVT type, but it must be the same size.
46614// Note: Unlike the related fold for this in DAGCombiner, this is not limited
46615// to a single-use of the loaded vector. For the reasons above, we
46616// expect this to be profitable even if it creates an extra load.
46617static SDValue
46618 combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
46619 const SDLoc &dl, SelectionDAG &DAG,
46620 TargetLowering::DAGCombinerInfo &DCI) {
46621 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46622 "Only EXTRACT_VECTOR_ELT supported so far");
46623
46624 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46625 EVT VT = N->getValueType(0);
46626
46627 bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) {
46628 return Use->getOpcode() == ISD::STORE ||
46629 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
46630 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
46631 });
46632
46633 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
46634 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
46635 VecVT.getVectorElementType() == VT &&
46636 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
46637 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
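// Rebase the pointer to the extracted element and emit a scalar load, with
// the pointer info and alignment narrowed to that element and the chain
// ordering of the original vector load preserved.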
46638 SDValue NewPtr = TLI.getVectorElementPointer(
46639 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
46640 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
46641 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
46642 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
46643 SDValue Load =
46644 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
46645 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
46646 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
46647 return Load;
46648 }
46649
46650 return SDValue();
46651}
46652
46653// Attempt to peek through a target shuffle and extract the scalar from the
46654// source.
46655 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
46656 TargetLowering::DAGCombinerInfo &DCI,
46657 const X86Subtarget &Subtarget) {
46658 if (DCI.isBeforeLegalizeOps())
46659 return SDValue();
46660
46661 SDLoc dl(N);
46662 SDValue Src = N->getOperand(0);
46663 SDValue Idx = N->getOperand(1);
46664
46665 EVT VT = N->getValueType(0);
46666 EVT SrcVT = Src.getValueType();
46667 EVT SrcSVT = SrcVT.getVectorElementType();
46668 unsigned SrcEltBits = SrcSVT.getSizeInBits();
46669 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46670
46671 // Don't attempt this for boolean mask vectors or unknown extraction indices.
46672 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
46673 return SDValue();
46674
46675 const APInt &IdxC = N->getConstantOperandAPInt(1);
46676 if (IdxC.uge(NumSrcElts))
46677 return SDValue();
46678
46679 SDValue SrcBC = peekThroughBitcasts(Src);
46680
46681 // Handle extract(bitcast(broadcast(scalar_value))).
46682 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
46683 SDValue SrcOp = SrcBC.getOperand(0);
46684 EVT SrcOpVT = SrcOp.getValueType();
46685 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
46686 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
46687 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
46688 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
46689 // TODO support non-zero offsets.
46690 if (Offset == 0) {
46691 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
46692 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
46693 return SrcOp;
46694 }
46695 }
46696 }
46697
46698 // If we're extracting a single element from a broadcast load and there are
46699 // no other users, just create a single load.
46700 if (X86ISD::VBROADCAST_LOAD == SrcBC.getOpcode() &&
46701 SrcBC.hasOneUse()) {
46702 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
46703 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
46704 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
46705 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
46706 SDValue Load =
46707 DAG.getLoad(VT, dl, MemIntr->getChain(), MemIntr->getBasePtr(),
46708 MemIntr->getPointerInfo(), MemIntr->getBaseAlign(),
46709 MemIntr->getMemOperand()->getFlags());
46710 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
46711 return Load;
46712 }
46713 }
46714
46715 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
46716 // TODO: Move to DAGCombine?
46717 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
46718 SrcBC.getValueType().isInteger() &&
46719 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
46720 SrcBC.getScalarValueSizeInBits() ==
46721 SrcBC.getOperand(0).getValueSizeInBits()) {
46722 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
46723 if (IdxC.ult(Scale)) {
46724 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
46725 SDValue Scl = SrcBC.getOperand(0);
46726 EVT SclVT = Scl.getValueType();
46727 if (Offset) {
46728 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
46729 DAG.getShiftAmountConstant(Offset, SclVT, dl));
46730 }
46731 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
46732 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
46733 return Scl;
46734 }
46735 }
46736
46737 // Handle extract(truncate(x)) for 0'th index.
46738 // TODO: Treat this as a faux shuffle?
46739 // TODO: When can we use this for general indices?
46740 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
46741 (SrcVT.getSizeInBits() % 128) == 0) {
46742 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
46743 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
46744 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
46745 Idx);
46746 }
46747
46748 // We can only legally extract other elements from 128-bit vectors and in
46749 // certain circumstances, depending on SSE-level.
46750 // TODO: Investigate float/double extraction if it will be just stored.
46751 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
46752 unsigned Idx) {
46753 EVT VecSVT = VecVT.getScalarType();
46754 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
46755 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
46756 VecSVT == MVT::i64)) {
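// Peel off the 128-bit lane that holds the requested element and re-index
// the extraction within that lane.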
46757 unsigned EltSizeInBits = VecSVT.getSizeInBits();
46758 unsigned NumEltsPerLane = 128 / EltSizeInBits;
46759 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
46760 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
46761 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
46762 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
46763 Idx &= (NumEltsPerLane - 1);
46764 }
46765 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
46766 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
46767 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
46768 DAG.getBitcast(VecVT, Vec),
46769 DAG.getVectorIdxConstant(Idx, dl));
46770 }
46771 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
46772 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
46773 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
46774 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
46775 DAG.getTargetConstant(Idx, dl, MVT::i8));
46776 }
46777 return SDValue();
46778 };
46779
46780 // Resolve the target shuffle inputs and mask.
46781 SmallVector<int, 16> Mask;
46782 SmallVector<SDValue, 2> Ops;
46783 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
46784 return SDValue();
46785
46786 // Shuffle inputs must be the same size as the result.
46787 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
46788 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
46789 }))
46790 return SDValue();
46791
46792 // Attempt to narrow/widen the shuffle mask to the correct size.
46793 if (Mask.size() != NumSrcElts) {
46794 if ((NumSrcElts % Mask.size()) == 0) {
46795 SmallVector<int, 16> ScaledMask;
46796 int Scale = NumSrcElts / Mask.size();
46797 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
46798 Mask = std::move(ScaledMask);
46799 } else if ((Mask.size() % NumSrcElts) == 0) {
46800 // Simplify Mask based on demanded element.
46801 int ExtractIdx = (int)IdxC.getZExtValue();
46802 int Scale = Mask.size() / NumSrcElts;
46803 int Lo = Scale * ExtractIdx;
46804 int Hi = Scale * (ExtractIdx + 1);
46805 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
46806 if (i < Lo || Hi <= i)
46807 Mask[i] = SM_SentinelUndef;
46808
46809 SmallVector<int, 16> WidenedMask;
46810 while (Mask.size() > NumSrcElts &&
46811 canWidenShuffleElements(Mask, WidenedMask))
46812 Mask = std::move(WidenedMask);
46813 }
46814 }
46815
46816 // If narrowing/widening failed, see if we can extract+zero-extend.
46817 int ExtractIdx;
46818 EVT ExtractVT;
46819 if (Mask.size() == NumSrcElts) {
46820 ExtractIdx = Mask[IdxC.getZExtValue()];
46821 ExtractVT = SrcVT;
46822 } else {
46823 unsigned Scale = Mask.size() / NumSrcElts;
46824 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
46825 return SDValue();
46826 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
46827 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
46828 return SDValue();
46829 ExtractIdx = Mask[ScaledIdx];
46830 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
46831 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
46832 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
46833 "Failed to widen vector type");
46834 }
46835
46836 // If the shuffle source element is undef/zero then we can just accept it.
46837 if (ExtractIdx == SM_SentinelUndef)
46838 return DAG.getUNDEF(VT);
46839
46840 if (ExtractIdx == SM_SentinelZero)
46841 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
46842 : DAG.getConstant(0, dl, VT);
46843
46844 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
46845 ExtractIdx = ExtractIdx % Mask.size();
46846 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
46847 return DAG.getZExtOrTrunc(V, dl, VT);
46848
46849 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
46850 if (SDValue V = combineExtractFromVectorLoad(
46851 N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
46852 return V;
46853
46854 return SDValue();
46855}
46856
46857/// Extracting a scalar FP value from vector element 0 is free, so extract each
46858/// operand first, then perform the math as a scalar op.
46859 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
46860 const X86Subtarget &Subtarget,
46861 TargetLowering::DAGCombinerInfo &DCI) {
46862 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
46863 SDValue Vec = ExtElt->getOperand(0);
46864 SDValue Index = ExtElt->getOperand(1);
46865 EVT VT = ExtElt->getValueType(0);
46866 EVT VecVT = Vec.getValueType();
46867
46868 // TODO: If this is a unary/expensive/expand op, allow extraction from a
46869 // non-zero element because the shuffle+scalar op will be cheaper?
46870 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
46871 return SDValue();
46872
46873 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
46874 // extract, the condition code), so deal with those as a special-case.
46875 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
46876 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
46877 if (OpVT != MVT::f32 && OpVT != MVT::f64)
46878 return SDValue();
46879
46880 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
46881 SDLoc DL(ExtElt);
46882 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46883 Vec.getOperand(0), Index);
46884 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46885 Vec.getOperand(1), Index);
46886 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
46887 }
46888
46889 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
46890 VT != MVT::f64)
46891 return SDValue();
46892
46893 // Vector FP selects don't fit the pattern of FP math ops (because the
46894 // condition has a different type and we have to change the opcode), so deal
46895 // with those here.
46896 // FIXME: This is restricted to pre type legalization. If we loosen this we
46897 // need to convert vector bool to a scalar bool.
46898 if (DCI.isBeforeLegalize() && Vec.getOpcode() == ISD::VSELECT &&
46899 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
46900 Vec.getOperand(0).getOperand(0).getValueType() == VecVT &&
46901 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1) {
46902 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
46903 SDLoc DL(ExtElt);
46904 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
46905 Vec.getOperand(0).getValueType().getScalarType(),
46906 Vec.getOperand(0), Index);
46907 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46908 Vec.getOperand(1), Index);
46909 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46910 Vec.getOperand(2), Index);
46911 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
46912 }
46913
46914 // TODO: This switch could include FNEG and the x86-specific FP logic ops
46915 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
46916 // missed load folding and fma+fneg combining.
46917 switch (Vec.getOpcode()) {
46918 case ISD::FMA: // Begin 3 operands
46919 case ISD::FMAD:
46920 case ISD::FADD: // Begin 2 operands
46921 case ISD::FSUB:
46922 case ISD::FMUL:
46923 case ISD::FDIV:
46924 case ISD::FREM:
46925 case ISD::FCOPYSIGN:
46926 case ISD::FMINNUM:
46927 case ISD::FMAXNUM:
46928 case ISD::FMINNUM_IEEE:
46929 case ISD::FMAXNUM_IEEE:
46930 case ISD::FMAXIMUM:
46931 case ISD::FMINIMUM:
46932 case ISD::FMAXIMUMNUM:
46933 case ISD::FMINIMUMNUM:
46934 case X86ISD::FMAX:
46935 case X86ISD::FMIN:
46936 case ISD::FABS: // Begin 1 operand
46937 case ISD::FSQRT:
46938 case ISD::FRINT:
46939 case ISD::FCEIL:
46940 case ISD::FTRUNC:
46941 case ISD::FNEARBYINT:
46942 case ISD::FROUNDEVEN:
46943 case ISD::FROUND:
46944 case ISD::FFLOOR:
46945 case X86ISD::FRCP:
46946 case X86ISD::FRSQRT: {
46947 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46948 SDLoc DL(ExtElt);
46949 SmallVector<SDValue, 4> ExtOps;
46950 for (SDValue Op : Vec->ops())
46951 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
46952 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
46953 }
46954 default:
46955 return SDValue();
46956 }
46957 llvm_unreachable("All opcodes should return within switch");
46958}
46959
46960/// Try to convert a vector reduction sequence composed of binops and shuffles
46961/// into horizontal ops.
46962 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
46963 const X86Subtarget &Subtarget) {
46964 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
46965
46966 // We need at least SSE2 to do anything here.
46967 if (!Subtarget.hasSSE2())
46968 return SDValue();
46969
46970 ISD::NodeType Opc;
46971 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
46972 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
46973 if (!Rdx)
46974 return SDValue();
46975
46976 SDValue Index = ExtElt->getOperand(1);
46977 assert(isNullConstant(Index) &&
46978 "Reduction doesn't end in an extract from index 0");
46979
46980 EVT VT = ExtElt->getValueType(0);
46981 EVT VecVT = Rdx.getValueType();
46982 if (VecVT.getScalarType() != VT)
46983 return SDValue();
46984
46985 SDLoc DL(ExtElt);
46986 unsigned NumElts = VecVT.getVectorNumElements();
46987 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
46988
46989 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
46990 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
46991 if (V.getValueType() == MVT::v4i8) {
46992 if (ZeroExtend && Subtarget.hasSSE41()) {
46993 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
46994 DAG.getConstant(0, DL, MVT::v4i32),
46995 DAG.getBitcast(MVT::i32, V),
46996 DAG.getVectorIdxConstant(0, DL));
46997 return DAG.getBitcast(MVT::v16i8, V);
46998 }
46999 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
47000 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
47001 : DAG.getUNDEF(MVT::v4i8));
47002 }
47003 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
47004 DAG.getUNDEF(MVT::v8i8));
47005 };
47006
47007 // vXi8 mul reduction - promote to vXi16 mul reduction.
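// Interleaving the i8 elements with undef data widens them into i16 lanes;
// any garbage in the upper byte of each lane is harmless because only the
// low 8 bits of each product feed the final i8 result.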
47008 if (Opc == ISD::MUL) {
47009 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
47010 return SDValue();
47011 if (VecVT.getSizeInBits() >= 128) {
47012 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
47013 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47014 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47015 Lo = DAG.getBitcast(WideVT, Lo);
47016 Hi = DAG.getBitcast(WideVT, Hi);
47017 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
47018 while (Rdx.getValueSizeInBits() > 128) {
47019 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47020 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
47021 }
47022 } else {
47023 Rdx = WidenToV16I8(Rdx, false);
47024 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
47025 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
47026 }
47027 if (NumElts >= 8)
47028 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47029 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47030 {4, 5, 6, 7, -1, -1, -1, -1}));
47031 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47032 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47033 {2, 3, -1, -1, -1, -1, -1, -1}));
47034 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47035 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47036 {1, -1, -1, -1, -1, -1, -1, -1}));
47037 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47038 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47039 }
47040
47041 // vXi8 add reduction - sub-128-bit vector.
47042 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
47043 Rdx = WidenToV16I8(Rdx, true);
47044 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47045 DAG.getConstant(0, DL, MVT::v16i8));
47046 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47047 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47048 }
47049
47050 // Must be a >=128-bit vector with pow2 elements.
47051 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
47052 return SDValue();
47053
47054 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
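// PSADBW against an all-zero vector sums each group of 8 bytes into the low
// 16 bits of the corresponding i64 lane; the low byte of that sum is the i8
// reduction result (which only needs to be correct modulo 256).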
47055 if (VT == MVT::i8) {
47056 while (Rdx.getValueSizeInBits() > 128) {
47057 SDValue Lo, Hi;
47058 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47059 VecVT = Lo.getValueType();
47060 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47061 }
47062 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
47063
47064 SDValue Hi = DAG.getVectorShuffle(
47065 MVT::v16i8, DL, Rdx, Rdx,
47066 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
47067 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
47068 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47069 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
47070 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47071 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47072 }
47073
47074 // See if we can use vXi8 PSADBW add reduction for larger zext types.
47075 // If the source vector values are 0-255, then we can use PSADBW to
47076 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
47077 // TODO: See if it's worth avoiding vXi16/i32 truncations?
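// Each PSADBW sums a group of 8 bytes into an i64 lane, so once the source
// has been truncated to vXi8 the remaining work is a short i64 add reduction.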
47078 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
47079 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
47080 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
47081 Subtarget.hasAVX512())) {
47082 if (Rdx.getValueType() == MVT::v8i16) {
47083 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
47084 DAG.getUNDEF(MVT::v8i16));
47085 } else {
47086 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
47087 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
47088 if (ByteVT.getSizeInBits() < 128)
47089 Rdx = WidenToV16I8(Rdx, true);
47090 }
47091
47092 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
47093 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47094 ArrayRef<SDValue> Ops) {
47095 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
47096 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
47097 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
47098 };
47099 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
47100 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
47101
47102 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
47103 while (Rdx.getValueSizeInBits() > 128) {
47104 SDValue Lo, Hi;
47105 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47106 VecVT = Lo.getValueType();
47107 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47108 }
47109 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
47110
47111 if (NumElts > 8) {
47112 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
47113 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
47114 }
47115
47116 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
47117 Rdx = DAG.getBitcast(VecVT, Rdx);
47118 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47119 }
47120
47121 // Only use (F)HADD opcodes if they aren't microcoded or we're minimizing codesize.
47122 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
47123 return SDValue();
47124
47125 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
47126
47127 // 256-bit horizontal instructions operate on 128-bit chunks rather than
47128 // across the whole vector, so we need an extract + hop preliminary stage.
47129 // This is the only step where the operands of the hop are not the same value.
47130 // TODO: We could extend this to handle 512-bit or even longer vectors.
47131 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
47132 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
47133 unsigned NumElts = VecVT.getVectorNumElements();
47134 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
47135 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
47136 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
47137 VecVT = Rdx.getValueType();
47138 }
47139 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
47140 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
47141 return SDValue();
47142
47143 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
47144 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
47145 for (unsigned i = 0; i != ReductionSteps; ++i)
47146 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
47147
47148 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47149}
47150
47151/// Detect vector gather/scatter index generation and convert it from being a
47152/// bunch of shuffles and extracts into a somewhat faster sequence.
47153/// For i686, the best sequence is apparently storing the value and loading
47154/// scalars back, while for x64 we should use 64-bit extracts and shifts.
47155 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
47156 TargetLowering::DAGCombinerInfo &DCI,
47157 const X86Subtarget &Subtarget) {
47158 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
47159 return NewOp;
47160
47161 SDValue InputVector = N->getOperand(0);
47162 SDValue EltIdx = N->getOperand(1);
47163 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
47164
47165 EVT SrcVT = InputVector.getValueType();
47166 EVT VT = N->getValueType(0);
47167 SDLoc dl(InputVector);
47168 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
47169 unsigned NumSrcElts = SrcVT.getVectorNumElements();
47170 unsigned NumEltBits = VT.getScalarSizeInBits();
47171 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47172
47173 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
47174 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47175
47176 // Integer Constant Folding.
47177 if (CIdx && VT.isInteger()) {
47178 APInt UndefVecElts;
47179 SmallVector<APInt, 16> EltBits;
47180 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
47181 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
47182 EltBits, /*AllowWholeUndefs*/ true,
47183 /*AllowPartialUndefs*/ false)) {
47184 uint64_t Idx = CIdx->getZExtValue();
47185 if (UndefVecElts[Idx])
47186 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47187 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
47188 }
47189
47190 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
47191 // Improves lowering of bool masks on Rust code that splits them into a byte array.
47192 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
47193 SDValue Src = peekThroughBitcasts(InputVector);
47194 if (Src.getValueType().getScalarType() == MVT::i1 &&
47195 TLI.isTypeLegal(Src.getValueType())) {
47196 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
47197 SDValue Sub = DAG.getNode(
47198 ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
47199 DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl));
47200 return DAG.getBitcast(VT, Sub);
47201 }
47202 }
47203 }
47204
47205 if (IsPextr) {
47206 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
47207 DCI))
47208 return SDValue(N, 0);
47209
47210 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
47211 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
47212 InputVector.getOpcode() == X86ISD::PINSRW) &&
47213 InputVector.getOperand(2) == EltIdx) {
47214 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
47215 "Vector type mismatch");
47216 SDValue Scl = InputVector.getOperand(1);
47217 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
47218 return DAG.getZExtOrTrunc(Scl, dl, VT);
47219 }
47220
47221 // TODO - Remove this once we can handle the implicit zero-extension of
47222 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
47223 // combineBasicSADPattern.
47224 return SDValue();
47225 }
47226
47227 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
47228 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
47229 InputVector.getOpcode() == ISD::BITCAST &&
47230 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47231 isNullConstant(EltIdx) && InputVector.hasOneUse())
47232 return DAG.getBitcast(VT, InputVector);
47233
47234 // Detect mmx to i32 conversion through a v2i32 elt extract.
47235 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
47236 InputVector.getOpcode() == ISD::BITCAST &&
47237 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47238 isNullConstant(EltIdx) && InputVector.hasOneUse())
47239 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
47240 InputVector.getOperand(0));
47241
47242 // Check whether this extract is the root of a sum of absolute differences
47243 // pattern. This has to be done here because we really want it to happen
47244 // pre-legalization.
47245 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
47246 return SAD;
47247
47248 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
47249 return VPDPBUSD;
47250
47251 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
47252 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
47253 return Cmp;
47254
47255 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
47256 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
47257 return MinMax;
47258
47259 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
47260 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
47261 return V;
47262
47263 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
47264 return V;
47265
47266 if (CIdx)
47267 if (SDValue V = combineExtractFromVectorLoad(
47268 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
47269 dl, DAG, DCI))
47270 return V;
47271
47272 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
47273 // and then testing the relevant element.
47274 //
47275 // Note that we only combine extracts on the *same* result number, i.e.
47276 // t0 = merge_values a0, a1, a2, a3
47277 // i1 = extract_vector_elt t0, Constant:i64<2>
47278 // i1 = extract_vector_elt t0, Constant:i64<3>
47279 // but not
47280 // i1 = extract_vector_elt t0:1, Constant:i64<2>
47281 // since the latter would need its own MOVMSK.
47282 if (SrcVT.getScalarType() == MVT::i1) {
47283 bool IsVar = !CIdx;
47284 SmallVector<SDNode *, 16> BoolExtracts;
47285 unsigned ResNo = InputVector.getResNo();
47286 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
47287 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
47288 Use->getOperand(0).getResNo() == ResNo &&
47289 Use->getValueType(0) == MVT::i1) {
47290 BoolExtracts.push_back(Use);
47291 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
47292 return true;
47293 }
47294 return false;
47295 };
47296 // TODO: Can we drop the oneuse check for constant extracts?
47297 if (all_of(InputVector->users(), IsBoolExtract) &&
47298 (IsVar || BoolExtracts.size() > 1)) {
47299 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
47300 if (SDValue BC =
47301 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
47302 for (SDNode *Use : BoolExtracts) {
47303 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
47304 // Mask = 1 << MaskIdx
47305 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
47306 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
47307 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
47308 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
47309 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
47310 DCI.CombineTo(Use, Res);
47311 }
47312 return SDValue(N, 0);
47313 }
47314 }
47315 }
47316
47317 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
47318 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
47319 SDValue TruncSrc = InputVector.getOperand(0);
47320 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
47321 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
47322 SDValue NewExt =
47323 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
47324 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
47325 }
47326 }
47327
47328 return SDValue();
47329}
47330
47331// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
47332// This is more or less the reverse of combineBitcastvxi1.
47333 static SDValue combineToExtendBoolVectorInReg(
47334 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
47335 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
47336 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
47337 Opcode != ISD::ANY_EXTEND)
47338 return SDValue();
47339 if (!DCI.isBeforeLegalizeOps())
47340 return SDValue();
47341 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
47342 return SDValue();
47343
47344 EVT SVT = VT.getScalarType();
47345 EVT InSVT = N0.getValueType().getScalarType();
47346 unsigned EltSizeInBits = SVT.getSizeInBits();
47347
47348 // Input type must be extending a bool vector (bit-casted from a scalar
47349 // integer) to legal integer types.
47350 if (!VT.isVector())
47351 return SDValue();
47352 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
47353 return SDValue();
47354 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
47355 return SDValue();
47356
47357 SDValue N00 = N0.getOperand(0);
47358 EVT SclVT = N00.getValueType();
47359 if (!SclVT.isScalarInteger())
47360 return SDValue();
47361
47362 SDValue Vec;
47363 SmallVector<int> ShuffleMask;
47364 unsigned NumElts = VT.getVectorNumElements();
47365 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
47366
47367 // Broadcast the scalar integer to the vector elements.
47368 if (NumElts > EltSizeInBits) {
47369 // If the scalar integer is greater than the vector element size, then we
47370 // must split it down into sub-sections for broadcasting. For example:
47371 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
47372 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
47373 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
47374 unsigned Scale = NumElts / EltSizeInBits;
47375 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
47376 bool UseBroadcast = Subtarget.hasInt256() &&
47377 (!BroadcastVT.is128BitVector() || isa<LoadSDNode>(N00));
47378 Vec = UseBroadcast
47379 ? DAG.getSplat(BroadcastVT, DL, N00)
47380 : DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
47381 Vec = DAG.getBitcast(VT, Vec);
47382
47383 for (unsigned i = 0; i != Scale; ++i) {
47384 int Offset = UseBroadcast ? (i * EltSizeInBits) : 0;
47385 ShuffleMask.append(EltSizeInBits, i + Offset);
47386 }
47387 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
47388 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
47389 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
47390 // If we have register broadcast instructions, use the scalar size as the
47391 // element type for the shuffle. Then cast to the wider element type. The
47392 // widened bits won't be used, and this might allow the use of a broadcast
47393 // load.
47394 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
47395 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT,
47396 (NumElts * EltSizeInBits) / NumElts);
47397 Vec = DAG.getBitcast(VT, DAG.getSplat(BroadcastVT, DL, N00));
47398 } else {
47399 // For smaller scalar integers, we can simply any-extend it to the vector
47400 // element size (we don't care about the upper bits) and broadcast it to all
47401 // elements.
47402 Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
47403 }
47404
47405 // Now, mask the relevant bit in each element.
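// For the i16 -> v16i8 case, for example, each byte i of the broadcast is
// ANDed with (1 << (i % 8)), isolating the bit that element is meant to test.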
47406 SmallVector<SDValue, 8> Bits;
47407 for (unsigned i = 0; i != NumElts; ++i) {
47408 int BitIdx = (i % EltSizeInBits);
47409 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
47410 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
47411 }
47412 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
47413 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
47414
47415 // Compare against the bitmask and extend the result.
47416 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
47417 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
47418 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
47419
47420 // For SEXT, this is now done, otherwise shift the result down for
47421 // zero-extension.
47422 if (Opcode == ISD::SIGN_EXTEND)
47423 return Vec;
47424 return DAG.getNode(ISD::SRL, DL, VT, Vec,
47425 DAG.getConstant(EltSizeInBits - 1, DL, VT));
47426}
47427
47428/// If both arms of a vector select are concatenated vectors, split the select,
47429/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
47430/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
47431/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
47432 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47433 const X86Subtarget &Subtarget) {
47434 unsigned Opcode = N->getOpcode();
47435 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
47436 return SDValue();
47437
47438 // TODO: Split 512-bit vectors too?
47439 EVT VT = N->getValueType(0);
47440 if (!VT.is256BitVector())
47441 return SDValue();
47442
47443 // TODO: Split as long as any 2 of the 3 operands are concatenated?
47444 SDValue Cond = N->getOperand(0);
47445 SDValue TVal = N->getOperand(1);
47446 SDValue FVal = N->getOperand(2);
47447 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
47448 !isFreeToSplitVector(TVal, DAG) || !isFreeToSplitVector(FVal, DAG))
47449 return SDValue();
47450
47451 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
47452 ArrayRef<SDValue> Ops) {
47453 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
47454 };
47455 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend,
47456 /*CheckBWI*/ false);
47457}
47458
47459 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG,
47460 const SDLoc &DL) {
47461 SDValue Cond = N->getOperand(0);
47462 SDValue LHS = N->getOperand(1);
47463 SDValue RHS = N->getOperand(2);
47464
47465 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
47466 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
47467 if (!TrueC || !FalseC)
47468 return SDValue();
47469
47470 // Don't do this for crazy integer types.
47471 EVT VT = N->getValueType(0);
47472 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
47473 return SDValue();
47474
47475 // We're going to use the condition bit in math or logic ops. We could allow
47476 // this with a wider condition value (post-legalization it becomes an i8),
47477 // but if nothing is creating selects that late, it doesn't matter.
47478 if (Cond.getValueType() != MVT::i1)
47479 return SDValue();
47480
47481 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
47482 // 3, 5, or 9 with i32/i64, so those get transformed too.
47483 // TODO: For constants that overflow or do not differ by power-of-2 or small
47484 // multiplier, convert to 'and' + 'add'.
47485 const APInt &TrueVal = TrueC->getAPIntValue();
47486 const APInt &FalseVal = FalseC->getAPIntValue();
47487
47488 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
47489 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
47490 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
47491 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47492 if (CC == ISD::SETEQ || CC == ISD::SETNE)
47493 return SDValue();
47494 }
47495
47496 bool OV;
47497 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
47498 if (OV)
47499 return SDValue();
47500
47501 APInt AbsDiff = Diff.abs();
47502 if (AbsDiff.isPowerOf2() ||
47503 ((VT == MVT::i32 || VT == MVT::i64) &&
47504 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
47505
47506 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
47507 // of the condition can usually be folded into a compare predicate, but even
47508 // without that, the sequence should be cheaper than a CMOV alternative.
47509 if (TrueVal.slt(FalseVal)) {
47510 Cond = DAG.getNOT(DL, Cond, MVT::i1);
47511 std::swap(TrueC, FalseC);
47512 }
47513
47514 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
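// e.g. select Cond, 7, 3 --> (zext(Cond) * 4) + 3, which lowers to a shift
// (or LEA) plus an add instead of a CMOV.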
47515 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
47516
47517 // Multiply condition by the difference if non-one.
47518 if (!AbsDiff.isOne())
47519 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
47520
47521 // Add the base if non-zero.
47522 if (!FalseC->isZero())
47523 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
47524
47525 return R;
47526 }
47527
47528 return SDValue();
47529}
47530
47531/// If this is a *dynamic* select (non-constant condition) and we can match
47532/// this node with one of the variable blend instructions, restructure the
47533/// condition so that blends can use the high (sign) bit of each element.
47534/// This function will also call SimplifyDemandedBits on already created
47535/// BLENDV to perform additional simplifications.
47536 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
47537 const SDLoc &DL,
47538 TargetLowering::DAGCombinerInfo &DCI,
47539 const X86Subtarget &Subtarget) {
47540 SDValue Cond = N->getOperand(0);
47541 if ((N->getOpcode() != ISD::VSELECT &&
47542 N->getOpcode() != X86ISD::BLENDV) ||
47543 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
47544 return SDValue();
47545
47546 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47547 unsigned BitWidth = Cond.getScalarValueSizeInBits();
47548 EVT VT = N->getValueType(0);
47549
47550 // We can only handle the cases where VSELECT is directly legal on the
47551 // subtarget. We custom lower VSELECT nodes with constant conditions and
47552 // this makes it hard to see whether a dynamic VSELECT will correctly
47553 // lower, so we both check the operation's status and explicitly handle the
47554 // cases where a *dynamic* blend will fail even though a constant-condition
47555 // blend could be custom lowered.
47556 // FIXME: We should find a better way to handle this class of problems.
47557 // Potentially, we should combine constant-condition vselect nodes
47558 // pre-legalization into shuffles and not mark as many types as custom
47559 // lowered.
47560 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
47561 return SDValue();
47562 // FIXME: We don't support i16-element blends currently. We could and
47563 // should support them by making *all* the bits in the condition be set
47564 // rather than just the high bit and using an i8-element blend.
47565 if (VT.getVectorElementType() == MVT::i16)
47566 return SDValue();
47567 // Dynamic blending was only available from SSE4.1 onward.
47568 if (VT.is128BitVector() && !Subtarget.hasSSE41())
47569 return SDValue();
47570 // Byte blends are only available in AVX2.
47571 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
47572 return SDValue();
47573 // There are no 512-bit blend instructions that use sign bits.
47574 if (VT.is512BitVector())
47575 return SDValue();
47576
47577 // Don't optimize before the condition has been transformed to a legal type
47578 // and don't ever optimize vector selects that map to AVX512 mask-registers.
47579 if (BitWidth < 8 || BitWidth > 64)
47580 return SDValue();
47581
47582 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
47583 for (SDUse &Use : Cond->uses())
47584 if ((Use.getUser()->getOpcode() != ISD::VSELECT &&
47585 Use.getUser()->getOpcode() != X86ISD::BLENDV) ||
47586 Use.getOperandNo() != 0)
47587 return false;
47588
47589 return true;
47590 };
47591
47592 APInt DemandedBits(APInt::getSignMask(BitWidth));
47593
47594 if (OnlyUsedAsSelectCond(Cond)) {
47595 KnownBits Known;
47596 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
47597 !DCI.isBeforeLegalizeOps());
47598 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
47599 return SDValue();
47600
47601 // If we changed the computation somewhere in the DAG, this change will
47602 // affect all users of Cond. Update all the nodes so that we do not use
47603 // the generic VSELECT anymore. Otherwise, we may perform wrong
47604 // optimizations as we messed with the actual expectation for the vector
47605 // boolean values.
47606 for (SDNode *U : Cond->users()) {
47607 if (U->getOpcode() == X86ISD::BLENDV)
47608 continue;
47609
47610 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
47611 Cond, U->getOperand(1), U->getOperand(2));
47612 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
47613 DCI.AddToWorklist(U);
47614 }
47615 DCI.CommitTargetLoweringOpt(TLO);
47616 return SDValue(N, 0);
47617 }
47618
47619 // Otherwise we can still at least try to simplify multiple use bits.
47620 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
47621 return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
47622 N->getOperand(1), N->getOperand(2));
47623
47624 return SDValue();
47625}
47626
47627// Try to match:
47628// (or (and (M, (sub 0, X)), (pandn M, X)))
47629// which is a special case of:
47630// (select M, (sub 0, X), X)
47631// Per:
47632// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
47633// We know that, if fNegate is 0 or 1:
47634// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
47635//
47636// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
47637// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
47638// ( M ? -X : X) == ((X ^ M ) + (M & 1))
47639// This lets us transform our vselect to:
47640// (add (xor X, M), (and M, 1))
47641// And further to:
47642// (sub (xor X, M), M)
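// For example, with M == -1 (negate) and X == 5: (5 ^ -1) - (-1) == -6 + 1
// == -5, while with M == 0 the result is just (5 ^ 0) - 0 == 5.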
47643 static SDValue combineLogicBlendIntoConditionalNegate(
47644 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
47645 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
47646 using namespace SDPatternMatch;
47647 EVT MaskVT = Mask.getValueType();
47648 assert(MaskVT.isInteger() &&
47649 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
47650 "Mask must be zero/all-bits");
47651
47652 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT ||
47653 !DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
47654 return SDValue();
47655
47656 SDValue V;
47657 if (!sd_match(Y, m_Neg(m_AllOf(m_Specific(X), m_Value(V)))) &&
47658 !sd_match(X, m_Neg(m_AllOf(m_Specific(Y), m_Value(V)))))
47659 return SDValue();
47660
47661 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
47662 SDValue SubOp2 = Mask;
47663
47664 // If the negate was on the false side of the select, then
47665 // the operands of the SUB need to be swapped. PR 27251.
47666 // This is because the pattern being matched above is
47667 // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
47668 // but if the pattern matched was
47669 // (vselect M, X, (sub 0, X)), that is really a negation of the pattern
47670 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
47671 // pattern also needs to be a negation of the replacement pattern above.
47672 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
47673 // sub accomplishes the negation of the replacement pattern.
47674 if (V == Y)
47675 std::swap(SubOp1, SubOp2);
47676
47677 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
47678 return DAG.getBitcast(VT, Res);
47679}
47680
47681 static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47682 const X86Subtarget &Subtarget) {
47683 using namespace SDPatternMatch;
47684 if (!Subtarget.hasAVX512())
47685 return SDValue();
47686
47687 ISD::CondCode CC;
47688 SDValue Cond, X, Y, LHS, RHS;
47689 if (!sd_match(N, m_VSelect(m_AllOf(m_Value(Cond),
47690 m_OneUse(m_SetCC(m_Value(X), m_Value(Y),
47691 m_CondCode(CC)))),
47692 m_Value(LHS), m_Value(RHS))))
47693 return SDValue();
47694
47695 if (canCombineAsMaskOperation(LHS, Subtarget) ||
47696 !canCombineAsMaskOperation(RHS, Subtarget))
47697 return SDValue();
47698
47699 // Commute LHS and RHS to create opportunity to select mask instruction.
47700 // (vselect M, L, R) -> (vselect ~M, R, L)
47701 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, X.getValueType());
47702 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), X, Y, NewCC);
47703 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
47704}
47705
47706/// Do target-specific dag combines on SELECT and VSELECT nodes.
47707 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
47708 TargetLowering::DAGCombinerInfo &DCI,
47709 const X86Subtarget &Subtarget) {
47710 SDLoc DL(N);
47711 SDValue Cond = N->getOperand(0);
47712 SDValue LHS = N->getOperand(1);
47713 SDValue RHS = N->getOperand(2);
47714
47715 // Try simplification again because we use this function to optimize
47716 // BLENDV nodes that are not handled by the generic combiner.
47717 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
47718 return V;
47719
47720 // When AVX512 is available, the LHS operand of a select instruction can be
47721 // folded with a mask instruction, while the RHS operand can't. Commute the
47722 // LHS and RHS of the select instruction to create the opportunity for
47723 // folding.
47724 if (SDValue V = commuteSelect(N, DAG, DL, Subtarget))
47725 return V;
47726
47727 EVT VT = LHS.getValueType();
47728 EVT CondVT = Cond.getValueType();
47729 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47730 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
47731
47732 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
47733 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
47734 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
47735 if (CondVT.isVector() && CondVT.isInteger() &&
47736 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
47737 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
47738 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
47739 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
47740 DL, DAG, Subtarget))
47741 return V;
47742
47743 if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
47744 SmallVector<int, 64> CondMask;
47745 if (createShuffleMaskFromVSELECT(CondMask, Cond,
47746 N->getOpcode() == X86ISD::BLENDV)) {
47747 // Convert vselects with constant condition into shuffles.
47748 if (DCI.isBeforeLegalizeOps())
47749 return DAG.getVectorShuffle(VT, DL, LHS, RHS, CondMask);
47750
47751 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
47752 // by forcing the unselected elements to zero.
47753 // TODO: Can we handle more shuffles with this?
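// Note: a PSHUFB control byte with its top bit set (0x80) zeroes the
// destination byte, so rewriting the unselected lanes of each shuffle mask to
// 0x80 makes the two results disjoint and safe to OR together.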
47754 if (LHS.hasOneUse() && RHS.hasOneUse()) {
47755 SmallVector<SDValue, 1> LHSOps, RHSOps;
47756 SmallVector<int, 64> LHSMask, RHSMask, ByteMask;
47757 SDValue LHSShuf = peekThroughOneUseBitcasts(LHS);
47758 SDValue RHSShuf = peekThroughOneUseBitcasts(RHS);
47759 if (LHSShuf.getOpcode() == X86ISD::PSHUFB &&
47760 RHSShuf.getOpcode() == X86ISD::PSHUFB &&
47761 scaleShuffleMaskElts(VT.getSizeInBits() / 8, CondMask, ByteMask) &&
47762 getTargetShuffleMask(LHSShuf, true, LHSOps, LHSMask) &&
47763 getTargetShuffleMask(RHSShuf, true, RHSOps, RHSMask)) {
47764 assert(ByteMask.size() == LHSMask.size() &&
47765 ByteMask.size() == RHSMask.size() && "Shuffle mask mismatch");
47766 for (auto [I, M] : enumerate(ByteMask)) {
47767 // getConstVector sets negative shuffle mask values as undef, so
47768 // ensure we hardcode SM_SentinelZero values to zero (0x80).
47769 if (M < (int)ByteMask.size()) {
47770 LHSMask[I] = isUndefOrZero(LHSMask[I]) ? 0x80 : LHSMask[I];
47771 RHSMask[I] = 0x80;
47772 } else {
47773 LHSMask[I] = 0x80;
47774 RHSMask[I] = isUndefOrZero(RHSMask[I]) ? 0x80 : RHSMask[I];
47775 }
47776 }
47777 MVT ByteVT = LHSShuf.getSimpleValueType();
47778 LHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, LHSOps[0],
47779 getConstVector(LHSMask, ByteVT, DAG, DL, true));
47780 RHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, RHSOps[0],
47781 getConstVector(RHSMask, ByteVT, DAG, DL, true));
47782 return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, ByteVT, LHS, RHS));
47783 }
47784 }
47785
47786 // Attempt to combine as shuffle.
47787 SDValue Op(N, 0);
47788 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47789 return Res;
47790 }
47791 }
47792
47793 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
47794 // instructions match the semantics of the common C idiom x<y?x:y but not
47795 // x<=y?x:y, because of how they handle negative zero (which can be
47796 // ignored in unsafe-math mode).
47797 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
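// Note: the SSE MIN/MAX instructions return the second source operand when the
// operands are unordered (NaN) or when both operands are zero of either sign,
// which is why the exact predicate and the operand order matter in the
// switches below.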
47798 if ((Cond.getOpcode() == ISD::SETCC ||
47799 Cond.getOpcode() == ISD::STRICT_FSETCCS) &&
47800 VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 &&
47801 !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
47802 ((VT != MVT::v8f16 && VT != MVT::v16f16) || Subtarget.hasVLX()) &&
47803 (Subtarget.hasSSE2() ||
47804 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
47805 bool IsStrict = Cond->isStrictFPOpcode();
47806 ISD::CondCode CC =
47807 cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
47808 SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0);
47809 SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1);
47810
47811 unsigned Opcode = 0;
47812 // Check for x CC y ? x : y.
47813 if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
47814 switch (CC) {
47815 default: break;
47816 case ISD::SETULT:
47817 // Converting this to a min would handle NaNs incorrectly, and swapping
47818 // the operands would cause it to handle comparisons between positive
47819 // and negative zero incorrectly.
47820 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47821 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47822 !(DAG.isKnownNeverZeroFloat(LHS) ||
47823 DAG.isKnownNeverZeroFloat(RHS)))
47824 break;
47825 std::swap(LHS, RHS);
47826 }
47827 Opcode = X86ISD::FMIN;
47828 break;
47829 case ISD::SETOLE:
47830 // Converting this to a min would handle comparisons between positive
47831 // and negative zero incorrectly.
47832 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47833 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47834 break;
47835 Opcode = X86ISD::FMIN;
47836 break;
47837 case ISD::SETULE:
47838 // Converting this to a min would handle both negative zeros and NaNs
47839 // incorrectly, but we can swap the operands to fix both.
47840 std::swap(LHS, RHS);
47841 [[fallthrough]];
47842 case ISD::SETOLT:
47843 case ISD::SETLT:
47844 case ISD::SETLE:
47845 Opcode = X86ISD::FMIN;
47846 break;
47847
47848 case ISD::SETOGE:
47849 // Converting this to a max would handle comparisons between positive
47850 // and negative zero incorrectly.
47851 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47852 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47853 break;
47854 Opcode = X86ISD::FMAX;
47855 break;
47856 case ISD::SETUGT:
47857 // Converting this to a max would handle NaNs incorrectly, and swapping
47858 // the operands would cause it to handle comparisons between positive
47859 // and negative zero incorrectly.
47860 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47861 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47862 !(DAG.isKnownNeverZeroFloat(LHS) ||
47863 DAG.isKnownNeverZeroFloat(RHS)))
47864 break;
47865 std::swap(LHS, RHS);
47866 }
47867 Opcode = X86ISD::FMAX;
47868 break;
47869 case ISD::SETUGE:
47870 // Converting this to a max would handle both negative zeros and NaNs
47871 // incorrectly, but we can swap the operands to fix both.
47872 std::swap(LHS, RHS);
47873 [[fallthrough]];
47874 case ISD::SETOGT:
47875 case ISD::SETGT:
47876 case ISD::SETGE:
47877 Opcode = X86ISD::FMAX;
47878 break;
47879 }
47880 // Check for x CC y ? y : x -- a min/max with reversed arms.
47881 } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
47882 switch (CC) {
47883 default: break;
47884 case ISD::SETOGE:
47885 // Converting this to a min would handle comparisons between positive
47886 // and negative zero incorrectly, and swapping the operands would
47887 // cause it to handle NaNs incorrectly.
47888 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47889 !(DAG.isKnownNeverZeroFloat(LHS) ||
47890 DAG.isKnownNeverZeroFloat(RHS))) {
47891 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47892 break;
47893 std::swap(LHS, RHS);
47894 }
47895 Opcode = X86ISD::FMIN;
47896 break;
47897 case ISD::SETUGT:
47898 // Converting this to a min would handle NaNs incorrectly.
47899 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47900 break;
47901 Opcode = X86ISD::FMIN;
47902 break;
47903 case ISD::SETUGE:
47904 // Converting this to a min would handle both negative zeros and NaNs
47905 // incorrectly, but we can swap the operands to fix both.
47906 std::swap(LHS, RHS);
47907 [[fallthrough]];
47908 case ISD::SETOGT:
47909 case ISD::SETGT:
47910 case ISD::SETGE:
47911 Opcode = X86ISD::FMIN;
47912 break;
47913
47914 case ISD::SETULT:
47915 // Converting this to a max would handle NaNs incorrectly.
47916 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47917 break;
47918 Opcode = X86ISD::FMAX;
47919 break;
47920 case ISD::SETOLE:
47921 // Converting this to a max would handle comparisons between positive
47922 // and negative zero incorrectly, and swapping the operands would
47923 // cause it to handle NaNs incorrectly.
47924 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47925 !DAG.isKnownNeverZeroFloat(LHS) &&
47926 !DAG.isKnownNeverZeroFloat(RHS)) {
47927 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47928 break;
47929 std::swap(LHS, RHS);
47930 }
47931 Opcode = X86ISD::FMAX;
47932 break;
47933 case ISD::SETULE:
47934 // Converting this to a max would handle both negative zeros and NaNs
47935 // incorrectly, but we can swap the operands to fix both.
47936 std::swap(LHS, RHS);
47937 [[fallthrough]];
47938 case ISD::SETOLT:
47939 case ISD::SETLT:
47940 case ISD::SETLE:
47941 Opcode = X86ISD::FMAX;
47942 break;
47943 }
47944 }
47945
47946 if (Opcode) {
47947 if (IsStrict) {
47948 SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? X86ISD::STRICT_FMIN
47949 : X86ISD::STRICT_FMAX,
47950 DL, {N->getValueType(0), MVT::Other},
47951 {Cond.getOperand(0), LHS, RHS});
47952 DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1));
47953 return Ret;
47954 }
47955 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47956 }
47957 }
47958
47959 // Some mask scalar intrinsics rely on checking if only one bit is set
47960 // and implement it in C code like this:
47961 // A[0] = (U & 1) ? A[0] : W[0];
47962 // This creates some redundant instructions that break pattern matching.
47963 // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
47964 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47965 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
47966 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47967 SDValue AndNode = Cond.getOperand(0);
47968 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
47969 isNullConstant(Cond.getOperand(1)) &&
47970 isOneConstant(AndNode.getOperand(1))) {
47971 // LHS and RHS swapped due to
47972 // setcc outputting 1 when AND resulted in 0 and vice versa.
47973 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
47974 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
47975 }
47976 }
47977
47978 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
47979 // lowering on KNL. In this case we convert it to
47980 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
47981 // The same situation applies to all vectors of i8 and i16 without BWI.
47982 // Make sure we extend these even before type legalization gets a chance to
47983 // split wide vectors.
47984 // Since SKX these selects have a proper lowering.
47985 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
47986 CondVT.getVectorElementType() == MVT::i1 &&
47987 (VT.getVectorElementType() == MVT::i8 ||
47988 VT.getVectorElementType() == MVT::i16)) {
47989 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
47990 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
47991 }
47992
47993 // AVX512 - Extend select to merge with target shuffle.
47994 // select(mask, extract_subvector(shuffle(x)), y) -->
47995 // extract_subvector(select(widen(mask), shuffle(x), widen(y)))
47996 // TODO - support non target shuffles as well with canCombineAsMaskOperation.
47997 if (Subtarget.hasAVX512() && CondVT.isVector() &&
47998 CondVT.getVectorElementType() == MVT::i1) {
47999 auto SelectableOp = [&TLI](SDValue Op, SDValue Alt) {
48000 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
48001 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
48002 isNullConstant(Op.getOperand(1)) &&
48003 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
48004 Op.hasOneUse() && Op.getOperand(0).hasOneUse() &&
48005 (Op.getOperand(0).getOpcode() != X86ISD::VPERMV3 ||
48006 ISD::isBuildVectorAllZeros(Alt.getNode()));
48007 };
48008
48009 bool SelectableLHS = SelectableOp(LHS, RHS);
48010 bool SelectableRHS = SelectableOp(RHS, LHS);
48011 if (SelectableLHS || SelectableRHS) {
48012 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
48013 : RHS.getOperand(0).getValueType();
48014 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
48015 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
48016 VT.getSizeInBits());
48017 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
48018 VT.getSizeInBits());
48019 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
48020 DAG.getUNDEF(SrcCondVT), Cond,
48021 DAG.getVectorIdxConstant(0, DL));
48022 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
48023 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
48024 }
48025 }
48026
48027 if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL))
48028 return V;
48029
48030 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
48031 Cond.hasOneUse()) {
48032 EVT CondVT = Cond.getValueType();
48033 SDValue Cond0 = Cond.getOperand(0);
48034 SDValue Cond1 = Cond.getOperand(1);
48035 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
48036
48037 // Canonicalize min/max:
48038 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
48039 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
48040 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
48041 // the need for an extra compare against zero. e.g.
48042 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
48043 // subl %esi, %edi
48044 // testl %edi, %edi
48045 // movl $0, %eax
48046 // cmovgl %edi, %eax
48047 // =>
48048 // xorl %eax, %eax
48049 // subl %esi, %edi
48050 // cmovsl %eax, %edi
48051 //
48052 // We can also canonicalize
48053 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
48054 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
48055 // This allows the use of a test instruction for the compare.
48056 if (LHS == Cond0 && RHS == Cond1) {
48057 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
48058 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
48059 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
48060 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48061 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48062 }
48063 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
48064 ISD::CondCode NewCC = ISD::SETUGE;
48065 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48066 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48067 }
48068 }
48069
48070 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
48071 // fold eq + gt/lt nested selects into ge/le selects
48072 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
48073 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
48074 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
48075 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
48076 // .. etc ..
48077 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
48078 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
48079 SDValue InnerSetCC = RHS.getOperand(0);
48080 ISD::CondCode InnerCC =
48081 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
48082 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
48083 Cond0 == InnerSetCC.getOperand(0) &&
48084 Cond1 == InnerSetCC.getOperand(1)) {
48085 ISD::CondCode NewCC;
48086 switch (CC == ISD::SETEQ ? InnerCC : CC) {
48087 // clang-format off
48088 case ISD::SETGT: NewCC = ISD::SETGE; break;
48089 case ISD::SETLT: NewCC = ISD::SETLE; break;
48090 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
48091 case ISD::SETULT: NewCC = ISD::SETULE; break;
48092 default: NewCC = ISD::SETCC_INVALID; break;
48093 // clang-format on
48094 }
48095 if (NewCC != ISD::SETCC_INVALID) {
48096 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
48097 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
48098 }
48099 }
48100 }
48101 }
48102
48103 // Check if the first operand is all zeros and Cond type is vXi1.
48104 // If this an avx512 target we can improve the use of zero masking by
48105 // swapping the operands and inverting the condition.
48106 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
48107 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
48108 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
48109 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
48110 // Invert the cond to not(cond) : xor(op,allones)=not(op)
48111 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
48112 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
48113 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
48114 }
48115
48116 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
48117 // get split by legalization.
48118 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
48119 CondVT.getVectorElementType() == MVT::i1 &&
48120 TLI.isTypeLegal(VT.getScalarType())) {
48121 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
48122 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
48123 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
48124 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
48125 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
48126 }
48127 }
48128
48129 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
48130 // with out-of-bounds clamping.
48131
48132 // Unlike the generic SHL/SRL nodes, whose result is undefined when a shift
48133 // amount is >= the element bitwidth, AVX2's VSHLV/VSRLV are well defined for
48134 // out-of-range amounts: any element whose shift amount is >= the element
48135 // bitwidth is simply zeroed. That matches the "select with zero" patterns
48136 // below, so the compare and select can be folded away.
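// For example (v8i32): (vselect (setult amt, splat(32)), (shl x, amt), zero)
// becomes (X86ISD::VSHLV x, amt), since VPSLLVD already yields 0 for any lane
// with amt >= 32.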
48137 if (N->getOpcode() == ISD::VSELECT) {
48138 using namespace llvm::SDPatternMatch;
48139 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
48140 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
48141 if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
48142 supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) &&
48143 ISD::isConstantSplatVectorAllZeros(RHS.getNode()) &&
48144 sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)),
48145 m_SpecificInt(VT.getScalarSizeInBits()),
48146 m_SpecificCondCode(ISD::SETULT)))) {
48147 return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48148 : X86ISD::VSHLV,
48149 DL, VT, LHS.getOperand(0), LHS.getOperand(1));
48150 }
48151 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
48152 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
48153 if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) &&
48154 supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) &&
48155 ISD::isConstantSplatVectorAllZeros(LHS.getNode()) &&
48156 sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)),
48157 m_SpecificInt(VT.getScalarSizeInBits()),
48158 m_SpecificCondCode(ISD::SETUGE)))) {
48159 return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48160 : X86ISD::VSHLV,
48161 DL, VT, RHS.getOperand(0), RHS.getOperand(1));
48162 }
48163 }
48164
48165 // Early exit check
48166 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
48167 return SDValue();
48168
48169 if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget))
48170 return V;
48171
48172 if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget))
48173 return V;
48174
48175 // select(~Cond, X, Y) -> select(Cond, Y, X)
48176 if (CondVT.getScalarType() != MVT::i1) {
48177 if (SDValue CondNot = IsNOT(Cond, DAG))
48178 return DAG.getNode(N->getOpcode(), DL, VT,
48179 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
48180
48181 // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
48182 if (Cond.getOpcode() == X86ISD::PCMPEQ &&
48183 Cond.getOperand(0).getOpcode() == ISD::AND &&
48184 ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
48185 isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
48186 Cond.getScalarValueSizeInBits(),
48187 /*AllowUndefs=*/true) &&
48188 Cond.hasOneUse()) {
48189 Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
48190 Cond.getOperand(0).getOperand(1));
48191 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48192 }
48193
48194 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
48195 // signbit.
48196 if (Cond.getOpcode() == X86ISD::PCMPGT &&
48197 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
48198 Cond.hasOneUse()) {
48199 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
48200 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
48201 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48202 }
48203 }
48204
48205 // Try to optimize vXi1 selects if both operands are either all constants or
48206 // bitcasts from scalar integer type. In that case we can convert the operands
48207 // to integer and use an integer select which will be converted to a CMOV.
48208 // We need to take a little bit of care to avoid creating an i64 type after
48209 // type legalization.
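// For example, a v8i1 select whose operands are constants or bitcasts from i8
// becomes an i8 select, which lowers to a CMOV of the two scalars.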
48210 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
48211 VT.getVectorElementType() == MVT::i1 &&
48212 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
48213 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
48214 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
48215 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
48216 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
48217
48218 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
48219 LHS.getOperand(0).getValueType() == IntVT)) &&
48220 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
48221 RHS.getOperand(0).getValueType() == IntVT))) {
48222 if (LHSIsConst)
48223 LHS = combinevXi1ConstantToInteger(LHS, DAG);
48224 else
48225 LHS = LHS.getOperand(0);
48226
48227 if (RHSIsConst)
48228 RHS = combinevXi1ConstantToInteger(RHS, DAG);
48229 else
48230 RHS = RHS.getOperand(0);
48231
48232 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
48233 return DAG.getBitcast(VT, Select);
48234 }
48235 }
48236 }
48237
48238 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
48239 // single bits, then invert the predicate and swap the select operands.
48240 // This can lower using a vector shift bit-hack rather than mask and compare.
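// For example, with a v4i32 mask <1,2,4,8>, shifting element i left by
// (31 - log2(C_i)) moves the tested bit into the sign bit, so the predicate
// becomes a signed "less than zero" test that the blend can consume directly.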
48241 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
48242 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
48243 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
48244 Cond.getOperand(0).getOpcode() == ISD::AND &&
48245 isNullOrNullSplat(Cond.getOperand(1)) &&
48246 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
48247 Cond.getOperand(0).getValueType() == VT) {
48248 // The 'and' mask must be composed of power-of-2 constants.
48249 SDValue And = Cond.getOperand(0);
48250 auto *C = isConstOrConstSplat(And.getOperand(1));
48251 if (C && C->getAPIntValue().isPowerOf2()) {
48252 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
48253 SDValue NotCond =
48254 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
48255 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
48256 }
48257
48258 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
48259 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
48260 // 16-bit lacks a proper blendv.
48261 unsigned EltBitWidth = VT.getScalarSizeInBits();
48262 bool CanShiftBlend =
48263 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
48264 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
48265 (Subtarget.hasXOP()));
48266 if (CanShiftBlend &&
48267 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
48268 return C->getAPIntValue().isPowerOf2();
48269 })) {
48270 // Create a left-shift constant to get the mask bits over to the sign-bit.
48271 SDValue Mask = And.getOperand(1);
48272 SmallVector<int, 32> ShlVals;
48273 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
48274 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
48275 ShlVals.push_back(EltBitWidth - 1 -
48276 MaskVal->getAPIntValue().exactLogBase2());
48277 }
48278 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
48279 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
48280 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
48281 SDValue NewCond =
48282 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
48283 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
48284 }
48285 }
48286
48287 return SDValue();
48288}
48289
48290/// Combine:
48291/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
48292/// to:
48293/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
48294/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
48295/// Note that this is only legal for some op/cc combinations.
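/// For example, testing "old value < 0" (COND_S) of (atomic_load_add x, 1) is
/// the same as testing "old value + 1 <= 0" (COND_LE) on the EFLAGS that the
/// LOCK ADD already produces, so the separate CMP can be dropped.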
48296static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
48297 SelectionDAG &DAG,
48298 const X86Subtarget &Subtarget) {
48299 // This combine only operates on CMP-like nodes.
48300 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48301 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48302 return SDValue();
48303
48304 // Can't replace the cmp if it has more uses than the one we're looking at.
48305 // FIXME: We would like to be able to handle this, but would need to make sure
48306 // all uses were updated.
48307 if (!Cmp.hasOneUse())
48308 return SDValue();
48309
48310 // This only applies to variations of the common case:
48311 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
48312 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
48313 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
48314 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
48315 // Using the proper condcodes (see below), overflow is checked for.
48316
48317 // FIXME: We can generalize both constraints:
48318 // - XOR/OR/AND (if they were made to survive AtomicExpand)
48319 // - LHS != 1
48320 // if the result is compared.
48321
48322 SDValue CmpLHS = Cmp.getOperand(0);
48323 SDValue CmpRHS = Cmp.getOperand(1);
48324 EVT CmpVT = CmpLHS.getValueType();
48325
48326 if (!CmpLHS.hasOneUse())
48327 return SDValue();
48328
48329 unsigned Opc = CmpLHS.getOpcode();
48330 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
48331 return SDValue();
48332
48333 SDValue OpRHS = CmpLHS.getOperand(2);
48334 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
48335 if (!OpRHSC)
48336 return SDValue();
48337
48338 APInt Addend = OpRHSC->getAPIntValue();
48339 if (Opc == ISD::ATOMIC_LOAD_SUB)
48340 Addend = -Addend;
48341
48342 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
48343 if (!CmpRHSC)
48344 return SDValue();
48345
48346 APInt Comparison = CmpRHSC->getAPIntValue();
48347 APInt NegAddend = -Addend;
48348
48349 // See if we can adjust the CC to make the comparison match the negated
48350 // addend.
48351 if (Comparison != NegAddend) {
48352 APInt IncComparison = Comparison + 1;
48353 if (IncComparison == NegAddend) {
48354 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
48355 Comparison = IncComparison;
48356 CC = X86::COND_AE;
48357 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
48358 Comparison = IncComparison;
48359 CC = X86::COND_L;
48360 }
48361 }
48362 APInt DecComparison = Comparison - 1;
48363 if (DecComparison == NegAddend) {
48364 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
48365 Comparison = DecComparison;
48366 CC = X86::COND_A;
48367 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
48368 Comparison = DecComparison;
48369 CC = X86::COND_LE;
48370 }
48371 }
48372 }
48373
48374 // If the addend is the negation of the comparison value, then we can do
48375 // a full comparison by emitting the atomic arithmetic as a locked sub.
48376 if (Comparison == NegAddend) {
48377 // The CC is fine, but we need to rewrite the LHS of the comparison as an
48378 // atomic sub.
48379 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
48380 auto AtomicSub = DAG.getAtomic(
48381 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
48382 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
48383 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
48384 AN->getMemOperand());
48385 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
48386 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48387 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48388 return LockOp;
48389 }
48390
48391 // We can handle comparisons with zero in a number of cases by manipulating
48392 // the CC used.
48393 if (!Comparison.isZero())
48394 return SDValue();
48395
48396 if (CC == X86::COND_S && Addend == 1)
48397 CC = X86::COND_LE;
48398 else if (CC == X86::COND_NS && Addend == 1)
48399 CC = X86::COND_G;
48400 else if (CC == X86::COND_G && Addend == -1)
48401 CC = X86::COND_GE;
48402 else if (CC == X86::COND_LE && Addend == -1)
48403 CC = X86::COND_L;
48404 else
48405 return SDValue();
48406
48407 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
48408 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48409 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48410 return LockOp;
48411}
48412
48413// Check whether we're just testing the signbit, and whether we can simplify
48414// this by tracking where the signbit came from.
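// For example, COND_S of (sra X, C) only depends on the sign bit of X itself,
// so the shift can be bypassed and the check rewritten as a TEST against the
// sign-bit mask.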
48415static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
48416 SelectionDAG &DAG) {
48417 if (CC != X86::COND_S && CC != X86::COND_NS)
48418 return SDValue();
48419
48420 if (!Cmp.hasOneUse())
48421 return SDValue();
48422
48423 SDValue Src;
48424 if (Cmp.getOpcode() == X86ISD::CMP) {
48425 // CMP(X,0) -> signbit test
48426 if (!isNullConstant(Cmp.getOperand(1)))
48427 return SDValue();
48428 Src = Cmp.getOperand(0);
48429 // Peek through a SRA node as we just need the signbit.
48430 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
48431 // TODO: Use SimplifyDemandedBits instead of just SRA?
48432 if (Src.getOpcode() != ISD::SRA || !Src.hasOneUse())
48433 return SDValue();
48434 Src = Src.getOperand(0);
48435 } else if (Cmp.getOpcode() == X86ISD::OR) {
48436 // OR(X,Y) -> see if only one operand contributes to the signbit.
48437 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
48438 if (DAG.SignBitIsZero(Cmp.getOperand(0)))
48439 Src = Cmp.getOperand(1);
48440 else if (DAG.SignBitIsZero(Cmp.getOperand(1)))
48441 Src = Cmp.getOperand(0);
48442 else
48443 return SDValue();
48444 } else {
48445 return SDValue();
48446 }
48447
48448 // Replace with a TEST on the MSB.
48449 SDLoc DL(Cmp);
48450 MVT SrcVT = Src.getSimpleValueType();
48451 APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits());
48452
48453 // If Src came from a SIGN_EXTEND_INREG or SHL (probably from an expanded
48454 // SIGN_EXTEND_INREG), then peek through and adjust the TEST bit.
48455 if (Src.getOpcode() == ISD::SHL) {
48456 if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) {
48457 Src = Src.getOperand(0);
48458 BitMask.lshrInPlace(*ShiftAmt);
48459 }
48460 } else if (Src.getOpcode() == ISD::SIGN_EXTEND_INREG) {
48461 EVT ExtVT = cast<VTSDNode>(Src.getOperand(1))->getVT();
48462 Src = Src.getOperand(0);
48463 BitMask.lshrInPlace(BitMask.getBitWidth() - ExtVT.getScalarSizeInBits());
48464 }
48465
48466 SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src,
48467 DAG.getConstant(BitMask, DL, SrcVT));
48468 CC = CC == X86::COND_S ? X86::COND_NE : X86::COND_E;
48469 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Mask,
48470 DAG.getConstant(0, DL, SrcVT));
48471}
48472
48473// Check whether a boolean test is testing a boolean value generated by
48474// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
48475// code.
48476//
48477// Simplify the following patterns:
48478// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
48479// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
48480// to (Op EFLAGS Cond)
48481//
48482// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
48483// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
48484// to (Op EFLAGS !Cond)
48485//
48486// where Op could be BRCOND or CMOV.
48487//
48488static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
48489 // This combine only operates on CMP-like nodes.
48490 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48491 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48492 return SDValue();
48493
48494 // Quit if not used as a boolean value.
48495 if (CC != X86::COND_E && CC != X86::COND_NE)
48496 return SDValue();
48497
48498 // Check CMP operands. One of them should be 0 or 1 and the other should be
48499 // an SetCC or extended from it.
48500 SDValue Op1 = Cmp.getOperand(0);
48501 SDValue Op2 = Cmp.getOperand(1);
48502
48503 SDValue SetCC;
48504 const ConstantSDNode* C = nullptr;
48505 bool needOppositeCond = (CC == X86::COND_E);
48506 bool checkAgainstTrue = false; // Is it a comparison against 1?
48507
48508 if ((C = dyn_cast<ConstantSDNode>(Op1)))
48509 SetCC = Op2;
48510 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
48511 SetCC = Op1;
48512 else // Quit if neither operand is a constant.
48513 return SDValue();
48514
48515 if (C->getZExtValue() == 1) {
48516 needOppositeCond = !needOppositeCond;
48517 checkAgainstTrue = true;
48518 } else if (C->getZExtValue() != 0)
48519 // Quit if the constant is neither 0 nor 1.
48520 return SDValue();
48521
48522 bool truncatedToBoolWithAnd = false;
48523 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
48524 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
48525 SetCC.getOpcode() == ISD::TRUNCATE ||
48526 SetCC.getOpcode() == ISD::AND) {
48527 if (SetCC.getOpcode() == ISD::AND) {
48528 int OpIdx = -1;
48529 if (isOneConstant(SetCC.getOperand(0)))
48530 OpIdx = 1;
48531 if (isOneConstant(SetCC.getOperand(1)))
48532 OpIdx = 0;
48533 if (OpIdx < 0)
48534 break;
48535 SetCC = SetCC.getOperand(OpIdx);
48536 truncatedToBoolWithAnd = true;
48537 } else
48538 SetCC = SetCC.getOperand(0);
48539 }
48540
48541 switch (SetCC.getOpcode()) {
48542 case X86ISD::SETCC_CARRY:
48543 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
48544 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
48545 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
48546 // truncated to i1 using 'and'.
48547 if (checkAgainstTrue && !truncatedToBoolWithAnd)
48548 break;
48550 "Invalid use of SETCC_CARRY!");
48551 [[fallthrough]];
48552 case X86ISD::SETCC:
48553 // Set the condition code or opposite one if necessary.
48554 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
48555 if (needOppositeCond)
48556 CC = X86::GetOppositeBranchCondition(CC);
48557 return SetCC.getOperand(1);
48558 case X86ISD::CMOV: {
48559 // Check whether false/true value has canonical one, i.e. 0 or 1.
48560 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
48561 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
48562 // Quit if true value is not a constant.
48563 if (!TVal)
48564 return SDValue();
48565 // Quit if false value is not a constant.
48566 if (!FVal) {
48567 SDValue Op = SetCC.getOperand(0);
48568 // Skip 'zext' or 'trunc' node.
48569 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
48570 Op.getOpcode() == ISD::TRUNCATE)
48571 Op = Op.getOperand(0);
48572 // A special case for rdrand/rdseed, where 0 is set if false cond is
48573 // found.
48574 if ((Op.getOpcode() != X86ISD::RDRAND &&
48575 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
48576 return SDValue();
48577 }
48578 // Quit if false value is not the constant 0 or 1.
48579 bool FValIsFalse = true;
48580 if (FVal && FVal->getZExtValue() != 0) {
48581 if (FVal->getZExtValue() != 1)
48582 return SDValue();
48583 // If FVal is 1, opposite cond is needed.
48584 needOppositeCond = !needOppositeCond;
48585 FValIsFalse = false;
48586 }
48587 // Quit if TVal is not the constant opposite of FVal.
48588 if (FValIsFalse && TVal->getZExtValue() != 1)
48589 return SDValue();
48590 if (!FValIsFalse && TVal->getZExtValue() != 0)
48591 return SDValue();
48592 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
48593 if (needOppositeCond)
48594 CC = X86::GetOppositeBranchCondition(CC);
48595 return SetCC.getOperand(3);
48596 }
48597 }
48598
48599 return SDValue();
48600}
48601
48602/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
48603/// Match:
48604/// (X86or (X86setcc) (X86setcc))
48605/// (X86cmp (and (X86setcc) (X86setcc)), 0)
48606static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
48607 X86::CondCode &CC1, SDValue &Flags,
48608 bool &isAnd) {
48609 if (Cond->getOpcode() == X86ISD::CMP) {
48610 if (!isNullConstant(Cond->getOperand(1)))
48611 return false;
48612
48613 Cond = Cond->getOperand(0);
48614 }
48615
48616 isAnd = false;
48617
48618 SDValue SetCC0, SetCC1;
48619 switch (Cond->getOpcode()) {
48620 default: return false;
48621 case ISD::AND:
48622 case X86ISD::AND:
48623 isAnd = true;
48624 [[fallthrough]];
48625 case ISD::OR:
48626 case X86ISD::OR:
48627 SetCC0 = Cond->getOperand(0);
48628 SetCC1 = Cond->getOperand(1);
48629 break;
48630 };
48631
48632 // Make sure we have SETCC nodes, using the same flags value.
48633 if (SetCC0.getOpcode() != X86ISD::SETCC ||
48634 SetCC1.getOpcode() != X86ISD::SETCC ||
48635 SetCC0->getOperand(1) != SetCC1->getOperand(1))
48636 return false;
48637
48638 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
48639 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
48640 Flags = SetCC0->getOperand(1);
48641 return true;
48642}
48643
48644// When legalizing carry, we create carries via add X, -1
48645// If that comes from an actual carry, via setcc, we use the
48646// carry directly.
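// For example, if X is known to be 0 or 1, (add X, -1) sets CF exactly when
// X == 1, so a carry that was materialized with SETB and then re-expanded this
// way can reuse the original flag.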
48647static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
48648 if (EFLAGS.getOpcode() == X86ISD::ADD) {
48649 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
48650 bool FoundAndLSB = false;
48651 SDValue Carry = EFLAGS.getOperand(0);
48652 while (Carry.getOpcode() == ISD::TRUNCATE ||
48653 Carry.getOpcode() == ISD::ZERO_EXTEND ||
48654 (Carry.getOpcode() == ISD::AND &&
48655 isOneConstant(Carry.getOperand(1)))) {
48656 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
48657 Carry = Carry.getOperand(0);
48658 }
48659 if (Carry.getOpcode() == X86ISD::SETCC ||
48660 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
48661 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
48662 uint64_t CarryCC = Carry.getConstantOperandVal(0);
48663 SDValue CarryOp1 = Carry.getOperand(1);
48664 if (CarryCC == X86::COND_B)
48665 return CarryOp1;
48666 if (CarryCC == X86::COND_A) {
48667 // Try to convert COND_A into COND_B in an attempt to facilitate
48668 // materializing "setb reg".
48669 //
48670 // Do not flip "e > c", where "c" is a constant, because Cmp
48671 // instruction cannot take an immediate as its first operand.
48672 //
48673 if (CarryOp1.getOpcode() == X86ISD::SUB &&
48674 CarryOp1.getNode()->hasOneUse() &&
48675 CarryOp1.getValueType().isInteger() &&
48676 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
48677 SDValue SubCommute =
48678 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
48679 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
48680 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
48681 }
48682 }
48683 // If this is a check of the z flag of an add with 1, switch to the
48684 // C flag.
48685 if (CarryCC == X86::COND_E &&
48686 CarryOp1.getOpcode() == X86ISD::ADD &&
48687 isOneConstant(CarryOp1.getOperand(1)))
48688 return CarryOp1;
48689 } else if (FoundAndLSB) {
48690 SDLoc DL(Carry);
48691 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
48692 if (Carry.getOpcode() == ISD::SRL) {
48693 BitNo = Carry.getOperand(1);
48694 Carry = Carry.getOperand(0);
48695 }
48696 return getBT(Carry, BitNo, DL, DAG);
48697 }
48698 }
48699 }
48700
48701 return SDValue();
48702}
48703
48704/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
48705/// to avoid the inversion.
48706static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
48707 SelectionDAG &DAG,
48708 const X86Subtarget &Subtarget) {
48709 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
48710 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
48711 EFLAGS.getOpcode() != X86ISD::TESTP)
48712 return SDValue();
48713
48714 // PTEST/TESTP sets EFLAGS as:
48715 // TESTZ: ZF = (Op0 & Op1) == 0
48716 // TESTC: CF = (~Op0 & Op1) == 0
48717 // TESTNZC: ZF == 0 && CF == 0
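// Note: since ZF tests (Op0 & Op1) and CF tests (~Op0 & Op1), substituting
// ~Op0 for Op0 exchanges the roles of ZF and CF, which is what the condition
// code remapping below relies on.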
48718 MVT VT = EFLAGS.getSimpleValueType();
48719 SDValue Op0 = EFLAGS.getOperand(0);
48720 SDValue Op1 = EFLAGS.getOperand(1);
48721 MVT OpVT = Op0.getSimpleValueType();
48722 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48723
48724 // TEST*(~X,Y) == TEST*(X,Y)
48725 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
48726 X86::CondCode InvCC;
48727 switch (CC) {
48728 case X86::COND_B:
48729 // testc -> testz.
48730 InvCC = X86::COND_E;
48731 break;
48732 case X86::COND_AE:
48733 // !testc -> !testz.
48734 InvCC = X86::COND_NE;
48735 break;
48736 case X86::COND_E:
48737 // testz -> testc.
48738 InvCC = X86::COND_B;
48739 break;
48740 case X86::COND_NE:
48741 // !testz -> !testc.
48742 InvCC = X86::COND_AE;
48743 break;
48744 case X86::COND_A:
48745 case X86::COND_BE:
48746 // testnzc -> testnzc (no change).
48747 InvCC = CC;
48748 break;
48749 default:
48750 InvCC = X86::COND_INVALID;
48751 break;
48752 }
48753
48754 if (InvCC != X86::COND_INVALID) {
48755 CC = InvCC;
48756 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48757 DAG.getBitcast(OpVT, NotOp0), Op1);
48758 }
48759 }
48760
48761 if (CC == X86::COND_B || CC == X86::COND_AE) {
48762 // TESTC(X,~X) == TESTC(X,-1)
48763 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48764 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
48765 SDLoc DL(EFLAGS);
48766 return DAG.getNode(
48767 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
48768 DAG.getBitcast(OpVT,
48769 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
48770 }
48771 }
48772 // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X)
48773 if (EFLAGS.getOpcode() == X86ISD::PTEST &&
48774 ISD::isBuildVectorAllOnes(Op1.getNode())) {
48775 SDValue BC0 = peekThroughBitcasts(Op0);
48776 if (BC0.getOpcode() == X86ISD::PCMPEQ &&
48777 ISD::isBuildVectorAllZeros(BC0.getOperand(1).getNode())) {
48778 SDLoc DL(EFLAGS);
48779 CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE);
48780 SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0));
48781 return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X);
48782 }
48783 }
48784 }
48785
48786 if (CC == X86::COND_E || CC == X86::COND_NE) {
48787 // TESTZ(X,~Y) == TESTC(Y,X)
48788 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48789 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48790 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48791 DAG.getBitcast(OpVT, NotOp1), Op0);
48792 }
48793
48794 if (Op0 == Op1) {
48795 SDValue BC = peekThroughBitcasts(Op0);
48796 EVT BCVT = BC.getValueType();
48797
48798 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
48799 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
48800 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48801 DAG.getBitcast(OpVT, BC.getOperand(0)),
48802 DAG.getBitcast(OpVT, BC.getOperand(1)));
48803 }
48804
48805 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
48806 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
48807 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48808 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48809 DAG.getBitcast(OpVT, BC.getOperand(0)),
48810 DAG.getBitcast(OpVT, BC.getOperand(1)));
48811 }
48812
48813 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
48814 // to more efficiently extract the sign bits and compare that.
48815 // TODO: Handle TESTC with comparison inversion.
48816 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
48817 // TESTP/MOVMSK combines to make sure its never worse than PTEST?
48818 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
48819 unsigned EltBits = BCVT.getScalarSizeInBits();
48820 if (DAG.ComputeNumSignBits(BC) == EltBits) {
48821 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
48822 APInt SignMask = APInt::getSignMask(EltBits);
48823 if (SDValue Res =
48824 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
48825 // For vXi16 cases we need to use pmovmksb and extract every other
48826 // sign bit.
48827 SDLoc DL(EFLAGS);
48828 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
48829 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
48830 MVT FloatVT =
48831 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
48832 Res = DAG.getBitcast(FloatVT, Res);
48833 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
48834 } else if (EltBits == 16) {
48835 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
48836 Res = DAG.getBitcast(MovmskVT, Res);
48837 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48838 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
48839 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48840 } else {
48841 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48842 }
48843 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
48844 DAG.getConstant(0, DL, MVT::i32));
48845 }
48846 }
48847 }
48848 }
48849
48850 // TESTZ(-1,X) == TESTZ(X,X)
48851 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
48852 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
48853
48854 // TESTZ(X,-1) == TESTZ(X,X)
48855 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
48856 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
48857
48858 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
48859 // TODO: Add COND_NE handling?
48860 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
48861 SDValue Src0 = peekThroughBitcasts(Op0);
48862 SDValue Src1 = peekThroughBitcasts(Op1);
48863 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
48864 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
48865 peekThroughBitcasts(Src0.getOperand(1)), true);
48866 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
48867 peekThroughBitcasts(Src1.getOperand(1)), true);
48868 if (Src0 && Src1) {
48869 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
48870 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48871 DAG.getBitcast(OpVT2, Src0),
48872 DAG.getBitcast(OpVT2, Src1));
48873 }
48874 }
48875 }
48876 }
48877
48878 return SDValue();
48879}
48880
48881// Attempt to simplify the MOVMSK input based on the comparison type.
48882static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
48883 SelectionDAG &DAG,
48884 const X86Subtarget &Subtarget) {
48885 // Handle eq/ne against zero (any_of).
48886 // Handle eq/ne against -1 (all_of).
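// MOVMSK gathers one sign bit per element, so comparing the result against 0
// is an "is any sign bit set" test and comparing against the all-ones mask is
// an "are all sign bits set" test.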
48887 if (!(CC == X86::COND_E || CC == X86::COND_NE))
48888 return SDValue();
48889 if (EFLAGS.getValueType() != MVT::i32)
48890 return SDValue();
48891 unsigned CmpOpcode = EFLAGS.getOpcode();
48892 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
48893 return SDValue();
48894 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
48895 if (!CmpConstant)
48896 return SDValue();
48897 const APInt &CmpVal = CmpConstant->getAPIntValue();
48898
48899 SDValue CmpOp = EFLAGS.getOperand(0);
48900 unsigned CmpBits = CmpOp.getValueSizeInBits();
48901 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
48902
48903 // Peek through any truncate.
48904 if (CmpOp.getOpcode() == ISD::TRUNCATE)
48905 CmpOp = CmpOp.getOperand(0);
48906
48907 // Bail if we don't find a MOVMSK.
48908 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
48909 return SDValue();
48910
48911 SDValue Vec = CmpOp.getOperand(0);
48912 MVT VecVT = Vec.getSimpleValueType();
48913 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
48914 "Unexpected MOVMSK operand");
48915 unsigned NumElts = VecVT.getVectorNumElements();
48916 unsigned NumEltBits = VecVT.getScalarSizeInBits();
48917
48918 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
48919 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
48920 NumElts <= CmpBits && CmpVal.isMask(NumElts);
48921 if (!IsAnyOf && !IsAllOf)
48922 return SDValue();
48923
48924 // TODO: Check more combining cases.
48925 // We use the number of uses of the CMP operand to decide whether to combine.
48926 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" folds
48927 // below are restricted to the one-use case.
48928 bool IsOneUse = CmpOp.getNode()->hasOneUse();
48929
48930 // See if we can peek through to a vector with a wider element type, if the
48931 // signbits extend down to all the sub-elements as well.
48932 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
48933 // potential SimplifyDemandedBits/Elts cases.
48934 // If we looked through a truncate that discard bits, we can't do this
48935 // transform.
48936 // FIXME: We could do this transform for truncates that discarded bits by
48937 // inserting an AND mask between the new MOVMSK and the CMP.
48938 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
48939 SDValue BC = peekThroughBitcasts(Vec);
48940 MVT BCVT = BC.getSimpleValueType();
48941 unsigned BCNumElts = BCVT.getVectorNumElements();
48942 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
48943 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
48944 BCNumEltBits > NumEltBits &&
48945 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
48946 SDLoc DL(EFLAGS);
48947 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
48948 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48949 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
48950 DAG.getConstant(CmpMask, DL, MVT::i32));
48951 }
48952 }
48953
48954 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
48955 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
48956 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
48957 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
48958 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
48959 SmallVector<SDValue> Ops;
48960 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
48961 Ops.size() == 2) {
48962 SDLoc DL(EFLAGS);
48963 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
48964 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
48965 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
48966 DAG.getBitcast(SubVT, Ops[0]),
48967 DAG.getBitcast(SubVT, Ops[1]));
48968 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
48969 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48970 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
48971 DAG.getConstant(CmpMask, DL, MVT::i32));
48972 }
48973 }
48974
48975 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48976 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48977 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48978 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
48979 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
48980 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
48981 SDValue BC = peekThroughBitcasts(Vec);
48982 // Ensure MOVMSK was testing every signbit of BC.
48983 if (BC.getValueType().getVectorNumElements() <= NumElts) {
48984 if (BC.getOpcode() == X86ISD::PCMPEQ) {
48985 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
48986 BC.getOperand(0), BC.getOperand(1));
48987 V = DAG.getBitcast(TestVT, V);
48988 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48989 }
48990 // Check for 256-bit split vector cases.
48991 if (BC.getOpcode() == ISD::AND &&
48992 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
48993 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
48994 SDValue LHS = BC.getOperand(0);
48995 SDValue RHS = BC.getOperand(1);
48996 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
48997 LHS.getOperand(0), LHS.getOperand(1));
48998 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
48999 RHS.getOperand(0), RHS.getOperand(1));
49000 LHS = DAG.getBitcast(TestVT, LHS);
49001 RHS = DAG.getBitcast(TestVT, RHS);
49002 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
49003 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49004 }
49005 }
49006 }
49007
49008 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
49009 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
49010 // sign bits prior to the comparison with zero unless we know that
49011 // the vXi16 splats the sign bit down to the lower i8 half.
49012 // TODO: Handle all_of patterns.
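// Note: after bitcasting the vXi16 source to vXi8, the i16 sign bits live in
// the odd byte positions, which is why the PMOVMSKB result is masked with
// 0xAAAA/0xAAAAAAAA below unless the sign already extends into the low byte.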
49013 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
49014 SDValue VecOp0 = Vec.getOperand(0);
49015 SDValue VecOp1 = Vec.getOperand(1);
49016 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
49017 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
49018 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
49019 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
49020 SDLoc DL(EFLAGS);
49021 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
49022 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49023 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
49024 if (!SignExt0) {
49025 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
49026 DAG.getConstant(0xAAAA, DL, MVT::i16));
49027 }
49028 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49029 DAG.getConstant(0, DL, MVT::i16));
49030 }
49031 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
49032 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
49033 if (CmpBits >= 16 && Subtarget.hasInt256() &&
49034 (IsAnyOf || (SignExt0 && SignExt1))) {
49035 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
49036 SDLoc DL(EFLAGS);
49037 SDValue Result = peekThroughBitcasts(Src);
49038 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
49039 Result.getValueType().getVectorNumElements() <= NumElts) {
49040 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
49041 Result.getOperand(0), Result.getOperand(1));
49042 V = DAG.getBitcast(MVT::v4i64, V);
49043 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49044 }
49045 Result = DAG.getBitcast(MVT::v32i8, Result);
49046 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49047 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
49048 if (!SignExt0 || !SignExt1) {
49049 assert(IsAnyOf &&
49050 "Only perform v16i16 signmasks for any_of patterns");
49051 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
49052 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
49053 }
49054 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49055 DAG.getConstant(CmpMask, DL, MVT::i32));
49056 }
49057 }
49058 }
49059
49060 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
49061 // Since we peek through a bitcast, we need to be careful if the base vector
49062 // type has smaller elements than the MOVMSK type. In that case, even if
49063 // all the elements are demanded by the shuffle mask, only the "high"
49064 // elements which have highbits that align with highbits in the MOVMSK vec
49065 // elements are actually demanded. Any simplification of spurious operations
49066 // on the "low" elements takes place during other simplifications.
49067 //
49068 // For example:
49069 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))): even though all the elements are
49070 // demanded, the result can change because the shuffle swaps elements around.
49071 //
49072 // To address this, we check that we can scale the shuffle mask to MOVMSK
49073 // element width (this will ensure "high" elements match). It's slightly overly
49074 // conservative, but fine for an edge case fold.
49075 SmallVector<int, 32> ShuffleMask;
49076 SmallVector<SDValue, 2> ShuffleInputs;
49077 if (NumElts <= CmpBits &&
49078 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
49079 ShuffleMask, DAG) &&
49080 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
49081 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
49082 canScaleShuffleElements(ShuffleMask, NumElts)) {
49083 SDLoc DL(EFLAGS);
49084 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
49085 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49086 Result =
49087 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
49088 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
49089 }
49090
49091 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
49092 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
49093 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
49094 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
49095 // iff every element is referenced.
49096 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
49097 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
49098 (NumEltBits == 32 || NumEltBits == 64)) {
49099 SDLoc DL(EFLAGS);
49100 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
49101 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
49102 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
49103 SDValue LHS = Vec;
49104 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
49105 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
49106 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
49107 DAG.getBitcast(FloatVT, LHS),
49108 DAG.getBitcast(FloatVT, RHS));
49109 }
49110
49111 return SDValue();
49112}
49113
49114/// Optimize an EFLAGS definition used according to the condition code \p CC
49115/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
49116/// uses of chain values.
49117static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
49118 SelectionDAG &DAG,
49119 const X86Subtarget &Subtarget) {
49120 if (CC == X86::COND_B)
49121 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
49122 return Flags;
49123
49124 if (SDValue R = checkSignTestSetCCCombine(EFLAGS, CC, DAG))
49125 return R;
49126
49127 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
49128 return R;
49129
49130 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
49131 return R;
49132
49133 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
49134 return R;
49135
49136 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
49137}
49138
49139/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
49140static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
49141 TargetLowering::DAGCombinerInfo &DCI,
49142 const X86Subtarget &Subtarget) {
49143 SDLoc DL(N);
49144 EVT VT = N->getValueType(0);
49145 SDValue FalseOp = N->getOperand(0);
49146 SDValue TrueOp = N->getOperand(1);
49147 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
49148 SDValue Cond = N->getOperand(3);
49149
49150 // cmov X, X, ?, ? --> X
49151 if (TrueOp == FalseOp)
49152 return TrueOp;
49153
49154 // Try to simplify the EFLAGS and condition code operands.
49155 // We can't always do this as FCMOV only supports a subset of X86 cond.
49156 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
49157 if (!(FalseOp.getValueType() == MVT::f80 ||
49158 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
49159 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
49160 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
49161 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
49162 Flags};
49163 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49164 }
49165 }
49166
49167 // If this is a select between two integer constants, try to do some
49168 // optimizations. Note that the operands are ordered the opposite of SELECT
49169 // operands.
49170 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
49171 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
49172 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
49173 // larger than FalseC (the false value).
49174 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
49175 CC = X86::GetOppositeBranchCondition(CC);
49176 std::swap(TrueC, FalseC);
49177 std::swap(TrueOp, FalseOp);
49178 }
49179
49180 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
49181 // This is efficient for any integer data type (including i8/i16) and
49182 // shift amount.
49183 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
49184 Cond = getSETCC(CC, Cond, DL, DAG);
49185
49186 // Zero extend the condition if needed.
49187 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
49188
49189 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
49190 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
49191 DAG.getConstant(ShAmt, DL, MVT::i8));
49192 return Cond;
49193 }
49194
49195 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
49196 // for any integer data type, including i8/i16.
49197 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
49198 Cond = getSETCC(CC, Cond, DL, DAG);
49199
49200 // Zero extend the condition if needed.
49201 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
49202 FalseC->getValueType(0), Cond);
49203 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49204 SDValue(FalseC, 0));
49205 return Cond;
49206 }
49207
49208 // Optimize cases that will turn into an LEA instruction. This requires
49209 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
49210 if (VT == MVT::i32 || VT == MVT::i64) {
49211 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
49212 assert(Diff.getBitWidth() == VT.getSizeInBits() &&
49213 "Implicit constant truncation");
49214
49215 bool isFastMultiplier = false;
49216 if (Diff.ult(10)) {
49217 switch (Diff.getZExtValue()) {
49218 default: break;
49219 case 1: // result = add base, cond
49220 case 2: // result = lea base( , cond*2)
49221 case 3: // result = lea base(cond, cond*2)
49222 case 4: // result = lea base( , cond*4)
49223 case 5: // result = lea base(cond, cond*4)
49224 case 8: // result = lea base( , cond*8)
49225 case 9: // result = lea base(cond, cond*8)
49226 isFastMultiplier = true;
49227 break;
49228 }
49229 }
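// e.g. TrueC = 9, FalseC = 4: Diff = 5, so the result becomes
// zext(setcc) * 5 + 4, which instruction selection can fold into a single
// LEA of the form lea 4(cond, cond*4).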
49230
49231 if (isFastMultiplier) {
49232 Cond = getSETCC(CC, Cond, DL ,DAG);
49233 // Zero extend the condition if needed.
49234 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
49235 Cond);
49236 // Scale the condition by the difference.
49237 if (Diff != 1)
49238 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
49239 DAG.getConstant(Diff, DL, Cond.getValueType()));
49240
49241 // Add the base if non-zero.
49242 if (FalseC->getAPIntValue() != 0)
49243 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49244 SDValue(FalseC, 0));
49245 return Cond;
49246 }
49247 }
49248 }
49249 }
49250
49251 // Handle these cases:
49252 // (select (x != c), e, c) -> (select (x != c), e, x),
49253 // (select (x == c), c, e) -> (select (x == c), x, e)
49254 // where c is an integer constant, and the "select" is the combination
49255 // of CMOV and CMP.
49256 //
49257 // The rationale for this change is that the conditional-move from a constant
49258 // needs two instructions; however, a conditional-move from a register needs
49259 // only one instruction.
49260 //
49261 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
49262 // some instruction-combining opportunities. This opt needs to be
49263 // postponed as late as possible.
49264 //
49265 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
49266 // the DCI.xxxx conditions are provided to postpone the optimization as
49267 // late as possible.
49268
49269 ConstantSDNode *CmpAgainst = nullptr;
49270 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
49271 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
49272 !isa<ConstantSDNode>(Cond.getOperand(0))) {
49273
49274 if (CC == X86::COND_NE &&
49275 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
49276 CC = X86::COND_E;
49277 std::swap(TrueOp, FalseOp);
49278 }
49279
49280 if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
49281 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
49282 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
49283 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49284 }
49285 }
49286 }
49287
49288 // Transform:
49289 //
49290 // (cmov 1 T (uge T 2))
49291 //
49292 // to:
49293 //
49294 // (adc T 0 (sub T 1))
49295 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
49296 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
49297 SDValue Cond0 = Cond.getOperand(0);
49298 if (Cond0.getOpcode() == ISD::TRUNCATE)
49299 Cond0 = Cond0.getOperand(0);
49300 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
49301 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
49302 EVT CondVT = Cond->getValueType(0);
49303 // Subtract 1 and generate a carry.
49304 SDValue NewSub =
49305 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
49306 DAG.getConstant(1, DL, CondVT));
49307 SDValue EFLAGS(NewSub.getNode(), 1);
49308 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), TrueOp,
49309 DAG.getConstant(0, DL, VT), EFLAGS);
49310 }
49311 }
49312
49313 // Fold and/or of setcc's to double CMOV:
49314 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
49315 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
49316 //
49317 // This combine lets us generate:
49318 // cmovcc1 (jcc1 if we don't have CMOV)
49319 // cmovcc2 (same)
49320 // instead of:
49321 // setcc1
49322 // setcc2
49323 // and/or
49324 // cmovne (jne if we don't have CMOV)
49325 // When we can't use the CMOV instruction, it might increase branch
49326 // mispredicts.
49327 // When we can use CMOV, or when there is no mispredict, this improves
49328 // throughput and reduces register pressure.
49329 //
49330 if (CC == X86::COND_NE) {
49331 SDValue Flags;
49332 X86::CondCode CC0, CC1;
49333 bool isAndSetCC;
49334 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
49335 if (isAndSetCC) {
49336 std::swap(FalseOp, TrueOp);
49337 CC0 = X86::GetOppositeBranchCondition(CC0);
49338 CC1 = X86::GetOppositeBranchCondition(CC1);
49339 }
49340
49341 SDValue LOps[] = {FalseOp, TrueOp,
49342 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
49343 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, VT, LOps);
49344 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
49345 Flags};
49346 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49347 return CMOV;
49348 }
49349 }
49350
49351 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
49352 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
49353 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
49354 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
49355 // Or (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X)
49356 // TODO: Or (CMOV (BSF ?, X), Y, (X == 0)) -> (BSF Y, X)
49357 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
49358 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
49359 SDValue Add = TrueOp;
49360 SDValue Const = FalseOp;
49361 // Canonicalize the condition code for easier matching and output.
49362 if (CC == X86::COND_E)
49363 std::swap(Add, Const);
49364
49365 // TODO: ADD BSF support, but requires changes to the "REP BSF" CTTZ hack.
49366 if (Subtarget.hasBitScanPassThrough() && Add.getOpcode() == X86ISD::BSR &&
49367 Add.getResNo() == 0 && Add.hasOneUse() &&
49368 Add.getOperand(1) == Cond.getOperand(0)) {
49369 return DAG.getNode(Add.getOpcode(), DL, Add->getVTList(), Const,
49370 Add.getOperand(1));
49371 }
49372
49373 // We might have replaced the constant in the cmov with the LHS of the
49374 // compare. If so change it to the RHS of the compare.
49375 if (Const == Cond.getOperand(0))
49376 Const = Cond.getOperand(1);
49377
49378 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
49379 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
49380 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
49381 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
49382 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
49383 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
49384 // This should constant fold.
49385 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
49386 SDValue CMov =
49387 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
49388 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
49389 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
49390 }
49391 }
49392
49393 return SDValue();
49394}
49395
49396/// Different mul shrinking modes.
49397 enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
49398
49399 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
49400 EVT VT = N->getOperand(0).getValueType();
49401 if (VT.getScalarSizeInBits() != 32)
49402 return false;
49403
49404 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
49405 unsigned SignBits[2] = {1, 1};
49406 bool IsPositive[2] = {false, false};
49407 for (unsigned i = 0; i < 2; i++) {
49408 SDValue Opd = N->getOperand(i);
49409
49410 SignBits[i] = DAG.ComputeNumSignBits(Opd);
49411 IsPositive[i] = DAG.SignBitIsZero(Opd);
49412 }
49413
49414 bool AllPositive = IsPositive[0] && IsPositive[1];
49415 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
49416 // When ranges are from -128 ~ 127, use MULS8 mode.
49417 if (MinSignBits >= 25)
49418 Mode = ShrinkMode::MULS8;
49419 // When ranges are from 0 ~ 255, use MULU8 mode.
49420 else if (AllPositive && MinSignBits >= 24)
49421 Mode = ShrinkMode::MULU8;
49422 // When ranges are from -32768 ~ 32767, use MULS16 mode.
49423 else if (MinSignBits >= 17)
49424 Mode = ShrinkMode::MULS16;
49425 // When ranges are from 0 ~ 65535, use MULU16 mode.
49426 else if (AllPositive && MinSignBits >= 16)
49427 Mode = ShrinkMode::MULU16;
49428 else
49429 return false;
49430 return true;
49431}
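// e.g. for i32 elements, ComputeNumSignBits >= 25 leaves at most 8 significant
// bits (sign included), i.e. values in [-128, 127], so a 16-bit multiply of
// the truncated values is exact (MULS8); the other thresholds follow the same
// reasoning for the unsigned and 16-bit cases.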
49432
49433/// When the operands of vector mul are extended from smaller size values,
49434 /// like i8 and i16, the type of mul may be shrunk to generate more
49435/// efficient code. Two typical patterns are handled:
49436/// Pattern1:
49437/// %2 = sext/zext <N x i8> %1 to <N x i32>
49438/// %4 = sext/zext <N x i8> %3 to <N x i32>
49439 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49440/// %5 = mul <N x i32> %2, %4
49441///
49442/// Pattern2:
49443/// %2 = zext/sext <N x i16> %1 to <N x i32>
49444/// %4 = zext/sext <N x i16> %3 to <N x i32>
49445/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49446/// %5 = mul <N x i32> %2, %4
49447///
49448/// There are four mul shrinking modes:
49449/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
49450 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
49451/// generate pmullw+sext32 for it (MULS8 mode).
49452/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
49453/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
49454/// generate pmullw+zext32 for it (MULU8 mode).
49455/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
49456/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
49457/// generate pmullw+pmulhw for it (MULS16 mode).
49458/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
49459/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
49460/// generate pmullw+pmulhuw for it (MULU16 mode).
49461 static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49462 const X86Subtarget &Subtarget) {
49463 // Check for legality
49464 // pmullw/pmulhw are not supported by SSE.
49465 if (!Subtarget.hasSSE2())
49466 return SDValue();
49467
49468 // Check for profitability
49469 // pmulld is supported since SSE41. It is better to use pmulld
49470 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
49471 // the expansion.
49472 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
49473 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
49474 return SDValue();
49475
49476 ShrinkMode Mode;
49477 if (!canReduceVMulWidth(N, DAG, Mode))
49478 return SDValue();
49479
49480 SDValue N0 = N->getOperand(0);
49481 SDValue N1 = N->getOperand(1);
49482 EVT VT = N->getOperand(0).getValueType();
49483 unsigned NumElts = VT.getVectorNumElements();
49484 if ((NumElts % 2) != 0)
49485 return SDValue();
49486
49487 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
49488
49489 // Shrink the operands of mul.
49490 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
49491 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
49492
49493 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
49494 // lower part is needed.
49495 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
49496 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
49497 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
49498 : ISD::SIGN_EXTEND,
49499 DL, VT, MulLo);
49500
49501 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
49502 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
49503 // the higher part is also needed.
49504 SDValue MulHi =
49505 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
49506 ReducedVT, NewN0, NewN1);
49507
49508 // Repack the lower part and higher part result of mul into a wider
49509 // result.
49510 // Generate shuffle functioning as punpcklwd.
49511 SmallVector<int, 16> ShuffleMask(NumElts);
49512 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49513 ShuffleMask[2 * i] = i;
49514 ShuffleMask[2 * i + 1] = i + NumElts;
49515 }
49516 SDValue ResLo =
49517 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49518 ResLo = DAG.getBitcast(ResVT, ResLo);
49519 // Generate shuffle functioning as punpckhwd.
49520 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49521 ShuffleMask[2 * i] = i + NumElts / 2;
49522 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
49523 }
49524 SDValue ResHi =
49525 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49526 ResHi = DAG.getBitcast(ResVT, ResHi);
49527 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
49528}
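// e.g. a v8i32 mul whose operands are zero-extended from v8i16: pmullw
// produces the low 16 bits and pmulhuw the high 16 bits of each product, and
// the two interleaving shuffles (punpcklwd/punpckhwd) stitch them back into
// the v4i32 halves that are concatenated above.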
49529
49530 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
49531 EVT VT, const SDLoc &DL) {
49532
49533 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
49534 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49535 DAG.getConstant(Mult, DL, VT));
49536 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
49537 DAG.getConstant(Shift, DL, MVT::i8));
49538 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49539 N->getOperand(0));
49540 return Result;
49541 };
49542
49543 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
49544 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49545 DAG.getConstant(Mul1, DL, VT));
49546 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
49547 DAG.getConstant(Mul2, DL, VT));
49548 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49549 N->getOperand(0));
49550 return Result;
49551 };
49552
49553 switch (MulAmt) {
49554 default:
49555 break;
49556 case 11:
49557 // mul x, 11 => add ((shl (mul x, 5), 1), x)
49558 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
49559 case 21:
49560 // mul x, 21 => add ((shl (mul x, 5), 2), x)
49561 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
49562 case 41:
49563 // mul x, 41 => add ((shl (mul x, 5), 3), x)
49564 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
49565 case 22:
49566 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
49567 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49568 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
49569 case 19:
49570 // mul x, 19 => add ((shl (mul x, 9), 1), x)
49571 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
49572 case 37:
49573 // mul x, 37 => add ((shl (mul x, 9), 2), x)
49574 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
49575 case 73:
49576 // mul x, 73 => add ((shl (mul x, 9), 3), x)
49577 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
49578 case 13:
49579 // mul x, 13 => add ((shl (mul x, 3), 2), x)
49580 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
49581 case 23:
49582 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
49583 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
49584 case 26:
49585 // mul x, 26 => add ((mul (mul x, 5), 5), x)
49586 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
49587 case 28:
49588 // mul x, 28 => add ((mul (mul x, 9), 3), x)
49589 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
49590 case 29:
49591 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
49592 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49593 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
49594 }
49595
49596 // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
49597 // followed by a single LEA.
49598 // First check that this is a sum of two powers of 2, since that case is easy.
49599 // Then count the trailing zeros to get the shift amount of the smaller power of 2.
49600 // TODO: We can do this even without LEA at a cost of two shifts and an add.
49601 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
49602 unsigned ScaleShift = llvm::countr_zero(MulAmt);
49603 if (ScaleShift >= 1 && ScaleShift < 4) {
49604 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
49605 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49606 DAG.getConstant(ShiftAmt, DL, MVT::i8));
49607 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49608 DAG.getConstant(ScaleShift, DL, MVT::i8));
49609 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
49610 }
49611 }
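// e.g. MulAmt = 20 = 16 + 4: ScaleShift = 2 and ShiftAmt = 4, giving
// (x << 4) + (x << 2); the scaled term can later fold into an LEA index
// (scale 2/4/8), which is why ScaleShift is restricted to 1..3.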
49612
49613 return SDValue();
49614}
49615
49616 // If the upper 17 bits of either element are zero and the upper bits of the
49617 // other element are all zero/sign bits, then we can use PMADDWD, which is
49618 // always at least as quick as PMULLD, except on KNL.
49619 static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
49620 SelectionDAG &DAG,
49621 const X86Subtarget &Subtarget) {
49622 if (!Subtarget.hasSSE2())
49623 return SDValue();
49624
49625 if (Subtarget.isPMADDWDSlow())
49626 return SDValue();
49627
49628 EVT VT = N->getValueType(0);
49629
49630 // Only support vXi32 vectors.
49631 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
49632 return SDValue();
49633
49634 // Make sure the type is legal or can split/widen to a legal type.
49635 // With AVX512 but without BWI, we would need to split v32i16.
49636 unsigned NumElts = VT.getVectorNumElements();
49637 if (NumElts == 1 || !isPowerOf2_32(NumElts))
49638 return SDValue();
49639
49640 // With AVX512 but without BWI, we would need to split v32i16.
49641 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
49642 return SDValue();
49643
49644 SDValue N0 = N->getOperand(0);
49645 SDValue N1 = N->getOperand(1);
49646
49647 // If we are zero/sign extending two steps without SSE4.1, it's better to
49648 // reduce the vmul width instead.
49649 if (!Subtarget.hasSSE41() &&
49650 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
49651 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49652 (N1.getOpcode() == ISD::ZERO_EXTEND &&
49653 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
49654 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
49655 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49656 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49657 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
49658 return SDValue();
49659
49660 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
49661 // the vmul width instead.
49662 if (!Subtarget.hasSSE41() &&
49663 (N0.getOpcode() == ISD::SIGN_EXTEND &&
49664 N0.getOperand(0).getValueSizeInBits() > 128) &&
49665 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49666 N1.getOperand(0).getValueSizeInBits() > 128))
49667 return SDValue();
49668
49669 // Sign bits must extend down to the lowest i16.
49670 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
49671 DAG.ComputeMaxSignificantBits(N0) > 16)
49672 return SDValue();
49673
49674 // At least one of the elements must be zero in the upper 17 bits, or can be
49675 // safely made zero without altering the final result.
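// Roughly: VPMADDWD computes lo16(A)*lo16(B) + hi16(A)*hi16(B) per i32 lane.
// If, say, A's upper 17 bits are zero then hi16(A) == 0 and lo16(A) is a
// non-negative i16, so the sum collapses to exactly the low 32 bits of A*B
// (B's significant bits already fit in 16 bits, checked above).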
49676 auto GetZeroableOp = [&](SDValue Op) {
49677 APInt Mask17 = APInt::getHighBitsSet(32, 17);
49678 if (DAG.MaskedValueIsZero(Op, Mask17))
49679 return Op;
49680 // Mask off upper 16-bits of sign-extended constants.
49681 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
49682 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
49683 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
49684 SDValue Src = Op.getOperand(0);
49685 // Convert sext(vXi16) to zext(vXi16).
49686 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
49687 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49688 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
49689 // which will expand the extension.
49690 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
49691 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
49692 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
49693 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49694 }
49695 }
49696 // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
49697 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
49698 N->isOnlyUserOf(Op.getNode())) {
49699 SDValue Src = Op.getOperand(0);
49700 if (Src.getScalarValueSizeInBits() == 16)
49701 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
49702 }
49703 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
49704 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
49705 N->isOnlyUserOf(Op.getNode())) {
49706 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
49707 Op.getOperand(1));
49708 }
49709 return SDValue();
49710 };
49711 SDValue ZeroN0 = GetZeroableOp(N0);
49712 SDValue ZeroN1 = GetZeroableOp(N1);
49713 if (!ZeroN0 && !ZeroN1)
49714 return SDValue();
49715 N0 = ZeroN0 ? ZeroN0 : N0;
49716 N1 = ZeroN1 ? ZeroN1 : N1;
49717
49718 // Use SplitOpsAndApply to handle AVX splitting.
49719 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49720 ArrayRef<SDValue> Ops) {
49721 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
49722 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
49723 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
49724 DAG.getBitcast(OpVT, Ops[0]),
49725 DAG.getBitcast(OpVT, Ops[1]));
49726 };
49727 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
49728}
49729
49730 static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49731 const X86Subtarget &Subtarget) {
49732 if (!Subtarget.hasSSE2())
49733 return SDValue();
49734
49735 EVT VT = N->getValueType(0);
49736
49737 // Only support vXi64 vectors.
49738 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
49739 VT.getVectorNumElements() < 2 ||
49740 !isPowerOf2_32(VT.getVectorNumElements()))
49741 return SDValue();
49742
49743 SDValue N0 = N->getOperand(0);
49744 SDValue N1 = N->getOperand(1);
49745
49746 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
49747 // 32 bits. We can lower with this if the sign bits stretch that far.
49748 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
49749 DAG.ComputeNumSignBits(N1) > 32) {
49750 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49751 ArrayRef<SDValue> Ops) {
49752 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
49753 };
49754 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
49755 /*CheckBWI*/ false);
49756 }
49757
49758 // If the upper bits are zero we can use a single pmuludq.
49759 APInt Mask = APInt::getHighBitsSet(64, 32);
49760 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
49761 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49762 ArrayRef<SDValue> Ops) {
49763 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
49764 };
49765 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
49766 /*CheckBWI*/ false);
49767 }
49768
49769 return SDValue();
49770}
49771
49772 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
49773 TargetLowering::DAGCombinerInfo &DCI,
49774 const X86Subtarget &Subtarget) {
49775 EVT VT = N->getValueType(0);
49776 SDLoc DL(N);
49777
49778 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
49779 return V;
49780
49781 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
49782 return V;
49783
49784 if (DCI.isBeforeLegalize() && VT.isVector())
49785 return reduceVMULWidth(N, DL, DAG, Subtarget);
49786
49787 if (VT != MVT::i64 && VT != MVT::i32 &&
49788 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
49789 return SDValue();
49790
49791 KnownBits Known1 = DAG.computeKnownBits(N->getOperand(1));
49792 if (!Known1.isConstant())
49793 return SDValue();
49794
49795 const APInt &C = Known1.getConstant();
49796 if (C.isZero())
49797 return DAG.getConstant(0, DL, VT);
49798
49799 if (C.isAllOnes())
49800 return DAG.getNegative(N->getOperand(0), DL, VT);
49801
49802 if (isPowerOf2_64(C.getZExtValue()))
49803 return SDValue();
49804
49805 // Optimize a single multiply with constant into two operations in order to
49806 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
49807 if (!MulConstantOptimization)
49808 return SDValue();
49809
49810 // An imul is usually smaller than the alternative sequence.
49811 if (DAG.getMachineFunction().getFunction().hasMinSize())
49812 return SDValue();
49813
49814 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
49815 return SDValue();
49816
49817 int64_t SignMulAmt = C.getSExtValue();
49818 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
49819 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
49820
49821 SDValue NewMul = SDValue();
49822 if (VT == MVT::i64 || VT == MVT::i32) {
49823 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
49824 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49825 DAG.getConstant(AbsMulAmt, DL, VT));
49826 if (SignMulAmt < 0)
49827 NewMul = DAG.getNegative(NewMul, DL, VT);
49828
49829 return NewMul;
49830 }
49831
49832 uint64_t MulAmt1 = 0;
49833 uint64_t MulAmt2 = 0;
49834 if ((AbsMulAmt % 9) == 0) {
49835 MulAmt1 = 9;
49836 MulAmt2 = AbsMulAmt / 9;
49837 } else if ((AbsMulAmt % 5) == 0) {
49838 MulAmt1 = 5;
49839 MulAmt2 = AbsMulAmt / 5;
49840 } else if ((AbsMulAmt % 3) == 0) {
49841 MulAmt1 = 3;
49842 MulAmt2 = AbsMulAmt / 3;
49843 }
49844
49845 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
49846 if (MulAmt2 &&
49847 (isPowerOf2_64(MulAmt2) ||
49848 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
49849
49850 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
49851 N->user_begin()->getOpcode() == ISD::ADD))
49852 // If the second multiplier is a power of 2, issue it first. We want the multiply
49853 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
49854 // use is an add. Only do this for positive multiply amounts since the
49855 // negate would prevent it from being used as an address mode anyway.
49856 std::swap(MulAmt1, MulAmt2);
49857
49858 if (isPowerOf2_64(MulAmt1))
49859 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49860 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
49861 else
49862 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49863 DAG.getConstant(MulAmt1, DL, VT));
49864
49865 if (isPowerOf2_64(MulAmt2))
49866 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
49867 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
49868 else
49869 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
49870 DAG.getConstant(MulAmt2, DL, VT));
49871
49872 // Negate the result.
49873 if (SignMulAmt < 0)
49874 NewMul = DAG.getNegative(NewMul, DL, VT);
49875 } else if (!Subtarget.slowLEA())
49876 NewMul = combineMulSpecial(C.getZExtValue(), N, DAG, VT, DL);
49877 }
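// e.g. AbsMulAmt = 45 = 9 * 5 lowers to two LEAs, while 40 = 5 * 8 issues the
// shift first so the remaining *5 LEA can still fold into an addressing mode.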
49878 if (!NewMul) {
49879 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
49880 if (isPowerOf2_64(AbsMulAmt - 1)) {
49881 // (mul x, 2^N + 1) => (add (shl x, N), x)
49882 NewMul = DAG.getNode(
49883 ISD::ADD, DL, VT, N->getOperand(0),
49884 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49885 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
49886 if (SignMulAmt < 0)
49887 NewMul = DAG.getNegative(NewMul, DL, VT);
49888 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
49889 // (mul x, 2^N - 1) => (sub (shl x, N), x)
49890 NewMul =
49891 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49892 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
49893 // To negate, reverse the operands of the subtract.
49894 if (SignMulAmt < 0)
49895 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
49896 else
49897 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
49898 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
49899 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49900 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
49901 NewMul =
49902 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49903 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
49904 NewMul = DAG.getNode(
49905 ISD::ADD, DL, VT, NewMul,
49906 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49907 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
49908 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49909 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
49910 NewMul =
49911 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49912 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
49913 NewMul = DAG.getNode(
49914 ISD::SUB, DL, VT, NewMul,
49915 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49916 } else if (SignMulAmt >= 0 && VT.isVector() &&
49917 Subtarget.fastImmVectorShift()) {
49918 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
49919 uint64_t ShiftAmt1;
49920 std::optional<unsigned> Opc;
49921 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
49922 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
49923 Opc = ISD::ADD;
49924 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
49925 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
49926 Opc = ISD::SUB;
49927 }
49928
49929 if (Opc) {
49930 SDValue Shift1 =
49931 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49932 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
49933 SDValue Shift2 =
49934 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49935 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
49936 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
49937 }
49938 }
49939 }
49940
49941 return NewMul;
49942}
49943
49944// Try to form a MULHU or MULHS node by looking for
49945// (srl (mul ext, ext), 16)
49946// TODO: This is X86 specific because we want to be able to handle wide types
49947// before type legalization. But we can only do it if the vector will be
49948// legalized via widening/splitting. Type legalization can't handle promotion
49949// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49950// combiner.
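// e.g. (srl (mul (zext v8i16:a to v8i32), (zext v8i16:b to v8i32)), 16)
//  --> (zext (mulhu v8i16:a, v8i16:b) to v8i32)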
49951 static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
49952 const SDLoc &DL,
49953 const X86Subtarget &Subtarget) {
49954 using namespace SDPatternMatch;
49955 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
49956 "SRL or SRA node is required here!");
49957
49958 if (!Subtarget.hasSSE2())
49959 return SDValue();
49960
49961 // Input type should be at least vXi32.
49962 EVT VT = N->getValueType(0);
49963 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
49964 return SDValue();
49965
49966 // The operation must be a multiply shifted right by 16.
49967 SDValue LHS, RHS;
49968 if (!sd_match(N->getOperand(1), m_SpecificInt(16)) ||
49969 !sd_match(N->getOperand(0), m_OneUse(m_Mul(m_Value(LHS), m_Value(RHS)))))
49970 return SDValue();
49971
49972 unsigned ExtOpc = LHS.getOpcode();
49973 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
49974 RHS.getOpcode() != ExtOpc)
49975 return SDValue();
49976
49977 // Peek through the extends.
49978 LHS = LHS.getOperand(0);
49979 RHS = RHS.getOperand(0);
49980
49981 // Ensure the input types match.
49982 EVT MulVT = LHS.getValueType();
49983 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
49984 return SDValue();
49985
49986 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
49987 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
49988
49989 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49990 return DAG.getNode(ExtOpc, DL, VT, Mulh);
49991}
49992
49993 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
49994 const X86Subtarget &Subtarget) {
49995 using namespace llvm::SDPatternMatch;
49996 SDValue N0 = N->getOperand(0);
49997 SDValue N1 = N->getOperand(1);
49998 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
49999 EVT VT = N0.getValueType();
50000 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50001 SDLoc DL(N);
50002
50003 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50004 // with out-of-bounds clamping.
50005 if (N0.getOpcode() == ISD::VSELECT &&
50006 supportedVectorVarShift(VT, Subtarget, ISD::SHL)) {
50007 SDValue Cond = N0.getOperand(0);
50008 SDValue N00 = N0.getOperand(1);
50009 SDValue N01 = N0.getOperand(2);
50010 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
50011 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
50012 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50013 m_SpecificCondCode(ISD::SETULT)))) {
50014 return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
50015 }
50016 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
50017 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
50018 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50019 m_SpecificCondCode(ISD::SETUGE)))) {
50020 return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1);
50021 }
50022 }
50023
50024 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
50025 // since the result of setcc_c is all zero's or all ones.
50026 if (VT.isInteger() && !VT.isVector() &&
50027 N1C && N0.getOpcode() == ISD::AND &&
50028 N0.getOperand(1).getOpcode() == ISD::Constant) {
50029 SDValue N00 = N0.getOperand(0);
50030 APInt Mask = N0.getConstantOperandAPInt(1);
50031 Mask <<= N1C->getAPIntValue();
50032 bool MaskOK = false;
50033 // We can handle cases concerning bit-widening nodes containing setcc_c if
50034 // we carefully interrogate the mask to make sure we are semantics
50035 // preserving.
50036 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
50037 // of the underlying setcc_c operation if the setcc_c was zero extended.
50038 // Consider the following example:
50039 // zext(setcc_c) -> i32 0x0000FFFF
50040 // c1 -> i32 0x0000FFFF
50041 // c2 -> i32 0x00000001
50042 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
50043 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
50044 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
50045 MaskOK = true;
50046 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
50047 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50048 MaskOK = true;
50049 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
50050 N00.getOpcode() == ISD::ANY_EXTEND) &&
50051 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50052 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
50053 }
50054 if (MaskOK && Mask != 0)
50055 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
50056 }
50057
50058 return SDValue();
50059}
50060
50061 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
50062 const X86Subtarget &Subtarget) {
50063 using namespace llvm::SDPatternMatch;
50064 SDValue N0 = N->getOperand(0);
50065 SDValue N1 = N->getOperand(1);
50066 EVT VT = N0.getValueType();
50067 unsigned Size = VT.getSizeInBits();
50068 SDLoc DL(N);
50069
50070 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50071 return V;
50072
50073 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
50074 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA)) {
50075 SDValue ShrAmtVal;
50076 if (sd_match(N1, m_UMin(m_Value(ShrAmtVal),
50077 m_SpecificInt(VT.getScalarSizeInBits() - 1))))
50078 return DAG.getNode(X86ISD::VSRAV, DL, VT, N0, ShrAmtVal);
50079 }
50080
50081 // fold (SRA (SHL X, ShlConst), SraConst)
50082 // into (SHL (sext_in_reg X), ShlConst - SraConst)
50083 // or (sext_in_reg X)
50084 // or (SRA (sext_in_reg X), SraConst - ShlConst)
50085 // depending on relation between SraConst and ShlConst.
50086 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
50087 // us to do the sext_in_reg from the corresponding bit.
50088
50089 // sexts on X86 are MOVSX instructions. They have the same code size as the
50090 // shifts above (only a shift by 1 has smaller code size).
50091 // However, the MOVs have two advantages over a shift:
50092 // 1. MOVs can write to a register that differs from the source.
50093 // 2. MOVs accept memory operands.
50094
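// e.g. with Size == 32, (sra (shl X, 24), 24) becomes (sext_in_reg X, i8) and
// (sra (shl X, 24), 28) becomes (sra (sext_in_reg X, i8), 4).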
50095 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
50096 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
50097 N0.getOperand(1).getOpcode() != ISD::Constant)
50098 return SDValue();
50099
50100 SDValue N00 = N0.getOperand(0);
50101 SDValue N01 = N0.getOperand(1);
50102 APInt ShlConst = N01->getAsAPIntVal();
50103 APInt SraConst = N1->getAsAPIntVal();
50104 EVT CVT = N1.getValueType();
50105
50106 if (CVT != N01.getValueType())
50107 return SDValue();
50108 if (SraConst.isNegative())
50109 return SDValue();
50110
50111 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
50112 unsigned ShiftSize = SVT.getSizeInBits();
50113 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
50114 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
50115 continue;
50116 SDValue NN =
50117 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
50118 if (SraConst.eq(ShlConst))
50119 return NN;
50120 if (SraConst.ult(ShlConst))
50121 return DAG.getNode(ISD::SHL, DL, VT, NN,
50122 DAG.getConstant(ShlConst - SraConst, DL, CVT));
50123 return DAG.getNode(ISD::SRA, DL, VT, NN,
50124 DAG.getConstant(SraConst - ShlConst, DL, CVT));
50125 }
50126 return SDValue();
50127}
50128
50129 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
50130 TargetLowering::DAGCombinerInfo &DCI,
50131 const X86Subtarget &Subtarget) {
50132 using namespace llvm::SDPatternMatch;
50133 SDValue N0 = N->getOperand(0);
50134 SDValue N1 = N->getOperand(1);
50135 EVT VT = N0.getValueType();
50136 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50137 SDLoc DL(N);
50138
50139 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50140 return V;
50141
50142 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50143 // with out-of-bounds clamping.
50144 if (N0.getOpcode() == ISD::VSELECT &&
50145 supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
50146 SDValue Cond = N0.getOperand(0);
50147 SDValue N00 = N0.getOperand(1);
50148 SDValue N01 = N0.getOperand(2);
50149 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
50150 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
50151 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50152 m_SpecificCondCode(ISD::SETULT)))) {
50153 return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
50154 }
50155 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
50156 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
50157 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50158 m_SpecificCondCode(ISD::SETUGE)))) {
50159 return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1);
50160 }
50161 }
50162
50163 // Only do this on the last DAG combine as it can interfere with other
50164 // combines.
50165 if (!DCI.isAfterLegalizeDAG())
50166 return SDValue();
50167
50168 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
50169 // TODO: This is a generic DAG combine that became an x86-only combine to
50170 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
50171 // and-not ('andn').
50172 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
50173 return SDValue();
50174
50175 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
50176 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
50177 if (!ShiftC || !AndC)
50178 return SDValue();
50179
50180 // If we can shrink the constant mask below 8-bits or 32-bits, then this
50181 // transform should reduce code size. It may also enable secondary transforms
50182 // from improved known-bits analysis or instruction selection.
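// e.g. (srl (and X, 0x7F00), 8) --> (and (srl X, 8), 0x7F), shrinking the mask
// to an 8-bit immediate.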
50183 APInt MaskVal = AndC->getAPIntValue();
50184
50185 // If this can be matched by a zero extend, don't optimize.
50186 if (MaskVal.isMask()) {
50187 unsigned TO = MaskVal.countr_one();
50188 if (TO >= 8 && isPowerOf2_32(TO))
50189 return SDValue();
50190 }
50191
50192 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
50193 unsigned OldMaskSize = MaskVal.getSignificantBits();
50194 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
50195 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
50196 (OldMaskSize > 32 && NewMaskSize <= 32)) {
50197 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
50198 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
50199 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
50200 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
50201 }
50202 return SDValue();
50203}
50204
50205 static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
50206 const X86Subtarget &Subtarget) {
50207 unsigned Opcode = N->getOpcode();
50208 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
50209
50210 SDLoc DL(N);
50211 EVT VT = N->getValueType(0);
50212 SDValue N0 = N->getOperand(0);
50213 SDValue N1 = N->getOperand(1);
50214 EVT SrcVT = N0.getValueType();
50215
50216 SDValue BC0 =
50217 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
50218 SDValue BC1 =
50219 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
50220
50221 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
50222 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
50223 // truncation trees that help us avoid lane crossing shuffles.
50224 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
50225 // TODO: We don't handle vXf64 shuffles yet.
50226 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50227 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
50228 SmallVector<SDValue> ShuffleOps;
50229 SmallVector<int> ShuffleMask, ScaledMask;
50230 SDValue Vec = peekThroughBitcasts(BCSrc);
50231 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
50232 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
50233 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
50234 // shuffle to a v4X64 width - we can probably relax this in the future.
50235 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
50236 ShuffleOps[0].getValueType().is256BitVector() &&
50237 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
50238 SDValue Lo, Hi;
50239 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50240 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
50241 Lo = DAG.getBitcast(SrcVT, Lo);
50242 Hi = DAG.getBitcast(SrcVT, Hi);
50243 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
50244 Res = DAG.getBitcast(ShufVT, Res);
50245 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
50246 return DAG.getBitcast(VT, Res);
50247 }
50248 }
50249 }
50250 }
50251
50252 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
50253 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50254 // If either/both ops are a shuffle that can scale to v2x64,
50255 // then see if we can perform this as a v4x32 post shuffle.
50256 SmallVector<SDValue> Ops0, Ops1;
50257 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
50258 bool IsShuf0 =
50259 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50260 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50261 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50262 bool IsShuf1 =
50263 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50264 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
50265 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50266 if (IsShuf0 || IsShuf1) {
50267 if (!IsShuf0) {
50268 Ops0.assign({BC0});
50269 ScaledMask0.assign({0, 1});
50270 }
50271 if (!IsShuf1) {
50272 Ops1.assign({BC1});
50273 ScaledMask1.assign({0, 1});
50274 }
50275
50276 SDValue LHS, RHS;
50277 int PostShuffle[4] = {-1, -1, -1, -1};
50278 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
50279 if (M < 0)
50280 return true;
50281 Idx = M % 2;
50282 SDValue Src = Ops[M / 2];
50283 if (!LHS || LHS == Src) {
50284 LHS = Src;
50285 return true;
50286 }
50287 if (!RHS || RHS == Src) {
50288 Idx += 2;
50289 RHS = Src;
50290 return true;
50291 }
50292 return false;
50293 };
50294 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
50295 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
50296 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
50297 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
50298 LHS = DAG.getBitcast(SrcVT, LHS);
50299 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
50300 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50301 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
50302 Res = DAG.getBitcast(ShufVT, Res);
50303 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
50304 return DAG.getBitcast(VT, Res);
50305 }
50306 }
50307 }
50308
50309 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
50310 if (VT.is256BitVector() && Subtarget.hasInt256()) {
50311 SmallVector<int> Mask0, Mask1;
50312 SmallVector<SDValue> Ops0, Ops1;
50313 SmallVector<int, 2> ScaledMask0, ScaledMask1;
50314 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50315 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50316 !Ops0.empty() && !Ops1.empty() &&
50317 all_of(Ops0,
50318 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50319 all_of(Ops1,
50320 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50321 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50322 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
50323 SDValue Op00 = peekThroughBitcasts(Ops0.front());
50324 SDValue Op10 = peekThroughBitcasts(Ops1.front());
50325 SDValue Op01 = peekThroughBitcasts(Ops0.back());
50326 SDValue Op11 = peekThroughBitcasts(Ops1.back());
50327 if ((Op00 == Op11) && (Op01 == Op10)) {
50328 std::swap(Op10, Op11);
50329 ShuffleVectorSDNode::commuteMask(ScaledMask1);
50330 }
50331 if ((Op00 == Op10) && (Op01 == Op11)) {
50332 const int Map[4] = {0, 2, 1, 3};
50333 SmallVector<int, 4> ShuffleMask(
50334 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
50335 Map[ScaledMask1[1]]});
50336 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
50337 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
50338 DAG.getBitcast(SrcVT, Op01));
50339 Res = DAG.getBitcast(ShufVT, Res);
50340 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
50341 return DAG.getBitcast(VT, Res);
50342 }
50343 }
50344 }
50345
50346 return SDValue();
50347}
50348
50349 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
50350 TargetLowering::DAGCombinerInfo &DCI,
50351 const X86Subtarget &Subtarget) {
50352 unsigned Opcode = N->getOpcode();
50353 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
50354 "Unexpected pack opcode");
50355
50356 EVT VT = N->getValueType(0);
50357 SDValue N0 = N->getOperand(0);
50358 SDValue N1 = N->getOperand(1);
50359 unsigned NumDstElts = VT.getVectorNumElements();
50360 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
50361 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
50362 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
50363 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
50364 "Unexpected PACKSS/PACKUS input type");
50365
50366 bool IsSigned = (X86ISD::PACKSS == Opcode);
50367
50368 // Constant Folding.
50369 APInt UndefElts0, UndefElts1;
50370 SmallVector<APInt, 32> EltBits0, EltBits1;
50371 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
50372 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
50373 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
50374 /*AllowWholeUndefs*/ true,
50375 /*AllowPartialUndefs*/ true) &&
50376 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
50377 /*AllowWholeUndefs*/ true,
50378 /*AllowPartialUndefs*/ true)) {
50379 unsigned NumLanes = VT.getSizeInBits() / 128;
50380 unsigned NumSrcElts = NumDstElts / 2;
50381 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
50382 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
50383
50384 APInt Undefs(NumDstElts, 0);
50385 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
50386 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
50387 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
50388 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
50389 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
50390 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
50391
50392 if (UndefElts[SrcIdx]) {
50393 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
50394 continue;
50395 }
50396
50397 APInt &Val = EltBits[SrcIdx];
50398 if (IsSigned) {
50399 // PACKSS: Truncate signed value with signed saturation.
50400 // Source values less than dst minint are saturated to minint.
50401 // Source values greater than dst maxint are saturated to maxint.
50402 Val = Val.truncSSat(DstBitsPerElt);
50403 } else {
50404 // PACKUS: Truncate signed value with unsigned saturation.
50405 // Source values less than zero are saturated to zero.
50406 // Source values greater than dst maxuint are saturated to maxuint.
50407 // NOTE: This is different from APInt::truncUSat.
50408 if (Val.isIntN(DstBitsPerElt))
50409 Val = Val.trunc(DstBitsPerElt);
50410 else if (Val.isNegative())
50411 Val = APInt::getZero(DstBitsPerElt);
50412 else
50413 Val = APInt::getAllOnes(DstBitsPerElt);
50414 }
50415 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
50416 }
50417 }
50418
50419 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
50420 }
50421
50422 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
50423 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50424 return V;
50425
50426 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
50427 // Currently limit this to allsignbits cases only.
50428 if (IsSigned &&
50429 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
50430 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
50431 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
50432 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
50433 if (Not0 && Not1) {
50434 SDLoc DL(N);
50435 MVT SrcVT = N0.getSimpleValueType();
50436 SDValue Pack =
50437 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
50438 DAG.getBitcast(SrcVT, Not1));
50439 return DAG.getNOT(DL, Pack, VT);
50440 }
50441 }
50442
50443 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
50444 // truncate to create a larger truncate.
50445 if (Subtarget.hasAVX512() &&
50446 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
50447 N0.getOperand(0).getValueType() == MVT::v8i32) {
50448 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
50449 (!IsSigned &&
50450 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
50451 if (Subtarget.hasVLX())
50452 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
50453
50454 // Widen input to v16i32 so we can truncate that.
50455 SDLoc dl(N);
50456 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
50457 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
50458 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
50459 }
50460 }
50461
50462 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
50463 if (VT.is128BitVector()) {
50464 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
50465 SDValue Src0, Src1;
50466 if (N0.getOpcode() == ExtOpc &&
50467 N0.getOperand(0).getValueType().is64BitVector() &&
50468 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50469 Src0 = N0.getOperand(0);
50470 }
50471 if (N1.getOpcode() == ExtOpc &&
50472 N1.getOperand(0).getValueType().is64BitVector() &&
50473 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50474 Src1 = N1.getOperand(0);
50475 }
50476 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
50477 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
50478 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
50479 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
50480 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
50481 }
50482
50483 // Try again with pack(*_extend_vector_inreg, undef).
50484 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
50485 : ISD::ZERO_EXTEND_VECTOR_INREG;
50486 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
50487 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
50488 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
50489 DAG);
50490 }
50491
50492 // Attempt to combine as shuffle.
50493 SDValue Op(N, 0);
50494 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50495 return Res;
50496
50497 return SDValue();
50498}
50499
50500 static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
50501 TargetLowering::DAGCombinerInfo &DCI,
50502 const X86Subtarget &Subtarget) {
50503 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
50504 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
50505 "Unexpected horizontal add/sub opcode");
50506
50507 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
50508 MVT VT = N->getSimpleValueType(0);
50509 SDValue LHS = N->getOperand(0);
50510 SDValue RHS = N->getOperand(1);
50511
50512 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
50513 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
50514 LHS.getOpcode() == RHS.getOpcode() &&
50515 LHS.getValueType() == RHS.getValueType() &&
50516 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
50517 SDValue LHS0 = LHS.getOperand(0);
50518 SDValue LHS1 = LHS.getOperand(1);
50519 SDValue RHS0 = RHS.getOperand(0);
50520 SDValue RHS1 = RHS.getOperand(1);
50521 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
50522 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
50523 SDLoc DL(N);
50524 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
50525 LHS0.isUndef() ? LHS1 : LHS0,
50526 RHS0.isUndef() ? RHS1 : RHS0);
50527 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
50528 Res = DAG.getBitcast(ShufVT, Res);
50529 SDValue NewLHS =
50530 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50531 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
50532 SDValue NewRHS =
50533 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50534 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
50535 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
50536 DAG.getBitcast(VT, NewRHS));
50537 }
50538 }
50539 }
50540
50541 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
50542 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50543 return V;
50544
50545 return SDValue();
50546}
50547
50548 static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
50549 TargetLowering::DAGCombinerInfo &DCI,
50550 const X86Subtarget &Subtarget) {
50551 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
50552 X86ISD::VSRL == N->getOpcode()) &&
50553 "Unexpected shift opcode");
50554 EVT VT = N->getValueType(0);
50555 SDValue N0 = N->getOperand(0);
50556 SDValue N1 = N->getOperand(1);
50557
50558 // Shift zero -> zero.
50559 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50560 return DAG.getConstant(0, SDLoc(N), VT);
50561
50562 // Detect constant shift amounts.
50563 APInt UndefElts;
50564 SmallVector<APInt, 32> EltBits;
50565 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
50566 /*AllowWholeUndefs*/ true,
50567 /*AllowPartialUndefs*/ false)) {
50568 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
50569 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
50570 EltBits[0].getZExtValue(), DAG);
50571 }
50572
50573 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50574 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
50575 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
50576 return SDValue(N, 0);
50577
50578 return SDValue();
50579}
50580
50581 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
50582 TargetLowering::DAGCombinerInfo &DCI,
50583 const X86Subtarget &Subtarget) {
50584 unsigned Opcode = N->getOpcode();
50585 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
50586 X86ISD::VSRLI == Opcode) &&
50587 "Unexpected shift opcode");
50588 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
50589 EVT VT = N->getValueType(0);
50590 SDValue N0 = N->getOperand(0);
50591 SDValue N1 = N->getOperand(1);
50592 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50593 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
50594 "Unexpected value type");
50595 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
50596
50597 // (shift undef, X) -> 0
50598 if (N0.isUndef())
50599 return DAG.getConstant(0, SDLoc(N), VT);
50600
50601 // Out of range logical bit shifts are guaranteed to be zero.
50602 // Out of range arithmetic bit shifts splat the sign bit.
50603 unsigned ShiftVal = N->getConstantOperandVal(1);
50604 if (ShiftVal >= NumBitsPerElt) {
50605 if (LogicalShift)
50606 return DAG.getConstant(0, SDLoc(N), VT);
50607 ShiftVal = NumBitsPerElt - 1;
50608 }
50609
50610 // (shift X, 0) -> X
50611 if (!ShiftVal)
50612 return N0;
50613
50614 // (shift 0, C) -> 0
50615 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50616 // N0 is all zeros or undef. We guarantee that the bits shifted into the
50617 // result are all zeros, not undef.
50618 return DAG.getConstant(0, SDLoc(N), VT);
50619
50620 // (VSRAI -1, C) -> -1
50621 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
50622 // N0 is all ones or undef. We guarantee that the bits shifted into the
50623 // result are all ones, not undef.
50624 return DAG.getAllOnesConstant(SDLoc(N), VT);
50625
50626 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
50627 unsigned NewShiftVal = Amt0 + Amt1;
50628 if (NewShiftVal >= NumBitsPerElt) {
50629 // Out of range logical bit shifts are guaranteed to be zero.
50630 // Out of range arithmetic bit shifts splat the sign bit.
50631 if (LogicalShift)
50632 return DAG.getConstant(0, SDLoc(N), VT);
50633 NewShiftVal = NumBitsPerElt - 1;
50634 }
50635 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
50636 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
50637 };
50638
50639 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
50640 if (Opcode == N0.getOpcode())
50641 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
50642
50643 // (shl (add X, X), C) -> (shl X, (C + 1))
50644 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
50645 N0.getOperand(0) == N0.getOperand(1))
50646 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
50647
50648 // We can decode 'whole byte' logical bit shifts as shuffles.
50649 if (LogicalShift && (ShiftVal % 8) == 0) {
50650 SDValue Op(N, 0);
50651 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50652 return Res;
50653 }
50654
50655 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
50656 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
50657 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
50658 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
50659 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
50660 N0.getOpcode() == X86ISD::PSHUFD &&
50661 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
50662 N0->hasOneUse()) {
50663 SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
50664 if (BC.getOpcode() == X86ISD::VSHLI &&
50665 BC.getScalarValueSizeInBits() == 64 &&
50666 BC.getConstantOperandVal(1) == 63) {
50667 SDLoc DL(N);
50668 SDValue Src = BC.getOperand(0);
50669 Src = DAG.getBitcast(VT, Src);
50670 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
50671 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
50672 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
50673 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
50674 return Src;
50675 }
50676 }
50677
50678 auto TryConstantFold = [&](SDValue V) {
50679 APInt UndefElts;
50680 SmallVector<APInt, 32> EltBits;
50681 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
50682 /*AllowWholeUndefs*/ true,
50683 /*AllowPartialUndefs*/ true))
50684 return SDValue();
50685 assert(EltBits.size() == VT.getVectorNumElements() &&
50686 "Unexpected shift value type");
50687 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
50688 // created an undef input due to no input bits being demanded, but user
50689 // still expects 0 in other bits.
50690 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
50691 APInt &Elt = EltBits[i];
50692 if (UndefElts[i])
50693 Elt = 0;
50694 else if (X86ISD::VSHLI == Opcode)
50695 Elt <<= ShiftVal;
50696 else if (X86ISD::VSRAI == Opcode)
50697 Elt.ashrInPlace(ShiftVal);
50698 else
50699 Elt.lshrInPlace(ShiftVal);
50700 }
50701 // Reset undef elements since they were zeroed above.
50702 UndefElts = 0;
50703 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
50704 };
50705
50706 // Constant Folding.
50707 if (N->isOnlyUserOf(N0.getNode())) {
50708 if (SDValue C = TryConstantFold(N0))
50709 return C;
50710
50711 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
50712 // Don't break NOT patterns.
50713 SDValue BC = peekThroughOneUseBitcasts(N0);
50714 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
50715 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
50716 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
50717 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
50718 SDLoc DL(N);
50719 SDValue LHS = DAG.getNode(Opcode, DL, VT,
50720 DAG.getBitcast(VT, BC.getOperand(0)), N1);
50721 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
50722 }
50723 }
50724 }
50725
50726 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50727 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
50728 DCI))
50729 return SDValue(N, 0);
50730
50731 return SDValue();
50732}
50733
50734static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
50735 TargetLowering::DAGCombinerInfo &DCI,
50736 const X86Subtarget &Subtarget) {
50737 EVT VT = N->getValueType(0);
50738 unsigned Opcode = N->getOpcode();
50739 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
50740 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
50741 Opcode == ISD::INSERT_VECTOR_ELT) &&
50742 "Unexpected vector insertion");
50743
50744 SDValue Vec = N->getOperand(0);
50745 SDValue Scl = N->getOperand(1);
50746 SDValue Idx = N->getOperand(2);
50747
50748 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
50749 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
50750 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
50751
50752 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
50753 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50754 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50755 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
50756 APInt::getAllOnes(NumBitsPerElt), DCI))
50757 return SDValue(N, 0);
50758 }
50759
50760 // Attempt to combine insertion patterns to a shuffle.
50761 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
50762 SDValue Op(N, 0);
50763 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50764 return Res;
50765 }
50766
50767 return SDValue();
50768}
50769
50770/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
50771/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
50772/// OR -> CMPNEQSS.
50773static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
50774 TargetLowering::DAGCombinerInfo &DCI,
50775 const X86Subtarget &Subtarget) {
50776 unsigned opcode;
50777
50778 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
50779 // we're requiring SSE2 for both.
50780 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
50781 SDValue N0 = N->getOperand(0);
50782 SDValue N1 = N->getOperand(1);
50783 SDValue CMP0 = N0.getOperand(1);
50784 SDValue CMP1 = N1.getOperand(1);
50785 SDLoc DL(N);
50786
50787 // The SETCCs should both refer to the same CMP.
50788 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
50789 return SDValue();
50790
50791 SDValue CMP00 = CMP0->getOperand(0);
50792 SDValue CMP01 = CMP0->getOperand(1);
50793 EVT VT = CMP00.getValueType();
50794
50795 if (VT == MVT::f32 || VT == MVT::f64 ||
50796 (VT == MVT::f16 && Subtarget.hasFP16())) {
50797 bool ExpectingFlags = false;
50798 // Check for any users that want flags:
50799 for (const SDNode *U : N->users()) {
50800 if (ExpectingFlags)
50801 break;
50802
50803 switch (U->getOpcode()) {
50804 default:
50805 case ISD::BR_CC:
50806 case ISD::BRCOND:
50807 case ISD::SELECT:
50808 ExpectingFlags = true;
50809 break;
50810 case ISD::CopyToReg:
50811 case ISD::SIGN_EXTEND:
50812 case ISD::ZERO_EXTEND:
50813 case ISD::ANY_EXTEND:
50814 break;
50815 }
50816 }
50817
50818 if (!ExpectingFlags) {
50819 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
50820 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
50821
50822 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
50823 X86::CondCode tmp = cc0;
50824 cc0 = cc1;
50825 cc1 = tmp;
50826 }
50827
50828 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
50829 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
50830 // FIXME: need symbolic constants for these magic numbers.
50831 // See X86ATTInstPrinter.cpp:printSSECC().
50832 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
50833 if (Subtarget.hasAVX512()) {
50834 SDValue FSetCC =
50835 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
50836 DAG.getTargetConstant(x86cc, DL, MVT::i8));
50837 // Need to fill with zeros to ensure the bitcast will produce zeroes
50838 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
50839 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
50840 DAG.getConstant(0, DL, MVT::v16i1),
50841 FSetCC, DAG.getVectorIdxConstant(0, DL));
50842 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
50843 N->getSimpleValueType(0));
50844 }
50845 SDValue OnesOrZeroesF =
50846 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
50847 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
50848
50849 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
50850 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
50851
50852 if (is64BitFP && !Subtarget.is64Bit()) {
50853 // On a 32-bit target, we cannot bitcast the 64-bit float to a
50854 // 64-bit integer, since that's not a legal type. Since
50855 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
50856 // bits, but can do this little dance to extract the lowest 32 bits
50857 // and work with those going forward.
50858 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL,
50859 MVT::v2f64, OnesOrZeroesF);
50860 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
50861 OnesOrZeroesF =
50862 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32,
50863 DAG.getVectorIdxConstant(0, DL));
50864 IntVT = MVT::i32;
50865 }
50866
50867 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
50868 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
50869 DAG.getConstant(1, DL, IntVT));
50870 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
50871 ANDed);
50872 return OneBitOfTruth;
50873 }
50874 }
50875 }
50876 }
50877 return SDValue();
50878}
50879
50880/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
50881static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL,
50882 SelectionDAG &DAG) {
50883 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50884
50885 MVT VT = N->getSimpleValueType(0);
50886 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
50887 return SDValue();
50888
50889 SDValue X, Y;
50890 SDValue N0 = N->getOperand(0);
50891 SDValue N1 = N->getOperand(1);
50892
50893 if (SDValue Not = IsNOT(N0, DAG)) {
50894 X = Not;
50895 Y = N1;
50896 } else if (SDValue Not = IsNOT(N1, DAG)) {
50897 X = Not;
50898 Y = N0;
50899 } else
50900 return SDValue();
50901
50902 X = DAG.getBitcast(VT, X);
50903 Y = DAG.getBitcast(VT, Y);
50904 return DAG.getNode(X86ISD::ANDNP, DL, VT, X, Y);
50905}
50906
50907/// Try to fold:
50908/// and (vector_shuffle<Z,...,Z>
50909/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
50910/// ->
50911/// andnp (vector_shuffle<Z,...,Z>
50912/// (insert_vector_elt undef, X, Z), undef), Y
50913static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
50914 const X86Subtarget &Subtarget) {
50915 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50916
50917 EVT VT = N->getValueType(0);
50918 // Do not split 256- and 512-bit vectors with SSE2, as doing so overwrites the
50919 // original value and requires extra moves.
50920 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50921 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
50922 return SDValue();
50923
50924 auto GetNot = [&DAG](SDValue V) {
50925 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
50926 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
50927 // end-users are ISD::AND, including cases such as
50928 // (and(extract_vector_element(SVN), Y)).
50929 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
50930 !SVN->getOperand(1).isUndef()) {
50931 return SDValue();
50932 }
50933 SDValue IVEN = SVN->getOperand(0);
50934 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
50935 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
50936 return SDValue();
50937 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
50938 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
50939 return SDValue();
50940 SDValue Src = IVEN.getOperand(1);
50941 if (SDValue Not = IsNOT(Src, DAG)) {
50942 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
50943 SDValue NotIVEN =
50944 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
50945 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
50946 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
50947 SVN->getOperand(1), SVN->getMask());
50948 }
50949 return SDValue();
50950 };
50951
50952 SDValue X, Y;
50953 SDValue N0 = N->getOperand(0);
50954 SDValue N1 = N->getOperand(1);
50955 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50956
50957 if (SDValue Not = GetNot(N0)) {
50958 X = Not;
50959 Y = N1;
50960 } else if (SDValue Not = GetNot(N1)) {
50961 X = Not;
50962 Y = N0;
50963 } else
50964 return SDValue();
50965
50966 X = DAG.getBitcast(VT, X);
50967 Y = DAG.getBitcast(VT, Y);
50968 SDLoc DL(N);
50969
50970 // We do not split for SSE at all, but we need to split vectors for AVX1 and
50971 // AVX2.
50972 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
50973 TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
50974 SDValue LoX, HiX;
50975 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
50976 SDValue LoY, HiY;
50977 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
50978 EVT SplitVT = LoX.getValueType();
50979 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
50980 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
50981 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
50982 }
50983
50984 if (TLI.isTypeLegal(VT))
50985 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
50986
50987 return SDValue();
50988}
50989
50990// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
50991// logical operations, like in the example below.
50992// or (and (truncate x, truncate y)),
50993// (xor (truncate z, build_vector (constants)))
50994// Given a target type \p VT, we generate
50995// or (and x, y), (xor z, zext(build_vector (constants)))
50996 // given x, y and z are of type \p VT. We can do so if each operand is either a
50997 // truncate from VT, a vector of constants (second operand only), something that
50998 // can be recursively promoted, or an existing extension we can extend further.
50999static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
51000 SelectionDAG &DAG,
51001 const X86Subtarget &Subtarget,
51002 unsigned Depth) {
51003 // Limit recursion to avoid excessive compile times.
51004 if (Depth >= SelectionDAG::MaxRecursionDepth)
51005 return SDValue();
51006
51007 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
51008 return SDValue();
51009
51010 SDValue N0 = N.getOperand(0);
51011 SDValue N1 = N.getOperand(1);
51012
51013 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51014 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
51015 return SDValue();
51016
51017 if (SDValue NN0 =
51018 PromoteMaskArithmetic(N0, DL, VT, DAG, Subtarget, Depth + 1))
51019 N0 = NN0;
51020 else {
51021 // The left side has to be a 'trunc'.
51022 bool LHSTrunc = N0.getOpcode() == ISD::TRUNCATE &&
51023 N0.getOperand(0).getValueType() == VT;
51024 if (LHSTrunc)
51025 N0 = N0.getOperand(0);
51026 else
51027 return SDValue();
51028 }
51029
51030 if (SDValue NN1 =
51031 PromoteMaskArithmetic(N1, DL, VT, DAG, Subtarget, Depth + 1))
51032 N1 = NN1;
51033 else {
51034 // The right side has to be a 'trunc', a (foldable) constant or an
51035 // existing extension we can extend further.
51036 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
51037 N1.getOperand(0).getValueType() == VT;
51038 if (RHSTrunc)
51039 N1 = N1.getOperand(0);
51040 else if (ISD::isExtVecInRegOpcode(N1.getOpcode()) && VT.is256BitVector() &&
51041 Subtarget.hasInt256() && N1.hasOneUse())
51042 N1 = DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0));
51043 else if (SDValue Cst =
51044 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
51045 N1 = Cst;
51046 else
51047 return SDValue();
51048 }
51049
51050 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
51051}
51052
51053// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
51054// register. In most cases we actually compare or select YMM-sized registers
51055// and mixing the two types creates horrible code. This method optimizes
51056// some of the transition sequences.
51057// Even with AVX-512 this is still useful for removing casts around logical
51058// operations on vXi1 mask types.
51059static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
51060 SelectionDAG &DAG,
51061 const X86Subtarget &Subtarget) {
51062 EVT VT = N.getValueType();
51063 assert(VT.isVector() && "Expected vector type");
51064 assert((N.getOpcode() == ISD::ANY_EXTEND ||
51065 N.getOpcode() == ISD::ZERO_EXTEND ||
51066 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
51067
51068 SDValue Narrow = N.getOperand(0);
51069 EVT NarrowVT = Narrow.getValueType();
51070
51071 // Generate the wide operation.
51072 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, 0);
51073 if (!Op)
51074 return SDValue();
51075 switch (N.getOpcode()) {
51076 default: llvm_unreachable("Unexpected opcode");
51077 case ISD::ANY_EXTEND:
51078 return Op;
51079 case ISD::ZERO_EXTEND:
51080 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
51081 case ISD::SIGN_EXTEND:
51082 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
51083 Op, DAG.getValueType(NarrowVT));
51084 }
51085}
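
// A minimal sketch of the promotion, assuming AVX2 and VT = v8i32:
//   sign_extend(v8i16 (and (trunc v8i32 X), (trunc v8i32 Y)))
//     --> sign_extend_inreg(and(X, Y), v8i16)
// so the logic happens directly on the YMM-sized values and the surrounding
// truncate/extend pair disappears.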
51086
51087static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
51088 unsigned FPOpcode;
51089 switch (Opcode) {
51090 // clang-format off
51091 default: llvm_unreachable("Unexpected input node for FP logic conversion");
51092 case ISD::AND: FPOpcode = X86ISD::FAND; break;
51093 case ISD::OR: FPOpcode = X86ISD::FOR; break;
51094 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
51095 // clang-format on
51096 }
51097 return FPOpcode;
51098}
51099
51100/// If both input operands of a logic op are being cast from floating-point
51101/// types or FP compares, try to convert this into a floating-point logic node
51102/// to avoid unnecessary moves from SSE to integer registers.
51103static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT,
51104 SDValue N0, SDValue N1,
51105 SelectionDAG &DAG,
51106 TargetLowering::DAGCombinerInfo &DCI,
51107 const X86Subtarget &Subtarget) {
51108 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51109 "Unexpected bit opcode");
51110
51111 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
51112 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
51113 return SDValue();
51114
51115 SDValue N00 = N0.getOperand(0);
51116 SDValue N10 = N1.getOperand(0);
51117 EVT N00Type = N00.getValueType();
51118 EVT N10Type = N10.getValueType();
51119
51120 // Ensure that both types are the same and are legal scalar fp types.
51121 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
51122 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
51123 (Subtarget.hasFP16() && N00Type == MVT::f16)))
51124 return SDValue();
51125
51126 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
51127 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(Opc);
51128 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
51129 return DAG.getBitcast(VT, FPLogic);
51130 }
51131
51132 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
51133 !N1.hasOneUse())
51134 return SDValue();
51135
51136 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
51137 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
51138
51139 // The vector ISA for FP predicates is incomplete before AVX, so converting
51140 // COMIS* to CMPS* may not be a win before AVX.
51141 if (!Subtarget.hasAVX() &&
51142 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
51143 return SDValue();
51144
51145 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
51146 // and vector logic:
51147 // logic (setcc N00, N01), (setcc N10, N11) -->
51148 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
51149 unsigned NumElts = 128 / N00Type.getSizeInBits();
51150 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
51151 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
51152 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
51153 SDValue N01 = N0.getOperand(1);
51154 SDValue N11 = N1.getOperand(1);
51155 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
51156 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
51157 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
51158 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
51159 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
51160 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
51161 SDValue Logic = DAG.getNode(Opc, DL, BoolVecVT, Setcc0, Setcc1);
51162 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
51163}
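
// A minimal sketch of the bitcast case above, assuming SSE2 and f64 inputs:
//   (xor (bitcast i64 (f64 a)), (bitcast i64 (f64 b)))
//     --> (bitcast i64 (FXOR f64 a, b))
// keeping the value in an XMM register instead of bouncing through a GPR.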
51164
51165// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
51166// to reduce XMM->GPR traffic.
51167static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0,
51168 SDValue N1, SelectionDAG &DAG) {
51169 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51170 "Unexpected bit opcode");
51171
51172 // Both operands must be single use MOVMSK.
51173 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
51174 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
51175 return SDValue();
51176
51177 SDValue Vec0 = N0.getOperand(0);
51178 SDValue Vec1 = N1.getOperand(0);
51179 EVT VecVT0 = Vec0.getValueType();
51180 EVT VecVT1 = Vec1.getValueType();
51181
51182 // Both MOVMSK operands must be from vectors of the same size and same element
51183 // size, but it's OK for an fp/int difference.
51184 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
51185 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
51186 return SDValue();
51187
51188 unsigned VecOpc =
51189 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
51190 SDValue Result =
51191 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
51192 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
51193}
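
// A minimal sketch of this fold, assuming two single-use MOVMSKs:
//   (and (movmsk v4f32 A), (movmsk v4f32 B))  --> (movmsk (FAND A, B))
//   (and (movmsk v16i8 A), (movmsk v16i8 B))  --> (movmsk (and A, B))
// so only one XMM->GPR transfer remains.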
51194
51195// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
51196// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
51197// handles in InstCombine.
51198static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT,
51199 SDValue N0, SDValue N1,
51200 SelectionDAG &DAG) {
51201 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51202 "Unexpected bit opcode");
51203
51204 // Both operands must be single use.
51205 if (!N0.hasOneUse() || !N1.hasOneUse())
51206 return SDValue();
51207
51208 // Search for matching shifts.
51209 SDValue BC0 = peekThroughOneUseBitcasts(N0);
51210 SDValue BC1 = peekThroughOneUseBitcasts(N1);
51211
51212 unsigned BCOpc = BC0.getOpcode();
51213 EVT BCVT = BC0.getValueType();
51214 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
51215 return SDValue();
51216
51217 switch (BCOpc) {
51218 case X86ISD::VSHLI:
51219 case X86ISD::VSRLI:
51220 case X86ISD::VSRAI: {
51221 if (BC0.getOperand(1) != BC1.getOperand(1))
51222 return SDValue();
51223 SDValue BitOp =
51224 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
51225 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
51226 return DAG.getBitcast(VT, Shift);
51227 }
51228 }
51229
51230 return SDValue();
51231}
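
// A minimal sketch of the shift case, assuming v8i16 operands:
//   (or (VSRLI X, 3), (VSRLI Y, 3))  -->  (VSRLI (or X, Y), 3)
// which is only done when both shifts use the same immediate amount.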
51232
51233// Attempt to fold:
51234// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
51235 // TODO: Handle PACKUS as well.
51236static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT,
51237 SDValue N0, SDValue N1, SelectionDAG &DAG) {
51238 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51239 "Unexpected bit opcode");
51240
51241 // Both operands must be single use.
51242 if (!N0.hasOneUse() || !N1.hasOneUse())
51243 return SDValue();
51244
51245 // Search for matching packs.
51246 N0 = peekThroughOneUseBitcasts(N0);
51247 N1 = peekThroughOneUseBitcasts(N1);
51248
51249 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
51250 return SDValue();
51251
51252 MVT DstVT = N0.getSimpleValueType();
51253 if (DstVT != N1.getSimpleValueType())
51254 return SDValue();
51255
51256 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
51257 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
51258
51259 // Limit to allsignbits packing.
51260 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
51261 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
51262 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
51263 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
51264 return SDValue();
51265
51266 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
51267 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
51268 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
51269}
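
// A minimal sketch, assuming v8i16 all-sign-bit inputs packed to v16i8:
//   (xor (PACKSS A, B), (PACKSS C, D))  -->  (PACKSS (xor A, C), (xor B, D))
// valid here because every source element is all-zeros or all-ones, so the
// saturation performed by PACKSS commutes with the bitwise op.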
51270
51271 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
51272 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
51273/// with a shift-right to eliminate loading the vector constant mask value.
51274static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL,
51275 SelectionDAG &DAG,
51276 const X86Subtarget &Subtarget) {
51277 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
51278 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
51279 EVT VT = Op0.getValueType();
51280 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
51281 return SDValue();
51282
51283 // Try to convert an "is positive" signbit masking operation into arithmetic
51284 // shift and "andn". This saves a materialization of a -1 vector constant.
51285 // The "is negative" variant should be handled more generally because it only
51286 // requires "and" rather than "andn":
51287 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
51288 //
51289 // This is limited to the original type to avoid producing even more bitcasts.
51290 // If the bitcasts can't be eliminated, then it is unlikely that this fold
51291 // will be profitable.
51292 if (N->getValueType(0) == VT &&
51293 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
51294 SDValue X, Y;
51295 if (Op1.getOpcode() == X86ISD::PCMPGT &&
51296 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
51297 X = Op1.getOperand(0);
51298 Y = Op0;
51299 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
51300 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
51301 X = Op0.getOperand(0);
51302 Y = Op1;
51303 }
51304 if (X && Y) {
51305 SDValue Sra =
51306 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
51307 VT.getScalarSizeInBits() - 1, DAG);
51308 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
51309 }
51310 }
51311
51312 APInt SplatVal;
51313 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
51314 return SDValue();
51315
51316 // Don't prevent creation of ANDN.
51317 if (isBitwiseNot(Op0))
51318 return SDValue();
51319
51320 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
51321 return SDValue();
51322
51323 unsigned EltBitWidth = VT.getScalarSizeInBits();
51324 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
51325 return SDValue();
51326
51327 unsigned ShiftVal = SplatVal.countr_one();
51328 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
51329 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
51330 return DAG.getBitcast(N->getValueType(0), Shift);
51331}
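
// A minimal sketch, assuming v4i32 where every element of X is known to be
// all-zeros or all-ones:
//   (and X, splat(0x1))  -->  (VSRLI X, 31)
// i.e. the low-bits mask constant becomes a logical shift right, so no
// constant pool load is needed.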
51332
51333// Get the index node from the lowered DAG of a GEP IR instruction with one
51334// indexing dimension.
51335static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
51336 if (Ld->isIndexed())
51337 return SDValue();
51338
51339 SDValue Base = Ld->getBasePtr();
51340 if (Base.getOpcode() != ISD::ADD)
51341 return SDValue();
51342
51343 SDValue ShiftedIndex = Base.getOperand(0);
51344 if (ShiftedIndex.getOpcode() != ISD::SHL)
51345 return SDValue();
51346
51347 return ShiftedIndex.getOperand(0);
51348}
51349
51350static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
51351 return Subtarget.hasBMI2() &&
51352 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
51353}
51354
51355/// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
51356/// This undoes the inverse fold performed in InstCombine
51357static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL,
51358 SelectionDAG &DAG) {
51359 using namespace llvm::SDPatternMatch;
51360 MVT VT = N->getSimpleValueType(0);
51361 if (!DAG.getTargetLoweringInfo().hasAndNot(SDValue(N, 0)))
51362 return SDValue();
51363
51364 SDValue X, Y, Z;
51365 if (sd_match(N, m_And(m_Value(X),
51366 m_OneUse(m_Or(m_Value(Y), m_Not(m_Value(Z))))))) {
51367 // Don't fold if Y or Z are constants to prevent infinite loops.
51368 if (!DAG.isConstantIntBuildVectorOrConstantInt(Y) &&
51369 !DAG.isConstantIntBuildVectorOrConstantInt(Z))
51370 return DAG.getNode(
51371 ISD::AND, DL, VT, X,
51372 DAG.getNOT(
51373 DL, DAG.getNode(ISD::AND, DL, VT, DAG.getNOT(DL, Y, VT), Z), VT));
51374 }
51375
51376 return SDValue();
51377}
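
// A minimal scalar sketch of the fold (i32 view):
//   X & (Y | ~Z)  ==  X & ~(~Y & Z)
// Both forms are equivalent by De Morgan; the right-hand form computes
// (~Y & Z) with one ANDN and then clears those bits from X with a second ANDN.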
51378
51379 // This function recognizes cases where the X86 BZHI instruction can replace an
51380 // 'and-load' sequence.
51381 // In the case of loading an integer value from an array of constants defined
51382 // as follows:
51383 //
51384 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
51385 //
51386 // and then applying a bitwise AND to the result with another input, this is
51387 // equivalent to performing BZHI (zero high bits) on that input, using the
51388 // same index as the load.
51389static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
51390 const X86Subtarget &Subtarget) {
51391 MVT VT = Node->getSimpleValueType(0);
51392 SDLoc dl(Node);
51393
51394 // Check if subtarget has BZHI instruction for the node's type
51395 if (!hasBZHI(Subtarget, VT))
51396 return SDValue();
51397
51398 // Try matching the pattern for both operands.
51399 for (unsigned i = 0; i < 2; i++) {
51400 // continue if the operand is not a load instruction
51401 auto *Ld = dyn_cast<LoadSDNode>(Node->getOperand(i));
51402 if (!Ld)
51403 continue;
51404 const Value *MemOp = Ld->getMemOperand()->getValue();
51405 if (!MemOp)
51406 continue;
51407 // Get the Node which indexes into the array.
51408 SDValue Index = getIndexFromUnindexedLoad(Ld);
51409 if (!Index)
51410 continue;
51411
51412 if (auto *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
51413 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
51414 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
51415 Constant *Init = GV->getInitializer();
51416 Type *Ty = Init->getType();
51417 if (!Ty->isArrayTy() ||
51418 !Ty->getArrayElementType()->isIntegerTy() ||
51419 Ty->getArrayElementType()->getScalarSizeInBits() !=
51420 VT.getSizeInBits() ||
51421 Ty->getArrayNumElements() >
51422 Ty->getArrayElementType()->getScalarSizeInBits())
51423 continue;
51424
51425 // Check if the array's constant elements are suitable to our case.
51426 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
51427 bool ConstantsMatch = true;
51428 for (uint64_t j = 0; j < ArrayElementCount; j++) {
51429 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
51430 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
51431 ConstantsMatch = false;
51432 break;
51433 }
51434 }
51435 if (!ConstantsMatch)
51436 continue;
51437
51438 // Do the transformation (For 32-bit type):
51439 // -> (and (load arr[idx]), inp)
51440 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
51441 // that will be replaced with one bzhi instruction.
51442 SDValue Inp = Node->getOperand(i == 0 ? 1 : 0);
51443 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
51444
51445 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
51446 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
51447 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
51448
51449 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
51450 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
51451 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
51452 }
51453 }
51454 }
51455 }
51456 return SDValue();
51457}
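
// A minimal source-level sketch of the pattern matched above (the table and
// function names are illustrative only):
//
//   static const uint32_t LowMask[32] = {0x0, 0x1, 0x3, 0x7, 0xF /* ... */};
//   uint32_t f(uint32_t Val, unsigned Idx) { return Val & LowMask[Idx]; }
//
// With BMI2 the load+and can then be selected as a single BZHI, i.e. it keeps
// only the low Idx bits of Val and clears the rest.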
51458
51459// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
51460 // where C is a mask containing the same number of bits as the setcc and
51461 // where the setcc will freely zero the upper bits of the k-register. We can
51462 // replace the undef in the concat with 0s and remove the AND. This mainly
51463 // helps with v2i1/v4i1 setcc being cast to scalar.
51464static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
51465 const X86Subtarget &Subtarget) {
51466 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
51467
51468 EVT VT = N->getValueType(0);
51469
51470 // Make sure this is an AND with constant. We will check the value of the
51471 // constant later.
51472 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
51473 if (!C1)
51474 return SDValue();
51475
51476 // This is implied by the ConstantSDNode.
51477 assert(!VT.isVector() && "Expected scalar VT!");
51478
51479 SDValue Src = N->getOperand(0);
51480 if (!Src.hasOneUse())
51481 return SDValue();
51482
51483 // (Optionally) peek through any_extend().
51484 if (Src.getOpcode() == ISD::ANY_EXTEND) {
51485 if (!Src.getOperand(0).hasOneUse())
51486 return SDValue();
51487 Src = Src.getOperand(0);
51488 }
51489
51490 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
51491 return SDValue();
51492
51493 Src = Src.getOperand(0);
51494 EVT SrcVT = Src.getValueType();
51495
51496 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51497 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
51498 !TLI.isTypeLegal(SrcVT))
51499 return SDValue();
51500
51501 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
51502 return SDValue();
51503
51504 // We only care about the first subvector of the concat, we expect the
51505 // other subvectors to be ignored due to the AND if we make the change.
51506 SDValue SubVec = Src.getOperand(0);
51507 EVT SubVecVT = SubVec.getValueType();
51508
51509 // The RHS of the AND should be a mask with as many bits as SubVec.
51510 if (!TLI.isTypeLegal(SubVecVT) ||
51511 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
51512 return SDValue();
51513
51514 // First subvector should be a setcc with a legal result type or a
51515 // AND containing at least one setcc with a legal result type.
51516 auto IsLegalSetCC = [&](SDValue V) {
51517 if (V.getOpcode() != ISD::SETCC)
51518 return false;
51519 EVT SetccVT = V.getOperand(0).getValueType();
51520 if (!TLI.isTypeLegal(SetccVT) ||
51521 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
51522 return false;
51523 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
51524 return false;
51525 return true;
51526 };
51527 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
51528 (IsLegalSetCC(SubVec.getOperand(0)) ||
51529 IsLegalSetCC(SubVec.getOperand(1))))))
51530 return SDValue();
51531
51532 // We passed all the checks. Rebuild the concat_vectors with zeroes
51533 // and cast it back to VT.
51534 SDLoc dl(N);
51535 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
51536 DAG.getConstant(0, dl, SubVecVT));
51537 Ops[0] = SubVec;
51538 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
51539 Ops);
51540 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
51541 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
51542}
51543
51544static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
51545 SDValue OpMustEq, SDValue Op, unsigned Depth) {
51546 // We don't want to go crazy with the recursion here. This isn't a super
51547 // important optimization.
51548 static constexpr unsigned kMaxDepth = 2;
51549
51550 // Only do this re-ordering if op has one use.
51551 if (!Op.hasOneUse())
51552 return SDValue();
51553
51554 SDLoc DL(Op);
51555 // If we hit another associative op, recurse further.
51556 if (Op.getOpcode() == Opc) {
51557 // Done recursing.
51558 if (Depth++ >= kMaxDepth)
51559 return SDValue();
51560
51561 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51562 if (SDValue R =
51563 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
51564 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
51565 Op.getOperand(1 - OpIdx));
51566
51567 } else if (Op.getOpcode() == ISD::SUB) {
51568 if (Opc == ISD::AND) {
51569 // BLSI: (and x, (sub 0, x))
51570 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
51571 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51572 }
51573 // Opc must be ISD::AND or ISD::XOR
51574 // BLSR: (and x, (sub x, 1))
51575 // BLSMSK: (xor x, (sub x, 1))
51576 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51577 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51578
51579 } else if (Op.getOpcode() == ISD::ADD) {
51580 // Opc must be ISD::AND or ISD::XOR
51581 // BLSR: (and x, (add x, -1))
51582 // BLSMSK: (xor x, (add x, -1))
51583 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51584 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51585 }
51586 return SDValue();
51587}
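
// Scalar identities behind the matching above (a sketch, assuming i32):
//   BLSI:   x & -x        isolates the lowest set bit
//   BLSR:   x & (x - 1)   clears the lowest set bit
//   BLSMSK: x ^ (x - 1)   sets all bits up to and including the lowest set bit
// e.g. for x = 0b0110'1000: BLSI = 0b0000'1000, BLSR = 0b0110'0000,
// BLSMSK = 0b0000'1111.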
51588
51589static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
51590 const X86Subtarget &Subtarget) {
51591 EVT VT = N->getValueType(0);
51592 // Make sure this node is a candidate for BMI instructions.
51593 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
51594 (VT != MVT::i32 && VT != MVT::i64))
51595 return SDValue();
51596
51597 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
51598
51599 // Try and match LHS and RHS.
51600 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51601 if (SDValue OpMatch =
51602 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
51603 N->getOperand(1 - OpIdx), 0))
51604 return OpMatch;
51605 return SDValue();
51606}
51607
51608/// Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
51609static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL,
51610 SelectionDAG &DAG,
51611 const X86Subtarget &Subtarget) {
51612 using namespace llvm::SDPatternMatch;
51613
51614 EVT VT = And->getValueType(0);
51615 // Make sure this node is a candidate for BMI instructions.
51616 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
51617 return SDValue();
51618
51619 SDValue X;
51620 SDValue Y;
51623 m_Value(Y))))
51624 return SDValue();
51625
51626 SDValue BLSMSK =
51627 DAG.getNode(ISD::XOR, DL, VT, X,
51628 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getConstant(1, DL, VT)));
51629 SDValue AndN = DAG.getNode(ISD::AND, DL, VT, Y, DAG.getNOT(DL, BLSMSK, VT));
51630 return AndN;
51631}
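
// A minimal scalar sketch of this fold, assuming i32:
//   X ^ -X  ==  ~(X ^ (X - 1))  ==  ~BLSMSK(X)
// so  Y & (X ^ -X)  becomes  ANDN(BLSMSK(X), Y),  i.e. Y with every bit at or
// below the lowest set bit of X cleared.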
51632
51633static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag,
51634 SelectionDAG &DAG,
51635 TargetLowering::DAGCombinerInfo &DCI,
51636 const X86Subtarget &ST) {
51637 // cmp(setcc(cc, X), 0)
51638 // brcond ne
51639 // ->
51640 // X
51641 // brcond cc
51642
51643 // sub(setcc(cc, X), 1)
51644 // brcond ne
51645 // ->
51646 // X
51647 // brcond ~cc
51648 //
51649 // if only flag has users
51650
51651 SDValue SetCC = N->getOperand(0);
51652
51653 if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse())
51654 return SDValue();
51655
51656 // Check the only user of flag is `brcond ne`.
51657 SDNode *BrCond = *Flag->user_begin();
51658 if (BrCond->getOpcode() != X86ISD::BRCOND)
51659 return SDValue();
51660 unsigned CondNo = 2;
51661 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
51662 X86::COND_NE)
51663 return SDValue();
51664
51665 SDValue X = SetCC.getOperand(1);
51666 // sub has two results while X only have one. DAG combine assumes the value
51667 // type matches.
51668 if (N->getOpcode() == X86ISD::SUB)
51669 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
51670
51671 SDValue CCN = SetCC.getOperand(0);
51672 X86::CondCode CC =
51673 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
51674 X86::CondCode OppositeCC = X86::GetOppositeBranchCondition(CC);
51675 // Update CC for the consumer of the flag.
51676 // The old CC is `ne`. Hence, when comparing the result with 0, we are
51677 // checking if the second condition evaluates to true. When comparing the
51678 // result with 1, we are checking if the second condition evaluates to false.
51679 SmallVector<SDValue> Ops(BrCond->op_values());
51680 if (isNullConstant(N->getOperand(1)))
51681 Ops[CondNo] = CCN;
51682 else if (isOneConstant(N->getOperand(1)))
51683 Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8);
51684 else
51685 llvm_unreachable("expect constant 0 or 1");
51686
51687 SDValue NewBrCond =
51688 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
51689 // Avoid self-assign error b/c CC1 can be `e/ne`.
51690 if (BrCond != NewBrCond.getNode())
51691 DCI.CombineTo(BrCond, NewBrCond);
51692 return X;
51693}
51694
51695static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
51696 TargetLowering::DAGCombinerInfo &DCI,
51697 const X86Subtarget &ST) {
51698 // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y)))
51699 // ->
51700 // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0))
51701
51702 // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0)))
51703 // ->
51704 // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0))
51705 //
51706 // where cflags is determined by cc1.
51707
51708 if (!ST.hasCCMP())
51709 return SDValue();
51710
51711 SDValue SetCC0 = N->getOperand(0);
51712 SDValue SetCC1 = N->getOperand(1);
51713 if (SetCC0.getOpcode() != X86ISD::SETCC ||
51714 SetCC1.getOpcode() != X86ISD::SETCC)
51715 return SDValue();
51716
51717 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
51718 SDValue Op = V.getOperand(1);
51719 unsigned Opc = Op.getOpcode();
51720 if (Opc == X86ISD::SUB)
51721 return X86ISD::CCMP;
51722 if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1)))
51723 return X86ISD::CTEST;
51724 return 0U;
51725 };
51726
51727 unsigned NewOpc = 0;
51728
51729 // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP
51730 // appear on the right.
51731 if (!(NewOpc = GetCombineToOpc(SetCC1))) {
51732 std::swap(SetCC0, SetCC1);
51733 if (!(NewOpc = GetCombineToOpc(SetCC1)))
51734 return SDValue();
51735 }
51736
51737 X86::CondCode CC0 =
51738 static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
51739 // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP.
51740 if (CC0 == X86::COND_P || CC0 == X86::COND_NP)
51741 return SDValue();
51742
51743 bool IsOR = N->getOpcode() == ISD::OR;
51744
51745 // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC
51746 // evaluates to true. So we need to inverse CC0 as SrcCC when the logic
51747 // operator is OR. Similar for CC1.
51748 SDValue SrcCC =
51749 IsOR ? DAG.getTargetConstant(X86::GetOppositeBranchCondition(CC0),
51750 SDLoc(SetCC0.getOperand(0)), MVT::i8)
51751 : SetCC0.getOperand(0);
51752 SDValue CC1N = SetCC1.getOperand(0);
51753 X86::CondCode CC1 =
51754 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
51755 X86::CondCode OppositeCC1 = X86::GetOppositeBranchCondition(CC1);
51756 X86::CondCode CFlagsCC = IsOR ? CC1 : OppositeCC1;
51757 SDLoc DL(N);
51758 SDValue CFlags = DAG.getTargetConstant(
51759 X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8);
51760 SDValue Sub = SetCC1.getOperand(1);
51761
51762 // Replace any uses of the old flag produced by SUB/CMP with the new one
51763 // produced by CCMP/CTEST.
51764 SDValue CCMP = (NewOpc == X86ISD::CCMP)
51765 ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32,
51766 {Sub.getOperand(0), Sub.getOperand(1),
51767 CFlags, SrcCC, SetCC0.getOperand(1)})
51768 : DAG.getNode(X86ISD::CTEST, DL, MVT::i32,
51769 {Sub.getOperand(0), Sub.getOperand(0),
51770 CFlags, SrcCC, SetCC0.getOperand(1)});
51771
51772 return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP});
51773}
51774
51775static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
51776 TargetLowering::DAGCombinerInfo &DCI,
51777 const X86Subtarget &Subtarget) {
51778 using namespace SDPatternMatch;
51779
51780 SDValue N0 = N->getOperand(0);
51781 SDValue N1 = N->getOperand(1);
51782 EVT VT = N->getValueType(0);
51783 SDLoc dl(N);
51784 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51785
51786 // If this is SSE1 only convert to FAND to avoid scalarization.
51787 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51788 return DAG.getBitcast(MVT::v4i32,
51789 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
51790 DAG.getBitcast(MVT::v4f32, N0),
51791 DAG.getBitcast(MVT::v4f32, N1)));
51792 }
51793
51794 // Use a 32-bit and+zext if upper bits known zero.
51795 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
51796 APInt HiMask = APInt::getHighBitsSet(64, 32);
51797 if (DAG.MaskedValueIsZero(N1, HiMask) ||
51798 DAG.MaskedValueIsZero(N0, HiMask)) {
51799 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
51800 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
51801 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
51802 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
51803 }
51804 }
51805
51806 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
51807 // TODO: Support multiple SrcOps.
51808 if (VT == MVT::i1) {
51809 SmallVector<SDValue, 2> SrcOps;
51810 SmallVector<APInt, 2> SrcPartials;
51811 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
51812 SrcOps.size() == 1) {
51813 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51814 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51815 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51816 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51817 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51818 if (Mask) {
51819 assert(SrcPartials[0].getBitWidth() == NumElts &&
51820 "Unexpected partial reduction mask");
51821 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51822 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51823 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
51824 }
51825 }
51826 }
51827
51828 // InstCombine converts:
51829 // `(-x << C0) & C1`
51830 // to
51831 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
51832 // This saves an IR instruction, but on x86 the neg/shift version is preferable,
51833 // so undo the transform.
51834
51835 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
51836 // TODO: We don't actually need a splat for this, we just need the checks to
51837 // hold for each element.
51838 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
51839 /*AllowTruncation*/ false);
51840 ConstantSDNode *N01C =
51841 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
51842 /*AllowTruncation*/ false);
51843 if (N1C && N01C) {
51844 const APInt &MulC = N01C->getAPIntValue();
51845 const APInt &AndC = N1C->getAPIntValue();
51846 APInt MulCLowBit = MulC & (-MulC);
51847 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
51848 (MulCLowBit + MulC).isPowerOf2()) {
51849 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
51850 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
51851 assert(MulCLowBitLog != -1 &&
51852 "Isolated lowbit is somehow not a power of 2!");
51853 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
51854 DAG.getConstant(MulCLowBitLog, dl, VT));
51855 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
51856 }
51857 }
51858 }
51859
51860 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
51861 return SetCC;
51862
51863 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
51864 return V;
51865
51866 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51867 return R;
51868
51869 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51870 return R;
51871
51872 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51873 return R;
51874
51875 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51876 DAG, DCI, Subtarget))
51877 return FPLogic;
51878
51879 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
51880 return R;
51881
51882 if (DCI.isBeforeLegalizeOps())
51883 return SDValue();
51884
51885 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51886 return R;
51887
51888 if (SDValue R = combineAndNotIntoANDNP(N, dl, DAG))
51889 return R;
51890
51891 if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget))
51892 return ShiftRight;
51893
51894 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
51895 return R;
51896
51897 if (SDValue R = combineAndNotOrIntoAndNotAnd(N, dl, DAG))
51898 return R;
51899
51900 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
51901 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
51902 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
51903 if (VT.isVector() && getTargetConstantFromNode(N1)) {
51904 unsigned Opc0 = N0.getOpcode();
51905 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
51906 getTargetConstantFromNode(N0.getOperand(1)) &&
51907 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
51908 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
51909 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
51910 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
51911 }
51912 }
51913
51914 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask.
51915 // to make use of predicated selects.
51916 // AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0)
51917 if (DCI.isAfterLegalizeDAG() && VT.isVector()) {
51918 SDValue X, Y;
51919 EVT CondVT = VT.changeVectorElementType(MVT::i1);
51920 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
51921 (VT.is512BitVector() || Subtarget.hasVLX()) &&
51922 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
51925 m_Value(Y), m_SpecificVT(CondVT),
51926 m_SetCC(m_Value(), m_Value(), m_Value()))))))) {
51927 return DAG.getSelect(dl, VT, Y, X,
51928 getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl));
51929 }
51930 }
51931
51932 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant; this
51933 // avoids a slow variable shift (moving the shift amount to ECX etc.).
51934 if (isOneConstant(N1) && N0->hasOneUse()) {
51935 SDValue Src = N0;
51936 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
51937 Src.getOpcode() == ISD::TRUNCATE) &&
51938 Src.getOperand(0)->hasOneUse())
51939 Src = Src.getOperand(0);
51940 bool ContainsNOT = false;
51941 X86::CondCode X86CC = X86::COND_B;
51942 // Peek through AND(NOT(SRL(X,Y)),1).
51943 if (isBitwiseNot(Src)) {
51944 Src = Src.getOperand(0);
51945 X86CC = X86::COND_AE;
51946 ContainsNOT = true;
51947 }
51948 if (Src.getOpcode() == ISD::SRL &&
51949 !isa<ConstantSDNode>(Src.getOperand(1))) {
51950 SDValue BitNo = Src.getOperand(1);
51951 Src = Src.getOperand(0);
51952 // Peek through AND(SRL(NOT(X),Y),1).
51953 if (isBitwiseNot(Src)) {
51954 Src = Src.getOperand(0);
51955 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
51956 ContainsNOT = true;
51957 }
51958 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
51959 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
51960 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
51961 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
51962 }
51963 }
51964
51965 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51966 // Attempt to recursively combine a bitmask AND with shuffles.
51967 SDValue Op(N, 0);
51968 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51969 return Res;
51970
51971 // If either operand is a constant mask, then only the elements that aren't
51972 // zero are actually demanded by the other operand.
51973 auto GetDemandedMasks = [&](SDValue Op) {
51974 APInt UndefElts;
51975 SmallVector<APInt> EltBits;
51976 int NumElts = VT.getVectorNumElements();
51977 int EltSizeInBits = VT.getScalarSizeInBits();
51978 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
51979 APInt DemandedElts = APInt::getAllOnes(NumElts);
51980 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
51981 EltBits)) {
51982 DemandedBits.clearAllBits();
51983 DemandedElts.clearAllBits();
51984 for (int I = 0; I != NumElts; ++I) {
51985 if (UndefElts[I]) {
51986 // We can't assume an undef src element gives an undef dst - the
51987 // other src might be zero.
51988 DemandedBits.setAllBits();
51989 DemandedElts.setBit(I);
51990 } else if (!EltBits[I].isZero()) {
51991 DemandedBits |= EltBits[I];
51992 DemandedElts.setBit(I);
51993 }
51994 }
51995 }
51996 return std::make_pair(DemandedBits, DemandedElts);
51997 };
51998 APInt Bits0, Elts0;
51999 APInt Bits1, Elts1;
52000 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
52001 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
52002
52003 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
52004 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
52005 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
52006 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
52007 if (N->getOpcode() != ISD::DELETED_NODE)
52008 DCI.AddToWorklist(N);
52009 return SDValue(N, 0);
52010 }
52011
52012 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
52013 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
52014 if (NewN0 || NewN1)
52015 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
52016 NewN1 ? NewN1 : N1);
52017 }
52018
52019 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
52020 if ((VT.getScalarSizeInBits() % 8) == 0 &&
52021 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
52022 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
52023 SDValue BitMask = N1;
52024 SDValue SrcVec = N0.getOperand(0);
52025 EVT SrcVecVT = SrcVec.getValueType();
52026
52027 // Check that the constant bitmask masks whole bytes.
52028 APInt UndefElts;
52029 SmallVector<APInt, 64> EltBits;
52030 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
52031 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
52032 llvm::all_of(EltBits, [](const APInt &M) {
52033 return M.isZero() || M.isAllOnes();
52034 })) {
52035 unsigned NumElts = SrcVecVT.getVectorNumElements();
52036 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
52037 unsigned Idx = N0.getConstantOperandVal(1);
52038
52039 // Create a root shuffle mask from the byte mask and the extracted index.
52040 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
52041 for (unsigned i = 0; i != Scale; ++i) {
52042 if (UndefElts[i])
52043 continue;
52044 int VecIdx = Scale * Idx + i;
52045 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
52046 }
52047
52048 if (SDValue Shuffle = combineX86ShufflesRecursively(
52049 {SrcVec}, 0, SrcVec.getOpcode(), SrcVec.getSimpleValueType(),
52050 ShuffleMask, {}, /*Depth=*/1, X86::MaxShuffleCombineDepth,
52051 /*AllowVariableCrossLaneMask=*/true,
52052 /*AllowVariablePerLaneMask=*/true,
52053 /*IsMaskedShuffle=*/false, DAG, SDLoc(SrcVec), Subtarget))
52054 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
52055 N0.getOperand(1));
52056 }
52057 }
52058
52059 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
52060 return R;
52061
52062 if (SDValue R = combineAndXorSubWithBMI(N, dl, DAG, Subtarget))
52063 return R;
52064
52065 return SDValue();
52066}
52067
52068// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
52069static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL,
52070 SelectionDAG &DAG,
52071 const X86Subtarget &Subtarget) {
52072 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52073
52074 MVT VT = N->getSimpleValueType(0);
52075 unsigned EltSizeInBits = VT.getScalarSizeInBits();
52076 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
52077 return SDValue();
52078
52079 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
52080 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
52081 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
52082 return SDValue();
52083
52084 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
52085 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
52086 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
52087 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
52088 return SDValue();
52089
52090 // Attempt to extract constant byte masks.
52091 APInt UndefElts0, UndefElts1;
52092 SmallVector<APInt, 32> EltBits0, EltBits1;
52093 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
52094 /*AllowWholeUndefs*/ false,
52095 /*AllowPartialUndefs*/ false))
52096 return SDValue();
52097 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
52098 /*AllowWholeUndefs*/ false,
52099 /*AllowPartialUndefs*/ false))
52100 return SDValue();
52101
52102 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
52103 // TODO - add UNDEF elts support.
52104 if (UndefElts0[i] || UndefElts1[i])
52105 return SDValue();
52106 if (EltBits0[i] != ~EltBits1[i])
52107 return SDValue();
52108 }
52109
52110 if (useVPTERNLOG(Subtarget, VT)) {
52111 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
52112 // VPTERNLOG is only available as vXi32/64-bit types.
52113 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
52114 MVT OpVT =
52115 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
52116 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
52117 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
52118 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
52119 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
52120 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
52121 DAG, Subtarget);
52122 return DAG.getBitcast(VT, Res);
52123 }
52124
52125 SDValue X = N->getOperand(0);
52126 SDValue Y =
52127 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
52128 DAG.getBitcast(VT, N1.getOperand(0)));
52129 return DAG.getNode(ISD::OR, DL, VT, X, Y);
52130}
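
// Why 0xCA encodes A ? B : C (a sketch): bit (A<<2 | B<<1 | C) of the
// VPTERNLOG immediate gives the result for that input triple.
//   A = 1: bits 7..4 = 1,1,0,0  (result follows B)
//   A = 0: bits 3..0 = 1,0,1,0  (result follows C)
// which reads 0b11001010 = 0xCA.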
52131
52132// Try to match OR(ANDNP(MASK,X),AND(MASK,Y)) logic pattern.
52133// TODO: Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
52134// Waiting for ANDNP combine allows other combines to happen that prevent
52135// matching.
52136static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
52137 using namespace SDPatternMatch;
52138 return sd_match(N, m_Or(m_BinOp(X86ISD::ANDNP, m_Value(Mask), m_Value(X)),
52139 m_And(m_Deferred(Mask), m_Value(Y))));
52140}
52141
52142// Try to fold:
52143// (or (and (m, y), (pandn m, x)))
52144// into:
52145// (vselect m, x, y)
52146// As a special case, try to fold:
52147// (or (and (m, (sub 0, x)), (pandn m, x)))
52148// into:
52149// (sub (xor X, M), M)
52150static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL,
52151 SelectionDAG &DAG,
52152 const X86Subtarget &Subtarget) {
52153 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52154
52155 EVT VT = N->getValueType(0);
52156 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
52157 (VT.is256BitVector() && Subtarget.hasInt256())))
52158 return SDValue();
52159
52160 SDValue X, Y, Mask;
52161 if (!matchLogicBlend(N, X, Y, Mask))
52162 return SDValue();
52163
52164 // Validate that X, Y, and Mask are bitcasts, and see through them.
52165 Mask = peekThroughBitcasts(Mask);
52168
52169 EVT MaskVT = Mask.getValueType();
52170 unsigned EltBits = MaskVT.getScalarSizeInBits();
52171
52172 // TODO: Attempt to handle floating point cases as well?
52173 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
52174 return SDValue();
52175
52176 // Attempt to combine to conditional negate: (sub (xor X, M), M)
52177 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
52178 DAG, Subtarget))
52179 return Res;
52180
52181 // PBLENDVB is only available on SSE 4.1.
52182 if (!Subtarget.hasSSE41())
52183 return SDValue();
52184
52185 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
52186 if (Subtarget.hasVLX())
52187 return SDValue();
52188
52189 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
52190
52191 X = DAG.getBitcast(BlendVT, X);
52192 Y = DAG.getBitcast(BlendVT, Y);
52193 Mask = DAG.getBitcast(BlendVT, Mask);
52194 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
52195 return DAG.getBitcast(VT, Mask);
52196}
52197
52198// Helper function for combineOrCmpEqZeroToCtlzSrl
52199// Transforms:
52200// seteq(cmp x, 0)
52201// into:
52202// srl(ctlz x), log2(bitsize(x))
52203// Input pattern is checked by caller.
52204 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
52205 SDValue Cmp = Op.getOperand(1);
52206 EVT VT = Cmp.getOperand(0).getValueType();
52207 unsigned Log2b = Log2_32(VT.getSizeInBits());
52208 SDLoc dl(Op);
52209 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
52210 // The result of the shift is true or false, and on X86, the 32-bit
52211 // encoding of shr and lzcnt is more desirable.
52212 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
52213 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
52214 DAG.getConstant(Log2b, dl, MVT::i8));
52215 return Scc;
52216}
52217
52218// Try to transform:
52219// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
52220// into:
52221// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
52222// Will also attempt to match more generic cases, eg:
52223// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
52224// Only applies if the target supports the FastLZCNT feature.
52225 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
52226 TargetLowering::DAGCombinerInfo &DCI,
52227 const X86Subtarget &Subtarget) {
52228 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
52229 return SDValue();
52230
52231 auto isORCandidate = [](SDValue N) {
52232 return (N->getOpcode() == ISD::OR && N->hasOneUse());
52233 };
52234
52235 // Check that the zero extend is extending to 32 bits or more. The code generated by
52236 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
52237 // instructions to clear the upper bits.
52238 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
52239 !isORCandidate(N->getOperand(0)))
52240 return SDValue();
52241
52242 // Check the node matches: setcc(eq, cmp 0)
52243 auto isSetCCCandidate = [](SDValue N) {
52244 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
52245 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
52246 N->getOperand(1).getOpcode() == X86ISD::CMP &&
52247 isNullConstant(N->getOperand(1).getOperand(1)) &&
52248 N->getOperand(1).getValueType().bitsGE(MVT::i32);
52249 };
52250
52251 SDNode *OR = N->getOperand(0).getNode();
52252 SDValue LHS = OR->getOperand(0);
52253 SDValue RHS = OR->getOperand(1);
52254
52255 // Save nodes matching or(or, setcc(eq, cmp 0)).
52256 SmallVector<SDNode *, 2> ORNodes;
52257 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
52258 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
52259 ORNodes.push_back(OR);
52260 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
52261 LHS = OR->getOperand(0);
52262 RHS = OR->getOperand(1);
52263 }
52264
52265 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
52266 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
52267 !isORCandidate(SDValue(OR, 0)))
52268 return SDValue();
52269
52270 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
52271 // to
52272 // or(srl(ctlz),srl(ctlz)).
52273 // The dag combiner can then fold it into:
52274 // srl(or(ctlz, ctlz)).
52275 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
52276 SDValue Ret, NewRHS;
52277 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
52278 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
52279
52280 if (!Ret)
52281 return SDValue();
52282
52283 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
52284 while (!ORNodes.empty()) {
52285 OR = ORNodes.pop_back_val();
52286 LHS = OR->getOperand(0);
52287 RHS = OR->getOperand(1);
52288 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
52289 if (RHS->getOpcode() == ISD::OR)
52290 std::swap(LHS, RHS);
52291 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
52292 if (!NewRHS)
52293 return SDValue();
52294 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
52295 }
52296
52297 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
52298}
52299
52300/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52301/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52302/// with CMP+{ADC, SBB}.
52303/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
52304static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
52305 SDValue X, SDValue Y,
52306 SelectionDAG &DAG,
52307 bool ZeroSecondOpOnly = false) {
52308 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
52309 return SDValue();
52310
52311 // Look through a one-use zext.
52312 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
52313 Y = Y.getOperand(0);
52314
52315 X86::CondCode CC;
52316 SDValue EFLAGS;
52317 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
52318 CC = (X86::CondCode)Y.getConstantOperandVal(0);
52319 EFLAGS = Y.getOperand(1);
52320 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
52321 Y.hasOneUse()) {
52322 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
52323 }
52324
52325 if (!EFLAGS)
52326 return SDValue();
52327
52328 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52329 // the general case below.
52330 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
52331 if (ConstantX && !ZeroSecondOpOnly) {
52332 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
52333 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
52334 // This is a complicated way to get -1 or 0 from the carry flag:
52335 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52336 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52337 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52338 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52339 EFLAGS);
52340 }
52341
52342 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
52343 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
52344 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
52345 EFLAGS.getValueType().isInteger() &&
52346 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52347 // Swap the operands of a SUB, and we have the same pattern as above.
52348 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
52349 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
52350 SDValue NewSub = DAG.getNode(
52351 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52352 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52353 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
52354 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52355 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52356 NewEFLAGS);
52357 }
52358 }
52359 }
52360
52361 if (CC == X86::COND_B) {
52362 // X + SETB Z --> adc X, 0
52363 // X - SETB Z --> sbb X, 0
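// e.g. (add X, (setcc ult A, B)) becomes (adc X, 0) reusing the carry flag
// produced by (cmp A, B).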
52364 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52365 DAG.getVTList(VT, MVT::i32), X,
52366 DAG.getConstant(0, DL, VT), EFLAGS);
52367 }
52368
52369 if (ZeroSecondOpOnly)
52370 return SDValue();
52371
52372 if (CC == X86::COND_A) {
52373 // Try to convert COND_A into COND_B in an attempt to facilitate
52374 // materializing "setb reg".
52375 //
52376 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
52377 // cannot take an immediate as its first operand.
52378 //
52379 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52380 EFLAGS.getValueType().isInteger() &&
52381 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52382 SDValue NewSub =
52383 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52384 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52385 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52386 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52387 DAG.getVTList(VT, MVT::i32), X,
52388 DAG.getConstant(0, DL, VT), NewEFLAGS);
52389 }
52390 }
52391
52392 if (CC == X86::COND_AE) {
52393 // X + SETAE --> sbb X, -1
52394 // X - SETAE --> adc X, -1
52395 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52396 DAG.getVTList(VT, MVT::i32), X,
52397 DAG.getAllOnesConstant(DL, VT), EFLAGS);
52398 }
52399
52400 if (CC == X86::COND_BE) {
52401 // X + SETBE --> sbb X, -1
52402 // X - SETBE --> adc X, -1
52403 // Try to convert COND_BE into COND_AE in an attempt to facilitate
52404 // materializing "setae reg".
52405 //
52406 // Do not flip "e <= c", where "c" is a constant, because the Cmp instruction
52407 // cannot take an immediate as its first operand.
52408 //
52409 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52410 EFLAGS.getValueType().isInteger() &&
52411 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52412 SDValue NewSub =
52413 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52414 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52415 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52416 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52417 DAG.getVTList(VT, MVT::i32), X,
52418 DAG.getAllOnesConstant(DL, VT), NewEFLAGS);
52419 }
52420 }
52421
52422 if (CC != X86::COND_E && CC != X86::COND_NE)
52423 return SDValue();
52424
52425 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
52426 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
52427 !EFLAGS.getOperand(0).getValueType().isInteger())
52428 return SDValue();
52429
52430 SDValue Z = EFLAGS.getOperand(0);
52431 EVT ZVT = Z.getValueType();
52432
52433 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52434 // the general case below.
52435 if (ConstantX) {
52436 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
52437 // fake operands:
52438 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
52439 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
52440 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
52441 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
52442 SDValue Zero = DAG.getConstant(0, DL, ZVT);
52443 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52444 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
52445 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52446 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52447 SDValue(Neg.getNode(), 1));
52448 }
52449
52450 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
52451 // with fake operands:
52452 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
52453 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
52454 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
52455 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
52456 SDValue One = DAG.getConstant(1, DL, ZVT);
52457 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52458 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52459 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52460 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52461 Cmp1.getValue(1));
52462 }
52463 }
52464
52465 // (cmp Z, 1) sets the carry flag if Z is 0.
52466 SDValue One = DAG.getConstant(1, DL, ZVT);
52467 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52468 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52469
52470 // Add the flags type for ADC/SBB nodes.
52471 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52472
52473 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
52474 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
52475 if (CC == X86::COND_NE)
52476 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
52477 DAG.getAllOnesConstant(DL, VT), Cmp1.getValue(1));
52478
52479 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
52480 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
52481 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
52482 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
52483}
52484
52485/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52486/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52487/// with CMP+{ADC, SBB}.
52488 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL,
52489 SelectionDAG &DAG) {
52490 bool IsSub = N->getOpcode() == ISD::SUB;
52491 SDValue X = N->getOperand(0);
52492 SDValue Y = N->getOperand(1);
52493 EVT VT = N->getValueType(0);
52494
52495 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
52496 return ADCOrSBB;
52497
52498 // Commute and try again (negate the result for subtracts).
52499 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
52500 if (IsSub)
52501 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
52502 return ADCOrSBB;
52503 }
52504
52505 return SDValue();
52506}
52507
52508static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT,
52509 SDValue N0, SDValue N1,
52510 SelectionDAG &DAG) {
52511 assert((Opc == ISD::XOR || Opc == ISD::OR) && "Unexpected opcode");
52512
52513 // Delegate to combineAddOrSubToADCOrSBB if we have:
52514 //
52515 // (xor/or (zero_extend (setcc)) imm)
52516 //
52517 // where imm is odd if and only if we have xor, in which case the XOR/OR are
52518 // equivalent to a SUB/ADD, respectively.
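// (The zext(setcc) is 0 or 1, so with an odd imm C, C ^ setcc == C - setcc,
// and with an even imm C, C | setcc == C + setcc.)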
52519 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
52520 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
52521 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
52522 bool IsSub = Opc == ISD::XOR;
52523 bool N1COdd = N1C->getZExtValue() & 1;
52524 if (IsSub ? N1COdd : !N1COdd)
52525 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
52526 return R;
52527 }
52528 }
52529
52530 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
52531 if (Opc == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
52532 N0.getOperand(0).getOpcode() == ISD::AND &&
52533 ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
52534 ISD::isBuildVectorAllOnes(N1.getNode()) &&
52535 isConstantPowerOf2(N0.getOperand(0).getOperand(1),
52536 VT.getScalarSizeInBits(), /*AllowUndefs=*/true)) {
52537 return DAG.getNode(X86ISD::PCMPEQ, DL, VT, N0.getOperand(0),
52538 N0.getOperand(0).getOperand(1));
52539 }
52540
52541 return SDValue();
52542}
52543
52544 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
52545 TargetLowering::DAGCombinerInfo &DCI,
52546 const X86Subtarget &Subtarget) {
52547 SDValue N0 = N->getOperand(0);
52548 SDValue N1 = N->getOperand(1);
52549 EVT VT = N->getValueType(0);
52550 SDLoc dl(N);
52551 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52552
52553 // If this is SSE1 only convert to FOR to avoid scalarization.
52554 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52555 return DAG.getBitcast(MVT::v4i32,
52556 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
52557 DAG.getBitcast(MVT::v4f32, N0),
52558 DAG.getBitcast(MVT::v4f32, N1)));
52559 }
52560
52561 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
52562 // TODO: Support multiple SrcOps.
52563 if (VT == MVT::i1) {
52564 SmallVector<SDValue, 2> SrcOps;
52565 SmallVector<APInt, 2> SrcPartials;
52566 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
52567 SrcOps.size() == 1) {
52568 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
52569 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
52570 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
52571 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
52572 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
52573 if (Mask) {
52574 assert(SrcPartials[0].getBitWidth() == NumElts &&
52575 "Unexpected partial reduction mask");
52576 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
52577 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
52578 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
52579 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
52580 }
52581 }
52582 }
52583
52584 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
52585 return SetCC;
52586
52587 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
52588 return R;
52589
52590 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
52591 return R;
52592
52593 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
52594 return R;
52595
52596 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
52597 DAG, DCI, Subtarget))
52598 return FPLogic;
52599
52600 if (DCI.isBeforeLegalizeOps())
52601 return SDValue();
52602
52603 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
52604 return R;
52605
52606 if (SDValue R = canonicalizeBitSelect(N, dl, DAG, Subtarget))
52607 return R;
52608
52609 if (SDValue R = combineLogicBlendIntoPBLENDV(N, dl, DAG, Subtarget))
52610 return R;
52611
52612 // Combine `(x86isd::setcc_carry) | C` and `(0 - SetCC) | C`
52613 // into `(zext (not SetCC)) * (C + 1) - 1` if we can get a LEA out of it.
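// When the condition holds N0 is all-ones so the OR yields -1; otherwise it
// yields C. (zext(!cond)) * (C + 1) - 1 produces those same two values.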
52614 if ((VT == MVT::i32 || VT == MVT::i64) && N0.hasOneUse()) {
52615 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
52616 uint64_t Val = CN->getZExtValue();
52617 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 ||
52618 Val == 8) {
52619 SDValue NotCond;
52620 if (N0.getOpcode() == X86ISD::SETCC_CARRY &&
52621 N0.getOperand(1).hasOneUse()) {
52622 X86::CondCode CC = (X86::CondCode)N0.getConstantOperandVal(0);
52623 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
52624 NotCond = getSETCC(NewCC, N0.getOperand(1), SDLoc(N0), DAG);
52625 } else if (N0.getOpcode() == ISD::SUB &&
52626 isNullConstant(N0.getOperand(0))) {
52627 SDValue Cond = N0.getOperand(1);
52628 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
52629 Cond = Cond.getOperand(0);
52630 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
52631 X86::CondCode OldCC = (X86::CondCode)Cond.getConstantOperandVal(0);
52632 X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
52633 NotCond = getSETCC(NewCC, Cond.getOperand(1), SDLoc(Cond), DAG);
52634 }
52635 }
52636
52637 if (NotCond) {
52638 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
52639 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
52640 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
52641 return R;
52642 }
52643 }
52644 }
52645 }
52646
52647 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
52648 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
52649 // iff the upper elements of the non-shifted arg are zero.
52650 // KUNPCK requires 16+ bool vector elements.
52651 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
52652 unsigned NumElts = VT.getVectorNumElements();
52653 unsigned HalfElts = NumElts / 2;
52654 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
52655 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
52656 N1.getConstantOperandAPInt(1) == HalfElts &&
52657 DAG.MaskedVectorIsZero(N0, UpperElts)) {
52658 return DAG.getNode(
52659 ISD::CONCAT_VECTORS, dl, VT,
52660 extractSubVector(N0, 0, DAG, dl, HalfElts),
52661 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
52662 }
52663 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
52664 N0.getConstantOperandAPInt(1) == HalfElts &&
52665 DAG.MaskedVectorIsZero(N1, UpperElts)) {
52666 return DAG.getNode(
52667 ISD::CONCAT_VECTORS, dl, VT,
52668 extractSubVector(N1, 0, DAG, dl, HalfElts),
52669 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
52670 }
52671 }
52672
52673 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52674 // Attempt to recursively combine an OR of shuffles.
52675 SDValue Op(N, 0);
52676 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52677 return Res;
52678
52679 // If either operand is a constant mask, then only the elements that aren't
52680 // allones are actually demanded by the other operand.
52681 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
52682 APInt UndefElts;
52683 SmallVector<APInt> EltBits;
52684 int NumElts = VT.getVectorNumElements();
52685 int EltSizeInBits = VT.getScalarSizeInBits();
52686 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
52687 return false;
52688
52689 APInt DemandedElts = APInt::getZero(NumElts);
52690 for (int I = 0; I != NumElts; ++I)
52691 if (!EltBits[I].isAllOnes())
52692 DemandedElts.setBit(I);
52693
52694 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
52695 };
52696 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
52697 if (N->getOpcode() != ISD::DELETED_NODE)
52698 DCI.AddToWorklist(N);
52699 return SDValue(N, 0);
52700 }
52701 }
52702
52703 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
52704 return R;
52705
52706 return SDValue();
52707}
52708
52709/// Try to turn tests against the signbit in the form of:
52710/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
52711/// into:
52712/// SETGT(X, -1)
52713 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL,
52714 SelectionDAG &DAG) {
52715 // This is only worth doing if the output type is i8 or i1.
52716 EVT ResultType = N->getValueType(0);
52717 if (ResultType != MVT::i8 && ResultType != MVT::i1)
52718 return SDValue();
52719
52720 SDValue N0 = N->getOperand(0);
52721 SDValue N1 = N->getOperand(1);
52722
52723 // We should be performing an xor against a truncated shift.
52724 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
52725 return SDValue();
52726
52727 // Make sure we are performing an xor against one.
52728 if (!isOneConstant(N1))
52729 return SDValue();
52730
52731 // SetCC on x86 zero extends so only act on this if it's a logical shift.
52732 SDValue Shift = N0.getOperand(0);
52733 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
52734 return SDValue();
52735
52736 // Make sure we are truncating from one of i16, i32 or i64.
52737 EVT ShiftTy = Shift.getValueType();
52738 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
52739 return SDValue();
52740
52741 // Make sure the shift amount extracts the sign bit.
52742 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
52743 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
52744 return SDValue();
52745
52746 // Create a greater-than comparison against -1.
52747 // N.B. Using SETGE against 0 works but we want a canonical looking
52748 // comparison; using SETGT matches up with what TranslateX86CC does.
52749 SDValue ShiftOp = Shift.getOperand(0);
52750 EVT ShiftOpTy = ShiftOp.getValueType();
52751 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52752 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
52753 *DAG.getContext(), ResultType);
52754 SDValue Cond =
52755 DAG.getSetCC(DL, SetCCResultType, ShiftOp,
52756 DAG.getAllOnesConstant(DL, ShiftOpTy), ISD::SETGT);
52757 if (SetCCResultType != ResultType)
52758 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
52759 return Cond;
52760}
52761
52762/// Turn vector tests of the signbit in the form of:
52763/// xor (sra X, elt_size(X)-1), -1
52764/// into:
52765/// pcmpgt X, -1
52766///
52767/// This should be called before type legalization because the pattern may not
52768/// persist after that.
52769 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
52770 const X86Subtarget &Subtarget) {
52771 EVT VT = N->getValueType(0);
52772 if (!VT.isSimple())
52773 return SDValue();
52774
52775 switch (VT.getSimpleVT().SimpleTy) {
52776 // clang-format off
52777 default: return SDValue();
52778 case MVT::v16i8:
52779 case MVT::v8i16:
52780 case MVT::v4i32:
52781 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
52782 case MVT::v32i8:
52783 case MVT::v16i16:
52784 case MVT::v8i32:
52785 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
52786 // clang-format on
52787 }
52788
52789 // There must be a shift right algebraic before the xor, and the xor must be a
52790 // 'not' operation.
52791 SDValue Shift = N->getOperand(0);
52792 SDValue Ones = N->getOperand(1);
52793 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
52794 !ISD::isBuildVectorAllOnes(Ones.getNode()))
52795 return SDValue();
52796
52797 // The shift should be smearing the sign bit across each vector element.
52798 auto *ShiftAmt =
52799 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
52800 if (!ShiftAmt ||
52801 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
52802 return SDValue();
52803
52804 // Create a greater-than comparison against -1. We don't use the more obvious
52805 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
52806 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
52807}
52808
52809/// Detect patterns of truncation with unsigned saturation:
52810///
52811/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
52812/// Return the source value x to be truncated or SDValue() if the pattern was
52813/// not matched.
52814///
52815/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
52816/// where C1 >= 0 and C2 is unsigned max of destination type.
52817///
52818/// (truncate (smax (smin (x, C2), C1)) to dest_type)
52819/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
52820///
52821/// These two patterns are equivalent to:
52822/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
52823/// So return the smax(x, C1) value to be truncated or SDValue() if the
52824/// pattern was not matched.
52825 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
52826 const SDLoc &DL) {
52827 using namespace llvm::SDPatternMatch;
52828 EVT InVT = In.getValueType();
52829
52830 // Saturation with truncation. We truncate from InVT to VT.
52831 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
52832 "Unexpected types for truncate operation");
52833
52834 APInt C1, C2;
52835 SDValue UMin, SMin, SMax;
52836
52837 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
52838 // the element size of the destination type.
52839 if (sd_match(In, m_UMin(m_Value(UMin), m_ConstInt(C2))) &&
52840 C2.isMask(VT.getScalarSizeInBits()))
52841 return UMin;
52842
52843 if (sd_match(In, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52844 sd_match(SMin, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52845 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
52846 return SMin;
52847
52848 if (sd_match(In, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52849 sd_match(SMax, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52850 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && C2.uge(C1))
52851 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
52852
52853 return SDValue();
52854}
52855
52856/// Detect patterns of truncation with signed saturation:
52857/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
52858/// signed_max_of_dest_type)) to dest_type)
52859/// or:
52860/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
52861/// signed_min_of_dest_type)) to dest_type).
52862/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
52863/// Return the source value to be truncated or SDValue() if the pattern was not
52864/// matched.
52865static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
52866 using namespace llvm::SDPatternMatch;
52867 unsigned NumDstBits = VT.getScalarSizeInBits();
52868 unsigned NumSrcBits = In.getScalarValueSizeInBits();
52869 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
52870
52871 APInt SignedMax, SignedMin;
52872 if (MatchPackUS) {
52873 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
52874 SignedMin = APInt::getZero(NumSrcBits);
52875 } else {
52876 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
52877 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
52878 }
52879
52880 SDValue SMin, SMax;
52881 if (sd_match(In, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))) &&
52882 sd_match(SMin, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))))
52883 return SMax;
52884
52885 if (sd_match(In, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))) &&
52886 sd_match(SMax, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))))
52887 return SMin;
52888
52889 return SDValue();
52890}
52891
52892 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
52893 SelectionDAG &DAG,
52894 const X86Subtarget &Subtarget) {
52895 if (!Subtarget.hasSSE2() || !VT.isVector())
52896 return SDValue();
52897
52898 EVT SVT = VT.getVectorElementType();
52899 EVT InVT = In.getValueType();
52900 EVT InSVT = InVT.getVectorElementType();
52901
52902 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
52903 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
52904 // and concatenate at the same time. Then we can use a final vpmovuswb to
52905 // clip to 0-255.
52906 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
52907 InVT == MVT::v16i32 && VT == MVT::v16i8) {
52908 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52909 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
52910 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
52911 DL, DAG, Subtarget);
52912 assert(Mid && "Failed to pack!");
52913 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
52914 }
52915 }
52916
52917 // vXi32 truncate instructions are available with AVX512F.
52918 // vXi16 truncate instructions are only available with AVX512BW.
52919 // For 256-bit or smaller vectors, we require VLX.
52920 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
52921 // If the result type is 256 bits or larger and we have disabled 512-bit
52922 // registers, we should go ahead and use the pack instructions if possible.
52923 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
52924 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
52925 (InVT.getSizeInBits() > 128) &&
52926 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
52927 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
52928
52929 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
52930 isPowerOf2_32(VT.getVectorNumElements()) &&
52931 (SVT == MVT::i8 || SVT == MVT::i16) &&
52932 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
52933 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52934 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
52935 if (SVT == MVT::i8 && InSVT == MVT::i32) {
52936 EVT MidVT = VT.changeVectorElementType(MVT::i16);
52937 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
52938 DAG, Subtarget);
52939 assert(Mid && "Failed to pack!");
52940 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
52941 Subtarget);
52942 assert(V && "Failed to pack!");
52943 return V;
52944 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
52945 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
52946 Subtarget);
52947 }
52948 if (SDValue SSatVal = detectSSatPattern(In, VT))
52949 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
52950 Subtarget);
52951 }
52952
52953 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52954 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
52955 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
52956 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
52957 unsigned TruncOpc = 0;
52958 SDValue SatVal;
52959 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
52960 SatVal = SSatVal;
52961 TruncOpc = X86ISD::VTRUNCS;
52962 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
52963 SatVal = USatVal;
52964 TruncOpc = X86ISD::VTRUNCUS;
52965 }
52966 if (SatVal) {
52967 unsigned ResElts = VT.getVectorNumElements();
52968 // If the input type is less than 512 bits and we don't have VLX, we need
52969 // to widen to 512 bits.
52970 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
52971 unsigned NumConcats = 512 / InVT.getSizeInBits();
52972 ResElts *= NumConcats;
52973 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
52974 ConcatOps[0] = SatVal;
52975 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
52976 NumConcats * InVT.getVectorNumElements());
52977 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
52978 }
52979 // Widen the result if it's narrower than 128 bits.
52980 if (ResElts * SVT.getSizeInBits() < 128)
52981 ResElts = 128 / SVT.getSizeInBits();
52982 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
52983 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
52984 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
52985 DAG.getVectorIdxConstant(0, DL));
52986 }
52987 }
52988
52989 return SDValue();
52990}
52991
52992 static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
52993 SelectionDAG &DAG,
52994 TargetLowering::DAGCombinerInfo &DCI,
52995 const X86Subtarget &Subtarget) {
52996 auto *Ld = cast<LoadSDNode>(N);
52997 EVT RegVT = Ld->getValueType(0);
52998 SDValue Ptr = Ld->getBasePtr();
52999 SDValue Chain = Ld->getChain();
53000 ISD::LoadExtType Ext = Ld->getExtensionType();
53001
53002 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
53003 return SDValue();
53004
53005 if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
53006 return SDValue();
53007
53008 const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
53009 if (!LdC)
53010 return SDValue();
53011
53012 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
53013 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
53014 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
53015 if (Undefs[I])
53016 continue;
53017 if (UserUndefs[I] || Bits[I] != UserBits[I])
53018 return false;
53019 }
53020 return true;
53021 };
53022
53023 // Look through all other loads/broadcasts in the chain for another constant
53024 // pool entry.
53025 for (SDNode *User : Chain->users()) {
53026 auto *UserLd = dyn_cast<MemSDNode>(User);
53027 if (User != N && UserLd &&
53028 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
53029 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
53030 ISD::isNormalLoad(User)) &&
53031 UserLd->getChain() == Chain && User->hasAnyUseOfValue(0) &&
53032 User->getValueSizeInBits(0).getFixedValue() >
53033 RegVT.getFixedSizeInBits()) {
53034 EVT UserVT = User->getValueType(0);
53035 SDValue UserPtr = UserLd->getBasePtr();
53036 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
53037
53038 // See if we are loading a constant that matches in the lower
53039 // bits of a longer constant (but from a different constant pool ptr).
53040 if (UserC && UserPtr != Ptr) {
53041 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
53042 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
53043 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
53044 APInt Undefs, UserUndefs;
53045 SmallVector<APInt> Bits, UserBits;
53046 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
53047 UserVT.getScalarSizeInBits());
53048 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
53049 Bits) &&
53050 getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
53051 UserUndefs, UserBits)) {
53052 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
53054 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53055 RegVT.getSizeInBits());
53056 Extract = DAG.getBitcast(RegVT, Extract);
53057 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53058 }
53059 }
53060 }
53061 }
53062 }
53063 }
53064
53065 return SDValue();
53066}
53067
53068 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
53069 TargetLowering::DAGCombinerInfo &DCI,
53070 const X86Subtarget &Subtarget) {
53071 auto *Ld = cast<LoadSDNode>(N);
53072 EVT RegVT = Ld->getValueType(0);
53073 EVT MemVT = Ld->getMemoryVT();
53074 SDLoc dl(Ld);
53075 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53076
53077 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
53078 // into two 16-byte operations. Also split non-temporal aligned loads on
53079 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
53080 ISD::LoadExtType Ext = Ld->getExtensionType();
53081 unsigned Fast;
53082 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
53083 Ext == ISD::NON_EXTLOAD &&
53084 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
53085 Ld->getAlign() >= Align(16)) ||
53086 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
53087 *Ld->getMemOperand(), &Fast) &&
53088 !Fast))) {
53089 unsigned NumElems = RegVT.getVectorNumElements();
53090 if (NumElems < 2)
53091 return SDValue();
53092
53093 unsigned HalfOffset = 16;
53094 SDValue Ptr1 = Ld->getBasePtr();
53095 SDValue Ptr2 =
53096 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
53097 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
53098 NumElems / 2);
53099 SDValue Load1 =
53100 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
53101 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53102 SDValue Load2 =
53103 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
53104 Ld->getPointerInfo().getWithOffset(HalfOffset),
53105 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53106 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
53107 Load1.getValue(1), Load2.getValue(1));
53108
53109 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
53110 return DCI.CombineTo(N, NewVec, TF, true);
53111 }
53112
53113 // Bool vector load - attempt to cast to an integer, as we have good
53114 // (vXiY *ext(vXi1 bitcast(iX))) handling.
53115 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
53116 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
53117 unsigned NumElts = RegVT.getVectorNumElements();
53118 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
53119 if (TLI.isTypeLegal(IntVT)) {
53120 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
53121 Ld->getPointerInfo(), Ld->getBaseAlign(),
53122 Ld->getMemOperand()->getFlags());
53123 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
53124 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
53125 }
53126 }
53127
53128 // If we also broadcast this vector to a wider type, then just extract the
53129 // lowest subvector.
53130 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
53131 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
53132 SDValue Ptr = Ld->getBasePtr();
53133 SDValue Chain = Ld->getChain();
53134 for (SDNode *User : Chain->users()) {
53135 auto *UserLd = dyn_cast<MemSDNode>(User);
53136 if (User != N && UserLd &&
53137 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
53138 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
53139 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
53140 User->hasAnyUseOfValue(0) &&
53141 User->getValueSizeInBits(0).getFixedValue() >
53142 RegVT.getFixedSizeInBits()) {
53144 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53145 RegVT.getSizeInBits());
53146 Extract = DAG.getBitcast(RegVT, Extract);
53147 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53148 }
53149 }
53150 }
53151
53152 if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
53153 return V;
53154
53155 // Cast ptr32 and ptr64 pointers to the default address space before a load.
53156 unsigned AddrSpace = Ld->getAddressSpace();
53157 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53158 AddrSpace == X86AS::PTR32_UPTR) {
53159 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53160 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
53161 SDValue Cast =
53162 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
53163 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
53164 Ld->getPointerInfo(), MemVT, Ld->getBaseAlign(),
53165 Ld->getMemOperand()->getFlags());
53166 }
53167 }
53168
53169 return SDValue();
53170}
53171
53172/// If V is a build vector of boolean constants and exactly one of those
53173/// constants is true, return the operand index of that true element.
53174/// Otherwise, return -1.
53175static int getOneTrueElt(SDValue V) {
53176 // This needs to be a build vector of booleans.
53177 // TODO: Checking for the i1 type matches the IR definition for the mask,
53178 // but the mask check could be loosened to i8 or other types. That might
53179 // also require checking more than 'allOnesValue'; eg, the x86 HW
53180 // instructions only require that the MSB is set for each mask element.
53181 // The ISD::MSTORE comments/definition do not specify how the mask operand
53182 // is formatted.
53183 auto *BV = dyn_cast<BuildVectorSDNode>(V);
53184 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
53185 return -1;
53186
53187 int TrueIndex = -1;
53188 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
53189 for (unsigned i = 0; i < NumElts; ++i) {
53190 const SDValue &Op = BV->getOperand(i);
53191 if (Op.isUndef())
53192 continue;
53193 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
53194 if (!ConstNode)
53195 return -1;
53196 if (ConstNode->getAPIntValue().countr_one() >= 1) {
53197 // If we already found a one, this is too many.
53198 if (TrueIndex >= 0)
53199 return -1;
53200 TrueIndex = i;
53201 }
53202 }
53203 return TrueIndex;
53204}
53205
53206/// Given a masked memory load/store operation, return true if it has one mask
53207/// bit set. If it has one mask bit set, then also return the memory address of
53208/// the scalar element to load/store, the vector index to insert/extract that
53209/// scalar element, and the alignment for the scalar memory access.
53210 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
53211 SelectionDAG &DAG, SDValue &Addr,
53212 SDValue &Index, Align &Alignment,
53213 unsigned &Offset) {
53214 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
53215 if (TrueMaskElt < 0)
53216 return false;
53217
53218 // Get the address of the one scalar element that is specified by the mask
53219 // using the appropriate offset from the base pointer.
53220 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
53221 Offset = 0;
53222 Addr = MaskedOp->getBasePtr();
53223 if (TrueMaskElt != 0) {
53224 Offset = TrueMaskElt * EltVT.getStoreSize();
53225 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
53226 SDLoc(MaskedOp));
53227 }
53228
53229 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
53230 Alignment = commonAlignment(MaskedOp->getBaseAlign(), EltVT.getStoreSize());
53231 return true;
53232}
53233
53234/// If exactly one element of the mask is set for a non-extending masked load,
53235/// it is a scalar load and vector insert.
53236/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53237/// mask have already been optimized in IR, so we don't bother with those here.
53238static SDValue
53239 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53240 TargetLowering::DAGCombinerInfo &DCI,
53241 const X86Subtarget &Subtarget) {
53242 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53243 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53244 // However, some target hooks may need to be added to know when the transform
53245 // is profitable. Endianness would also have to be considered.
53246
53247 SDValue Addr, VecIndex;
53248 Align Alignment;
53249 unsigned Offset;
53250 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
53251 return SDValue();
53252
53253 // Load the one scalar element that is specified by the mask using the
53254 // appropriate offset from the base pointer.
53255 SDLoc DL(ML);
53256 EVT VT = ML->getValueType(0);
53257 EVT EltVT = VT.getVectorElementType();
53258
53259 EVT CastVT = VT;
53260 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53261 EltVT = MVT::f64;
53262 CastVT = VT.changeVectorElementType(EltVT);
53263 }
53264
53265 SDValue Load =
53266 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
53267 ML->getPointerInfo().getWithOffset(Offset),
53268 Alignment, ML->getMemOperand()->getFlags());
53269
53270 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
53271
53272 // Insert the loaded element into the appropriate place in the vector.
53273 SDValue Insert =
53274 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
53275 Insert = DAG.getBitcast(VT, Insert);
53276 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
53277}
53278
53279static SDValue
53280 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53281 TargetLowering::DAGCombinerInfo &DCI) {
53282 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53283 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
53284 return SDValue();
53285
53286 SDLoc DL(ML);
53287 EVT VT = ML->getValueType(0);
53288
53289 // If we are loading the first and last elements of a vector, it is safe and
53290 // always faster to load the whole vector. Replace the masked load with a
53291 // vector load and select.
53292 unsigned NumElts = VT.getVectorNumElements();
53293 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
53294 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
53295 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
53296 if (LoadFirstElt && LoadLastElt) {
53297 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
53298 ML->getMemOperand());
53299 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
53300 ML->getPassThru());
53301 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
53302 }
53303
53304 // Convert a masked load with a constant mask into a masked load and a select.
53305 // This allows the select operation to use a faster kind of select instruction
53306 // (for example, vblendvps -> vblendps).
53307
53308 // Don't try this if the pass-through operand is already undefined. That would
53309 // cause an infinite loop because that's what we're about to create.
53310 if (ML->getPassThru().isUndef())
53311 return SDValue();
53312
53313 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
53314 return SDValue();
53315
53316 // The new masked load has an undef pass-through operand. The select uses the
53317 // original pass-through operand.
53318 SDValue NewML = DAG.getMaskedLoad(
53319 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
53320 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
53321 ML->getAddressingMode(), ML->getExtensionType());
53322 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
53323 ML->getPassThru());
53324
53325 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
53326}
53327
53328 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
53329 TargetLowering::DAGCombinerInfo &DCI,
53330 const X86Subtarget &Subtarget) {
53331 auto *Mld = cast<MaskedLoadSDNode>(N);
53332
53333 // TODO: Expanding load with constant mask may be optimized as well.
53334 if (Mld->isExpandingLoad())
53335 return SDValue();
53336
53337 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
53338 if (SDValue ScalarLoad =
53339 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
53340 return ScalarLoad;
53341
53342 // TODO: Do some AVX512 subsets benefit from this transform?
53343 if (!Subtarget.hasAVX512())
53344 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
53345 return Blend;
53346 }
53347
53348 // If the mask value has been legalized to a non-boolean vector, try to
53349 // simplify ops leading up to it. We only demand the MSB of each lane.
53350 SDValue Mask = Mld->getMask();
53351 if (Mask.getScalarValueSizeInBits() != 1) {
53352 EVT VT = Mld->getValueType(0);
53353 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53354 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53355 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53356 if (N->getOpcode() != ISD::DELETED_NODE)
53357 DCI.AddToWorklist(N);
53358 return SDValue(N, 0);
53359 }
53360 if (SDValue NewMask =
53361 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53362 return DAG.getMaskedLoad(
53363 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
53364 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
53365 Mld->getAddressingMode(), Mld->getExtensionType());
53366 }
53367
53368 return SDValue();
53369}
53370
53371/// If exactly one element of the mask is set for a non-truncating masked store,
53372/// it is a vector extract and scalar store.
53373/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53374/// mask have already been optimized in IR, so we don't bother with those here.
53375 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
53376 SelectionDAG &DAG,
53377 const X86Subtarget &Subtarget) {
53378 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53379 // However, some target hooks may need to be added to know when the transform
53380 // is profitable. Endianness would also have to be considered.
53381
53382 SDValue Addr, VecIndex;
53383 Align Alignment;
53384 unsigned Offset;
53385 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
53386 return SDValue();
53387
53388 // Extract the one scalar element that is actually being stored.
53389 SDLoc DL(MS);
53390 SDValue Value = MS->getValue();
53391 EVT VT = Value.getValueType();
53392 EVT EltVT = VT.getVectorElementType();
53393 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53394 EltVT = MVT::f64;
53395 EVT CastVT = VT.changeVectorElementType(EltVT);
53396 Value = DAG.getBitcast(CastVT, Value);
53397 }
53398 SDValue Extract =
53399 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
53400
53401 // Store that element at the appropriate offset from the base pointer.
53402 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
53403 MS->getPointerInfo().getWithOffset(Offset),
53404 Alignment, MS->getMemOperand()->getFlags());
53405}
53406
53407 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
53408 TargetLowering::DAGCombinerInfo &DCI,
53409 const X86Subtarget &Subtarget) {
53410 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
53411 if (Mst->isCompressingStore())
53412 return SDValue();
53413
53414 EVT VT = Mst->getValue().getValueType();
53415 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53416
53417 if (Mst->isTruncatingStore())
53418 return SDValue();
53419
53420 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
53421 return ScalarStore;
53422
53423 // If the mask value has been legalized to a non-boolean vector, try to
53424 // simplify ops leading up to it. We only demand the MSB of each lane.
53425 SDValue Mask = Mst->getMask();
53426 if (Mask.getScalarValueSizeInBits() != 1) {
53427 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53428 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53429 if (N->getOpcode() != ISD::DELETED_NODE)
53430 DCI.AddToWorklist(N);
53431 return SDValue(N, 0);
53432 }
53433 if (SDValue NewMask =
53434 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53435 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
53436 Mst->getBasePtr(), Mst->getOffset(), NewMask,
53437 Mst->getMemoryVT(), Mst->getMemOperand(),
53438 Mst->getAddressingMode());
53439 }
53440
53441 SDValue Value = Mst->getValue();
53442 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
53443 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
53444 Mst->getMemoryVT())) {
53445 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
53446 Mst->getBasePtr(), Mst->getOffset(), Mask,
53447 Mst->getMemoryVT(), Mst->getMemOperand(),
53448 Mst->getAddressingMode(), true);
53449 }
53450
53451 return SDValue();
53452}
53453
53454 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
53455 TargetLowering::DAGCombinerInfo &DCI,
53456 const X86Subtarget &Subtarget) {
53457 StoreSDNode *St = cast<StoreSDNode>(N);
53458 EVT StVT = St->getMemoryVT();
53459 SDLoc dl(St);
53460 SDValue StoredVal = St->getValue();
53461 EVT VT = StoredVal.getValueType();
53462 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53463
53464 // Convert a store of vXi1 into a store of iX and a bitcast.
53465 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
53466 VT.getVectorElementType() == MVT::i1) {
53467
53468 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
53469 StoredVal = DAG.getBitcast(NewVT, StoredVal);
53470
53471 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53472 St->getPointerInfo(), St->getBaseAlign(),
53473 St->getMemOperand()->getFlags());
53474 }
53475
53476 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
53477 // This will avoid a copy to k-register.
53478 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
53479 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
53480 StoredVal.getOperand(0).getValueType() == MVT::i8) {
53481 SDValue Val = StoredVal.getOperand(0);
53482 // We must store zeros to the unused bits.
53483 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
53484 return DAG.getStore(St->getChain(), dl, Val, St->getBasePtr(),
53485 St->getPointerInfo(), St->getBaseAlign(),
53486 St->getMemOperand()->getFlags());
53487 }
53488
53489 // Widen v2i1/v4i1 stores to v8i1.
53490 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
53491 Subtarget.hasAVX512()) {
53492 unsigned NumConcats = 8 / VT.getVectorNumElements();
53493 // We must store zeros to the unused bits.
53494 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
53495 Ops[0] = StoredVal;
53496 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
53497 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53498 St->getPointerInfo(), St->getBaseAlign(),
53499 St->getMemOperand()->getFlags());
53500 }
53501
53502 // Turn vXi1 stores of constants into a scalar store.
53503 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
53504 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
53505 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
53506 // If it's a v64i1 store without 64-bit support, we need two stores.
53507 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
53508 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
53509 StoredVal->ops().slice(0, 32));
53510 Lo = combinevXi1ConstantToInteger(Lo, DAG);
53511 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
53512 StoredVal->ops().slice(32, 32));
53513 Hi = combinevXi1ConstantToInteger(Hi, DAG);
53514
53515 SDValue Ptr0 = St->getBasePtr();
53516 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
53517
53518 SDValue Ch0 =
53519 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
53520 St->getBaseAlign(), St->getMemOperand()->getFlags());
53521 SDValue Ch1 = DAG.getStore(
53522 St->getChain(), dl, Hi, Ptr1, St->getPointerInfo().getWithOffset(4),
53523 St->getBaseAlign(), St->getMemOperand()->getFlags());
53524 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
53525 }
53526
53527 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
53528 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53529 St->getPointerInfo(), St->getBaseAlign(),
53530 St->getMemOperand()->getFlags());
53531 }
53532
53533 // Convert scalar fabs/fneg load-store to integer equivalents.
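// fabs clears the sign bit (AND with ~SignMask) and fneg flips it (XOR with
// SignMask), so the value never needs to pass through an FP register.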
53534 if ((VT == MVT::f16 || VT == MVT::bf16 || VT == MVT::f32 || VT == MVT::f64) &&
53535 (StoredVal.getOpcode() == ISD::FABS ||
53536 StoredVal.getOpcode() == ISD::FNEG) &&
53537 ISD::isNormalLoad(StoredVal.getOperand(0).getNode()) &&
53538 StoredVal.hasOneUse() && StoredVal.getOperand(0).hasOneUse()) {
53539 MVT IntVT = VT.getSimpleVT().changeTypeToInteger();
53540 if (TLI.isTypeLegal(IntVT)) {
53541 APInt SignMask = APInt::getSignMask(VT.getSizeInBits());
53542 unsigned SignOp = ISD::XOR;
53543 if (StoredVal.getOpcode() == ISD::FABS) {
53544 SignMask = ~SignMask;
53545 SignOp = ISD::AND;
53546 }
53547 SDValue LogicOp = DAG.getNode(
53548 SignOp, dl, IntVT, DAG.getBitcast(IntVT, StoredVal.getOperand(0)),
53549 DAG.getConstant(SignMask, dl, IntVT));
53550 return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
53551 St->getPointerInfo(), St->getBaseAlign(),
53552 St->getMemOperand()->getFlags());
53553 }
53554 }
53555
53556 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
53557 // Sandy Bridge, perform two 16-byte stores.
53558 unsigned Fast;
53559 if (VT.is256BitVector() && StVT == VT &&
53560 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
53561 *St->getMemOperand(), &Fast) &&
53562 !Fast) {
53563 unsigned NumElems = VT.getVectorNumElements();
53564 if (NumElems < 2)
53565 return SDValue();
53566
53567 return splitVectorStore(St, DAG);
53568 }
53569
53570 // Split under-aligned vector non-temporal stores.
53571 if (St->isNonTemporal() && StVT == VT &&
53572 St->getAlign().value() < VT.getStoreSize()) {
53573 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
53574 // vectors or the legalizer can scalarize it to use MOVNTI.
53575 if (VT.is256BitVector() || VT.is512BitVector()) {
53576 unsigned NumElems = VT.getVectorNumElements();
53577 if (NumElems < 2)
53578 return SDValue();
53579 return splitVectorStore(St, DAG);
53580 }
53581
53582 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
53583 // to use MOVNTI.
53584 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
53585 MVT NTVT = Subtarget.hasSSE4A()
53586 ? MVT::v2f64
53587 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
53588 return scalarizeVectorStore(St, NTVT, DAG);
53589 }
53590 }
53591
53592 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
53593 // supported, but avx512f is, by extending to v16i32 and truncating.
53594 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
53595 St->getValue().getOpcode() == ISD::TRUNCATE &&
53596 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
53597 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
53598 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
53599 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
53600 St->getValue().getOperand(0));
53601 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
53602 MVT::v16i8, St->getMemOperand());
53603 }
53604
53605 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
53606 if (!St->isTruncatingStore() &&
53607 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
53608 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
53609 StoredVal.hasOneUse() &&
53610 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
53611 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
53612 return EmitTruncSStore(IsSigned, St->getChain(),
53613 dl, StoredVal.getOperand(0), St->getBasePtr(),
53614 VT, St->getMemOperand(), DAG);
53615 }
53616
53617 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
53618 if (!St->isTruncatingStore()) {
53619 auto IsExtractedElement = [](SDValue V) {
53620 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
53621 V = V.getOperand(0);
53622 unsigned Opc = V.getOpcode();
53623 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
53624 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
53625 V.getOperand(0).hasOneUse())
53626 return V.getOperand(0);
53627 return SDValue();
53628 };
53629 if (SDValue Extract = IsExtractedElement(StoredVal)) {
53630 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
53631 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
53632 SDValue Src = Trunc.getOperand(0);
53633 MVT DstVT = Trunc.getSimpleValueType();
53634 MVT SrcVT = Src.getSimpleValueType();
53635 unsigned NumSrcElts = SrcVT.getVectorNumElements();
53636 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
53637 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
53638 if (NumTruncBits == VT.getSizeInBits() &&
53639 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
53640 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
53641 TruncVT, St->getMemOperand());
53642 }
53643 }
53644 }
53645 }
53646
53647 // Optimize trunc store (of multiple scalars) to shuffle and store.
53648 // First, pack all of the elements in one place. Next, store to memory
53649 // in fewer chunks.
53650 if (St->isTruncatingStore() && VT.isVector()) {
53651 if (TLI.isTruncStoreLegal(VT, StVT)) {
53652 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
53653 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
53654 dl, Val, St->getBasePtr(),
53655 St->getMemoryVT(), St->getMemOperand(), DAG);
53656 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
53657 DAG, dl))
53658 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
53659 dl, Val, St->getBasePtr(),
53660 St->getMemoryVT(), St->getMemOperand(), DAG);
53661 }
53662
53663 return SDValue();
53664 }
53665
53666 // Cast ptr32 and ptr64 pointers to the default address space before a store.
53667 unsigned AddrSpace = St->getAddressSpace();
53668 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53669 AddrSpace == X86AS::PTR32_UPTR) {
53670 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53671 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
53672 SDValue Cast =
53673 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
53674 return DAG.getTruncStore(
53675 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
53676 St->getBaseAlign(), St->getMemOperand()->getFlags(), St->getAAInfo());
53677 }
53678 }
53679
53680 // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
53681 // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
53682 if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
53683 Subtarget.hasCF() && St->isSimple()) {
53684 SDValue Cmov;
53685 if (StoredVal.getOpcode() == X86ISD::CMOV)
53686 Cmov = StoredVal;
53687 else if (StoredVal.getOpcode() == ISD::TRUNCATE &&
53688 StoredVal.getOperand(0).getOpcode() == X86ISD::CMOV)
53689 Cmov = StoredVal.getOperand(0);
53690 else
53691 return SDValue();
53692
53693 auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
53694 if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
53695 return SDValue();
53696
53697 bool InvertCC = false;
53698 SDValue V = SDValue(Ld, 0);
53699 if (V == Cmov.getOperand(1))
53700 InvertCC = true;
53701 else if (V != Cmov.getOperand(0))
53702 return SDValue();
53703
53704 SDVTList Tys = DAG.getVTList(MVT::Other);
53705 SDValue CC = Cmov.getOperand(2);
53706 SDValue Src = DAG.getAnyExtOrTrunc(Cmov.getOperand(!InvertCC), dl, VT);
53707 if (InvertCC)
53708 CC = DAG.getTargetConstant(
53709 X86::GetOppositeBranchCondition(
53710 (X86::CondCode)Cmov.getConstantOperandVal(2)),
53711 dl, MVT::i8);
53712 SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
53713 Cmov.getOperand(3)};
53714 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, dl, Tys, Ops, VT,
53715 St->getMemOperand());
53716 }
53717
53718 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
53719 // the FP state in cases where an emms may be missing.
53720 // A preferable solution to the general problem is to figure out the right
53721 // places to insert EMMS. This qualifies as a quick hack.
53722
53723 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
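// For example, on a 32-bit target with SSE2, a plain i64 copy (an i64 load feeding
// an i64 store) can then be emitted as a single 64-bit MOVQ/MOVSD load/store pair
// instead of two 32-bit GPR loads and stores.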
53724 if (VT.getSizeInBits() != 64)
53725 return SDValue();
53726
53727 const Function &F = DAG.getMachineFunction().getFunction();
53728 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
53729 bool F64IsLegal =
53730 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
53731
53732 if (!F64IsLegal || Subtarget.is64Bit())
53733 return SDValue();
53734
53735 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
53736 cast<LoadSDNode>(St->getValue())->isSimple() &&
53737 St->getChain().hasOneUse() && St->isSimple()) {
53738 auto *Ld = cast<LoadSDNode>(St->getValue());
53739
53740 if (!ISD::isNormalLoad(Ld))
53741 return SDValue();
53742
53743 // Avoid the transformation if there are multiple uses of the loaded value.
53744 if (!Ld->hasNUsesOfValue(1, 0))
53745 return SDValue();
53746
53747 SDLoc LdDL(Ld);
53748 SDLoc StDL(N);
53749
53750 // Remove any range metadata as we're converting to f64 load/store.
53751 Ld->getMemOperand()->clearRanges();
53752
53753 // Lower to a single movq load/store pair.
53754 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
53755 Ld->getBasePtr(), Ld->getMemOperand());
53756
53757 // Make sure new load is placed in same chain order.
53758 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
53759 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
53760 St->getMemOperand());
53761 }
53762
53763 // This is similar to the above case, but here we handle a scalar 64-bit
53764 // integer store that is extracted from a vector on a 32-bit target.
53765 // If we have SSE2, then we can treat it like a floating-point double
53766 // to get past legalization. The execution dependencies fixup pass will
53767 // choose the optimal machine instruction for the store if this really is
53768 // an integer or v2f32 rather than an f64.
53769 if (VT == MVT::i64 &&
53770 St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
53771 SDValue OldExtract = St->getOperand(1);
53772 SDValue ExtOp0 = OldExtract.getOperand(0);
53773 unsigned VecSize = ExtOp0.getValueSizeInBits();
53774 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
53775 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
53776 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
53777 BitCast, OldExtract.getOperand(1));
53778 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
53779 St->getPointerInfo(), St->getBaseAlign(),
53780 St->getMemOperand()->getFlags());
53781 }
53782
53783 return SDValue();
53784}
53785
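// Only the low part of the stored vector (as given by the memory VT) actually
// reaches memory, so shrink the demanded elements of the stored value accordingly.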
53786 static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
53787 TargetLowering::DAGCombinerInfo &DCI,
53788 const X86Subtarget &Subtarget) {
53789 auto *St = cast<MemIntrinsicSDNode>(N);
53790
53791 SDValue StoredVal = N->getOperand(1);
53792 MVT VT = StoredVal.getSimpleValueType();
53793 EVT MemVT = St->getMemoryVT();
53794
53795 // Figure out which elements we demand.
53796 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
53797 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
53798
53799 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53800 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
53801 if (N->getOpcode() != ISD::DELETED_NODE)
53802 DCI.AddToWorklist(N);
53803 return SDValue(N, 0);
53804 }
53805
53806 return SDValue();
53807}
53808
53809/// Return 'true' if this vector operation is "horizontal"
53810/// and return the operands for the horizontal operation in LHS and RHS. A
53811/// horizontal operation performs the binary operation on successive elements
53812/// of its first operand, then on successive elements of its second operand,
53813/// returning the resulting values in a vector. For example, if
53814/// A = < float a0, float a1, float a2, float a3 >
53815/// and
53816/// B = < float b0, float b1, float b2, float b3 >
53817/// then the result of doing a horizontal operation on A and B is
53818/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
53819/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
53820/// A horizontal-op B, for some already available A and B, and if so then LHS is
53821/// set to A, RHS to B, and the routine returns 'true'.
53822static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
53823 SelectionDAG &DAG, const X86Subtarget &Subtarget,
53824 bool IsCommutative,
53825 SmallVectorImpl<int> &PostShuffleMask,
53826 bool ForceHorizOp) {
53827 // If either operand is undef, bail out. The binop should be simplified.
53828 if (LHS.isUndef() || RHS.isUndef())
53829 return false;
53830
53831 // Look for the following pattern:
53832 // A = < float a0, float a1, float a2, float a3 >
53833 // B = < float b0, float b1, float b2, float b3 >
53834 // and
53835 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
53836 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
53837 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
53838 // which is A horizontal-op B.
53839
53840 MVT VT = LHS.getSimpleValueType();
53841 assert((VT.is128BitVector() || VT.is256BitVector()) &&
53842 "Unsupported vector type for horizontal add/sub");
53843 unsigned NumElts = VT.getVectorNumElements();
53844
53845 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
53846 SmallVectorImpl<int> &ShuffleMask) {
53847 bool UseSubVector = false;
53848 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53849 Op.getOperand(0).getValueType().is256BitVector() &&
53850 llvm::isNullConstant(Op.getOperand(1))) {
53851 Op = Op.getOperand(0);
53852 UseSubVector = true;
53853 }
53854 SmallVector<SDValue, 2> SrcOps;
53855 SmallVector<int, 16> SrcMask, ScaledMask;
53856 SDValue BC = peekThroughBitcasts(Op);
53857 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
53858 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
53859 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
53860 })) {
53861 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
53862 if (!UseSubVector && SrcOps.size() <= 2 &&
53863 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
53864 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
53865 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
53866 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
53867 }
53868 if (UseSubVector && SrcOps.size() == 1 &&
53869 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
53870 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
53871 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
53872 ShuffleMask.assign(Mask.begin(), Mask.end());
53873 }
53874 }
53875 };
53876
53877 // View LHS in the form
53878 // LHS = VECTOR_SHUFFLE A, B, LMask
53879 // If LHS is not a shuffle, then pretend it is the identity shuffle:
53880 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
53881 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
53882 SDValue A, B;
53883 SmallVector<int, 16> LMask;
53884 GetShuffle(LHS, A, B, LMask);
53885
53886 // Likewise, view RHS in the form
53887 // RHS = VECTOR_SHUFFLE C, D, RMask
53888 SDValue C, D;
53889 SmallVector<int, 16> RMask;
53890 GetShuffle(RHS, C, D, RMask);
53891
53892 // At least one of the operands should be a vector shuffle.
53893 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
53894 if (NumShuffles == 0)
53895 return false;
53896
53897 if (LMask.empty()) {
53898 A = LHS;
53899 for (unsigned i = 0; i != NumElts; ++i)
53900 LMask.push_back(i);
53901 }
53902
53903 if (RMask.empty()) {
53904 C = RHS;
53905 for (unsigned i = 0; i != NumElts; ++i)
53906 RMask.push_back(i);
53907 }
53908
53909 // If we have a unary mask, ensure the other op is set to null.
53910 if (isUndefOrInRange(LMask, 0, NumElts))
53911 B = SDValue();
53912 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
53913 A = SDValue();
53914
53915 if (isUndefOrInRange(RMask, 0, NumElts))
53916 D = SDValue();
53917 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
53918 C = SDValue();
53919
53920 // If A and B occur in reverse order in RHS, then canonicalize by commuting
53921 // RHS operands and shuffle mask.
53922 if (A != C) {
53923 std::swap(C, D);
53924 ShuffleVectorSDNode::commuteMask(RMask);
53925 }
53926 // Check that the shuffles are both shuffling the same vectors.
53927 if (!(A == C && B == D))
53928 return false;
53929
53930 PostShuffleMask.clear();
53931 PostShuffleMask.append(NumElts, SM_SentinelUndef);
53932
53933 // LHS and RHS are now:
53934 // LHS = shuffle A, B, LMask
53935 // RHS = shuffle A, B, RMask
53936 // Check that the masks correspond to performing a horizontal operation.
53937 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
53938 // so we just repeat the inner loop if this is a 256-bit op.
53939 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
53940 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
53941 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
53942 assert((NumEltsPer128BitChunk % 2 == 0) &&
53943 "Vector type should have an even number of elements in each lane");
53944 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
53945 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
53946 // Ignore undefined components.
53947 int LIdx = LMask[i + j], RIdx = RMask[i + j];
53948 if (LIdx < 0 || RIdx < 0 ||
53949 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
53950 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
53951 continue;
53952
53953 // Check that successive odd/even elements are being operated on. If not,
53954 // this is not a horizontal operation.
53955 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
53956 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
53957 return false;
53958
53959 // Compute the post-shuffle mask index based on where the element
53960 // is stored in the HOP result, and where it needs to be moved to.
53961 int Base = LIdx & ~1u;
53962 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
53963 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
53964
53965 // The low half of the 128-bit result must choose from A.
53966 // The high half of the 128-bit result must choose from B,
53967 // unless B is undef. In that case, we are always choosing from A.
53968 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
53969 Index += NumEltsPer64BitChunk;
53970 PostShuffleMask[i + j] = Index;
53971 }
53972 }
53973
53974 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
53975 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
53976
53977 bool IsIdentityPostShuffle =
53978 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
53979 if (IsIdentityPostShuffle)
53980 PostShuffleMask.clear();
53981
53982 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
53983 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
53984 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
53985 return false;
53986
53987 // If the source nodes are already used in HorizOps then always accept this.
53988 // Shuffle folding should merge these back together.
53989 auto FoundHorizUser = [&](SDNode *User) {
53990 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
53991 };
53992 ForceHorizOp =
53993 ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) &&
53994 llvm::any_of(NewRHS->users(), FoundHorizUser));
53995
53996 // Assume a SingleSource HOP if we only shuffle one input and don't need to
53997 // shuffle the result.
53998 if (!ForceHorizOp &&
53999 !shouldUseHorizontalOp(NewLHS == NewRHS &&
54000 (NumShuffles < 2 || !IsIdentityPostShuffle),
54001 DAG, Subtarget))
54002 return false;
54003
54004 LHS = DAG.getBitcast(VT, NewLHS);
54005 RHS = DAG.getBitcast(VT, NewRHS);
54006 return true;
54007}
54008
54009// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
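// e.g. (v4f32 (fadd (shuffle A, B, <0,2,4,6>), (shuffle A, B, <1,3,5,7>)))
// --> (X86ISD::FHADD A, B), possibly followed by a post-shuffle.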
54010 static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
54011 const X86Subtarget &Subtarget) {
54012 EVT VT = N->getValueType(0);
54013 unsigned Opcode = N->getOpcode();
54014 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
54015 SmallVector<int, 8> PostShuffleMask;
54016
54017 auto MergableHorizOp = [N](unsigned HorizOpcode) {
54018 return N->hasOneUse() &&
54019 N->user_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
54020 (N->user_begin()->getOperand(0).getOpcode() == HorizOpcode ||
54021 N->user_begin()->getOperand(1).getOpcode() == HorizOpcode);
54022 };
54023
54024 switch (Opcode) {
54025 case ISD::FADD:
54026 case ISD::FSUB:
54027 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
54028 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
54029 SDValue LHS = N->getOperand(0);
54030 SDValue RHS = N->getOperand(1);
54031 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
54032 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54033 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54034 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
54035 if (!PostShuffleMask.empty())
54036 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54037 DAG.getUNDEF(VT), PostShuffleMask);
54038 return HorizBinOp;
54039 }
54040 }
54041 break;
54042 case ISD::ADD:
54043 case ISD::SUB:
54044 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
54045 VT == MVT::v16i16 || VT == MVT::v8i32)) {
54046 SDValue LHS = N->getOperand(0);
54047 SDValue RHS = N->getOperand(1);
54048 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
54049 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54050 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54051 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
54052 ArrayRef<SDValue> Ops) {
54053 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
54054 };
54055 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
54056 {LHS, RHS}, HOpBuilder);
54057 if (!PostShuffleMask.empty())
54058 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54059 DAG.getUNDEF(VT), PostShuffleMask);
54060 return HorizBinOp;
54061 }
54062 }
54063 break;
54064 }
54065
54066 return SDValue();
54067}
54068
54069// Try to combine the following nodes
54070// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
54071// <i32 -2147483648[float -0.000000e+00]> 0
54072// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
54073// <(load 4 from constant-pool)> t0, t29
54074// [t30: v16i32 = bitcast t27]
54075// t6: v16i32 = xor t7, t27[t30]
54076// t11: v16f32 = bitcast t6
54077// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
54078// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
54079// t22: v16f32 = bitcast t7
54080// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
54081// t24: v32f16 = bitcast t23
54082 static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
54083 const X86Subtarget &Subtarget) {
54084 EVT VT = N->getValueType(0);
54085 SDValue LHS = N->getOperand(0);
54086 SDValue RHS = N->getOperand(1);
54087 int CombineOpcode =
54088 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
54089 auto combineConjugation = [&](SDValue &r) {
54090 if (LHS->getOpcode() == ISD::BITCAST) {
54091 SDValue XOR = LHS.getOperand(0);
54092 if (XOR->getOpcode() == ISD::XOR) {
54093 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
54094 if (XORRHS.isConstant()) {
54095 APInt ConjugationInt32 = APInt(32, 0x80000000);
54096 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL);
54097 if ((XORRHS.getBitWidth() == 32 &&
54098 XORRHS.getConstant() == ConjugationInt32) ||
54099 (XORRHS.getBitWidth() == 64 &&
54100 XORRHS.getConstant() == ConjugationInt64)) {
54101 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
54102 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
54103 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
54104 r = DAG.getBitcast(VT, FCMulC);
54105 return true;
54106 }
54107 }
54108 }
54109 }
54110 return false;
54111 };
54112 SDValue Res;
54113 if (combineConjugation(Res))
54114 return Res;
54115 std::swap(LHS, RHS);
54116 if (combineConjugation(Res))
54117 return Res;
54118 return Res;
54119}
54120
54121// Try to combine the following nodes:
54122// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
54123 static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
54124 const X86Subtarget &Subtarget) {
54125 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
54126 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
54127 Flags.hasAllowContract();
54128 };
54129
54130 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
54131 return DAG.getTarget().Options.NoSignedZerosFPMath ||
54132 Flags.hasNoSignedZeros();
54133 };
54134 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
54135 APInt AI = APInt(32, 0x80008000);
54136 KnownBits Bits = DAG.computeKnownBits(Op);
54137 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
54138 Bits.getConstant() == AI;
54139 };
54140
54141 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
54142 !AllowContract(N->getFlags()))
54143 return SDValue();
54144
54145 EVT VT = N->getValueType(0);
54146 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
54147 return SDValue();
54148
54149 SDValue LHS = N->getOperand(0);
54150 SDValue RHS = N->getOperand(1);
54151 bool IsConj;
54152 SDValue FAddOp1, MulOp0, MulOp1;
54153 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
54154 &IsVectorAllNegativeZero,
54155 &HasNoSignedZero](SDValue N) -> bool {
54156 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
54157 return false;
54158 SDValue Op0 = N.getOperand(0);
54159 unsigned Opcode = Op0.getOpcode();
54160 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
54161 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
54162 MulOp0 = Op0.getOperand(0);
54163 MulOp1 = Op0.getOperand(1);
54164 IsConj = Opcode == X86ISD::VFCMULC;
54165 return true;
54166 }
54167 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
54168 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
54169 HasNoSignedZero(Op0->getFlags())) ||
54170 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
54171 MulOp0 = Op0.getOperand(0);
54172 MulOp1 = Op0.getOperand(1);
54173 IsConj = Opcode == X86ISD::VFCMADDC;
54174 return true;
54175 }
54176 }
54177 return false;
54178 };
54179
54180 if (GetCFmulFrom(LHS))
54181 FAddOp1 = RHS;
54182 else if (GetCFmulFrom(RHS))
54183 FAddOp1 = LHS;
54184 else
54185 return SDValue();
54186
54187 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
54188 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
54189 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
54190 // FIXME: How do we handle when fast math flags of FADD are different from
54191 // CFMUL's?
54192 SDValue CFmul =
54193 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
54194 return DAG.getBitcast(VT, CFmul);
54195}
54196
54197/// Do target-specific dag combines on floating-point adds/subs.
54198 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
54199 const X86Subtarget &Subtarget) {
54200 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
54201 return HOp;
54202
54203 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
54204 return COp;
54205
54206 return SDValue();
54207}
54208
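// Widen narrow FP vector sources (v2f16/v4f16 -> v8f16, v2f32 -> v4f32) so the
// conversion below can be emitted as a single X86ISD::CVTP2SI node.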
54209 static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
54210 const X86Subtarget &Subtarget) {
54211 EVT VT = N->getValueType(0);
54212 SDValue Src = N->getOperand(0);
54213 EVT SrcVT = Src.getValueType();
54214 SDLoc DL(N);
54215
54216 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54217
54218 // Let legalize expand this if it isn't a legal type yet.
54219 if (!TLI.isTypeLegal(VT))
54220 return SDValue();
54221
54222 if ((SrcVT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) ||
54223 (SrcVT.getScalarType() == MVT::f32 && !Subtarget.hasDQI()))
54224 return SDValue();
54225
54226 if (SrcVT == MVT::v2f16) {
54227 SrcVT = MVT::v4f16;
54228 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54229 DAG.getUNDEF(MVT::v2f16));
54230 }
54231
54232 if (SrcVT == MVT::v4f16) {
54233 SrcVT = MVT::v8f16;
54234 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54235 DAG.getUNDEF(MVT::v4f16));
54236 } else if (SrcVT == MVT::v2f32) {
54237 SrcVT = MVT::v4f32;
54238 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54239 DAG.getUNDEF(MVT::v2f32));
54240 } else {
54241 return SDValue();
54242 }
54243
54244 return DAG.getNode(X86ISD::CVTP2SI, DL, VT, Src);
54245}
54246
54247// Attempt to fold some (truncate (srl (add/or/xor X, C1), C2)) patterns to
54248// (add/or/xor (truncate (srl X, C2)), C1'). C1' will be smaller than C1 so we
54249// are able to avoid generating code with MOVABS and large constants in certain
54250// cases.
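// e.g. (i32 (trunc (srl (or X:i64, 0xAB0000000000), 40)))
// --> (or (i32 (trunc (srl X, 40))), 0xAB)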
54251 static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
54252 const SDLoc &DL) {
54253 assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode");
54254 std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N);
54255 if (!ValidSrlConst)
54256 return SDValue();
54257 unsigned SrlConstVal = *ValidSrlConst;
54258
54259 SDValue Op = N.getOperand(0);
54260 unsigned Opcode = Op.getOpcode();
54261 assert(VT == MVT::i32 && Op.getValueType() == MVT::i64 &&
54262 "Illegal truncation types");
54263
54264 if ((Opcode != ISD::ADD && Opcode != ISD::OR && Opcode != ISD::XOR) ||
54265 !isa<ConstantSDNode>(Op.getOperand(1)))
54266 return SDValue();
54267 const APInt &OpConst = Op.getConstantOperandAPInt(1);
54268
54269 if (SrlConstVal <= 32 ||
54270 (Opcode == ISD::ADD && OpConst.countr_zero() < SrlConstVal))
54271 return SDValue();
54272
54273 SDValue OpLhsSrl =
54274 DAG.getNode(ISD::SRL, DL, MVT::i64, Op.getOperand(0), N.getOperand(1));
54275 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, OpLhsSrl);
54276
54277 APInt NewOpConstVal = OpConst.lshr(SrlConstVal).trunc(VT.getSizeInBits());
54278 SDValue NewOpConst = DAG.getConstant(NewOpConstVal, DL, VT);
54279 SDValue NewOpNode = DAG.getNode(Opcode, DL, VT, Trunc, NewOpConst);
54280
54281 if (Opcode == ISD::ADD) {
54282 EVT CleanUpVT = EVT::getIntegerVT(*DAG.getContext(), 64 - SrlConstVal);
54283 return DAG.getZeroExtendInReg(NewOpNode, DL, CleanUpVT);
54284 }
54285 return NewOpNode;
54286}
54287
54288/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
54289/// the codegen.
54290/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
54291/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
54292/// anything that is guaranteed to be transformed by DAGCombiner.
54293 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
54294 const X86Subtarget &Subtarget,
54295 const SDLoc &DL) {
54296 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
54297 SDValue Src = N->getOperand(0);
54298 unsigned SrcOpcode = Src.getOpcode();
54299 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54300
54301 EVT VT = N->getValueType(0);
54302 EVT SrcVT = Src.getValueType();
54303
54304 auto IsFreeTruncation = [VT](SDValue Op) {
54305 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
54306
54307 // See if this has been extended from a smaller/equal size to
54308 // the truncation size, allowing a truncation to combine with the extend.
54309 unsigned Opcode = Op.getOpcode();
54310 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
54311 Opcode == ISD::ZERO_EXTEND) &&
54312 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
54313 return true;
54314
54315 // See if this is a single use constant which can be constant folded.
54316 // NOTE: We don't peek through bitcasts here because there is currently
54317 // no support for constant folding truncate+bitcast+vector_of_constants. So
54318 // we'll just end up with a truncate on both operands which will
54319 // get turned back into (truncate (binop)) causing an infinite loop.
54320 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54321 };
54322
54323 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
54324 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
54325 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
54326 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
54327 };
54328
54329 // Don't combine if the operation has other uses.
54330 if (!Src.hasOneUse())
54331 return SDValue();
54332
54333 if (VT == MVT::i32 && SrcVT == MVT::i64 && SrcOpcode == ISD::SRL)
54334 return combinei64TruncSrlConstant(Src, VT, DAG, DL);
54335
54336 if (!VT.isVector())
54337 return SDValue();
54338
54339 // In most cases it's only worth pre-truncating if we're only facing the cost
54340 // of one truncation.
54341 // i.e. if one of the inputs will constant fold or the input is repeated.
54342 switch (SrcOpcode) {
54343 case ISD::MUL:
54344 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
54345 // better to truncate if we have the chance.
54346 if (SrcVT.getScalarType() == MVT::i64 &&
54347 TLI.isOperationLegal(SrcOpcode, VT) &&
54348 !TLI.isOperationLegal(SrcOpcode, SrcVT))
54349 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
54350 [[fallthrough]];
54351 case ISD::AND:
54352 case ISD::XOR:
54353 case ISD::OR:
54354 case ISD::ADD:
54355 case ISD::SUB: {
54356 SDValue Op0 = Src.getOperand(0);
54357 SDValue Op1 = Src.getOperand(1);
54358 if (TLI.isOperationLegal(SrcOpcode, VT) &&
54359 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
54360 return TruncateArithmetic(Op0, Op1);
54361 break;
54362 }
54363 }
54364
54365 return SDValue();
54366}
54367
54368// Try to form a MULHU or MULHS node by looking for
54369// (trunc (srl (mul ext, ext), >= 16))
54370// TODO: This is X86 specific because we want to be able to handle wide types
54371// before type legalization. But we can only do it if the vector will be
54372// legalized via widening/splitting. Type legalization can't handle promotion
54373// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
54374// combiner.
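// e.g. (v8i16 (trunc (srl (mul (zext v8i16 X to v8i32),
// (zext v8i16 Y to v8i32)), 16)))
// --> (mulhu X, Y)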
54375static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
54376 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
54377 using namespace llvm::SDPatternMatch;
54378
54379 if (!Subtarget.hasSSE2())
54380 return SDValue();
54381
54382 // Only handle vXi16 types that are at least 128-bits unless they will be
54383 // widened.
54384 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
54385 return SDValue();
54386
54387 // Input type should be at least vXi32.
54388 EVT InVT = Src.getValueType();
54389 if (InVT.getVectorElementType().getSizeInBits() < 32)
54390 return SDValue();
54391
54392 // First instruction should be a right shift by 16 of a multiply.
54393 SDValue LHS, RHS;
54394 APInt ShiftAmt;
54395 if (!sd_match(Src,
54396 m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_ConstInt(ShiftAmt))))
54397 return SDValue();
54398
54399 if (ShiftAmt.ult(16) || ShiftAmt.uge(InVT.getScalarSizeInBits()))
54400 return SDValue();
54401
54402 uint64_t AdditionalShift = ShiftAmt.getZExtValue() - 16;
54403
54404 // Count leading sign/zero bits on both inputs - if there are enough then
54405 // truncation back to vXi16 will be cheap - either as a pack/shuffle
54406 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
54407 // truncations may actually be free by peeking through to the ext source.
54408 auto IsSext = [&DAG](SDValue V) {
54409 return DAG.ComputeMaxSignificantBits(V) <= 16;
54410 };
54411 auto IsZext = [&DAG](SDValue V) {
54412 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
54413 };
54414
54415 bool IsSigned = IsSext(LHS) && IsSext(RHS);
54416 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
54417 if (!IsSigned && !IsUnsigned)
54418 return SDValue();
54419
54420 // Check if both inputs are extensions, which will be removed by truncation.
54421 auto isOpTruncateFree = [](SDValue Op) {
54422 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
54423 Op.getOpcode() == ISD::ZERO_EXTEND)
54424 return Op.getOperand(0).getScalarValueSizeInBits() <= 16;
54425 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54426 };
54427 bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS);
54428
54429 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
54430 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
54431 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
54432 // will have to split anyway.
54433 unsigned InSizeInBits = InVT.getSizeInBits();
54434 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
54435 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
54436 (InSizeInBits % 16) == 0) {
54437 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54438 InVT.getSizeInBits() / 16);
54439 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
54440 DAG.getBitcast(BCVT, RHS));
54441 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
54442 return DAG.getNode(ISD::SRL, DL, VT, Res,
54443 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54444 }
54445
54446 // Truncate back to source type.
54447 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
54448 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
54449
54450 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
54451 SDValue Res = DAG.getNode(Opc, DL, VT, LHS, RHS);
54452 return DAG.getNode(ISD::SRL, DL, VT, Res,
54453 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54454}
54455
54456// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
54457// from one vector with signed bytes from another vector, adds together
54458// adjacent pairs of 16-bit products, and saturates the result before
54459// truncating to 16-bits.
54460//
54461// Which looks something like this:
54462// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
54463// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
54464 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
54465 const X86Subtarget &Subtarget,
54466 const SDLoc &DL) {
54467 if (!VT.isVector() || !Subtarget.hasSSSE3())
54468 return SDValue();
54469
54470 unsigned NumElems = VT.getVectorNumElements();
54471 EVT ScalarVT = VT.getVectorElementType();
54472 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
54473 return SDValue();
54474
54475 SDValue SSatVal = detectSSatPattern(In, VT);
54476 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
54477 return SDValue();
54478
54479 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
54480 // of multiplies from even/odd elements.
54481 SDValue N0 = SSatVal.getOperand(0);
54482 SDValue N1 = SSatVal.getOperand(1);
54483
54484 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54485 return SDValue();
54486
54487 SDValue N00 = N0.getOperand(0);
54488 SDValue N01 = N0.getOperand(1);
54489 SDValue N10 = N1.getOperand(0);
54490 SDValue N11 = N1.getOperand(1);
54491
54492 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
54493 // Canonicalize zero_extend to LHS.
54494 if (N01.getOpcode() == ISD::ZERO_EXTEND)
54495 std::swap(N00, N01);
54496 if (N11.getOpcode() == ISD::ZERO_EXTEND)
54497 std::swap(N10, N11);
54498
54499 // Ensure we have a zero_extend and a sign_extend.
54500 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
54501 N01.getOpcode() != ISD::SIGN_EXTEND ||
54502 N10.getOpcode() != ISD::ZERO_EXTEND ||
54503 N11.getOpcode() != ISD::SIGN_EXTEND)
54504 return SDValue();
54505
54506 // Peek through the extends.
54507 N00 = N00.getOperand(0);
54508 N01 = N01.getOperand(0);
54509 N10 = N10.getOperand(0);
54510 N11 = N11.getOperand(0);
54511
54512 // Ensure the extend is from vXi8.
54513 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
54514 N01.getValueType().getVectorElementType() != MVT::i8 ||
54515 N10.getValueType().getVectorElementType() != MVT::i8 ||
54516 N11.getValueType().getVectorElementType() != MVT::i8)
54517 return SDValue();
54518
54519 // All inputs should be build_vectors.
54520 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54521 N01.getOpcode() != ISD::BUILD_VECTOR ||
54522 N10.getOpcode() != ISD::BUILD_VECTOR ||
54523 N11.getOpcode() != ISD::BUILD_VECTOR)
54524 return SDValue();
54525
54526 // N00/N10 are zero extended. N01/N11 are sign extended.
54527
54528 // For each element, we need to ensure we have an odd element from one vector
54529 // multiplied by the odd element of another vector and the even element from
54530 // one of the same vectors being multiplied by the even element from the
54531 // other vector. So we need to make sure for each element i, this operator
54532 // is being performed:
54533 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54534 SDValue ZExtIn, SExtIn;
54535 for (unsigned i = 0; i != NumElems; ++i) {
54536 SDValue N00Elt = N00.getOperand(i);
54537 SDValue N01Elt = N01.getOperand(i);
54538 SDValue N10Elt = N10.getOperand(i);
54539 SDValue N11Elt = N11.getOperand(i);
54540 // TODO: Be more tolerant to undefs.
54541 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54542 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54543 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54544 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54545 return SDValue();
54546 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
54547 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
54548 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
54549 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
54550 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
54551 return SDValue();
54552 unsigned IdxN00 = ConstN00Elt->getZExtValue();
54553 unsigned IdxN01 = ConstN01Elt->getZExtValue();
54554 unsigned IdxN10 = ConstN10Elt->getZExtValue();
54555 unsigned IdxN11 = ConstN11Elt->getZExtValue();
54556 // Add is commutative so indices can be reordered.
54557 if (IdxN00 > IdxN10) {
54558 std::swap(IdxN00, IdxN10);
54559 std::swap(IdxN01, IdxN11);
54560 }
54561 // N0 indices must be the even elements. N1 indices must be the next odd elements.
54562 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
54563 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
54564 return SDValue();
54565 SDValue N00In = N00Elt.getOperand(0);
54566 SDValue N01In = N01Elt.getOperand(0);
54567 SDValue N10In = N10Elt.getOperand(0);
54568 SDValue N11In = N11Elt.getOperand(0);
54569 // The first time we find an input, capture it.
54570 if (!ZExtIn) {
54571 ZExtIn = N00In;
54572 SExtIn = N01In;
54573 }
54574 if (ZExtIn != N00In || SExtIn != N01In ||
54575 ZExtIn != N10In || SExtIn != N11In)
54576 return SDValue();
54577 }
54578
54579 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
54580 EVT ExtVT = Ext.getValueType();
54581 if (ExtVT.getVectorNumElements() != NumElems * 2) {
54582 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
54583 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
54584 DAG.getVectorIdxConstant(0, DL));
54585 }
54586 };
54587 ExtractVec(ZExtIn);
54588 ExtractVec(SExtIn);
54589
54590 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54591 ArrayRef<SDValue> Ops) {
54592 // Shrink by adding truncate nodes and let DAGCombine fold with the
54593 // sources.
54594 EVT InVT = Ops[0].getValueType();
54595 assert(InVT.getScalarType() == MVT::i8 &&
54596 "Unexpected scalar element type");
54597 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54598 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54599 InVT.getVectorNumElements() / 2);
54600 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
54601 };
54602 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
54603 PMADDBuilder);
54604}
54605
54606 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
54607 const X86Subtarget &Subtarget) {
54608 EVT VT = N->getValueType(0);
54609 SDValue Src = N->getOperand(0);
54610 SDLoc DL(N);
54611
54612 // Attempt to pre-truncate inputs to arithmetic ops instead.
54613 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
54614 return V;
54615
54616 // Try to detect PMADD
54617 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
54618 return PMAdd;
54619
54620 // Try to combine truncation with signed/unsigned saturation.
54621 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
54622 return Val;
54623
54624 // Try to combine PMULHUW/PMULHW for vXi16.
54625 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
54626 return V;
54627
54628 // The bitcast source is a direct mmx result.
54630 // Detect bitcasts between x86mmx and i32.
54630 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
54631 SDValue BCSrc = Src.getOperand(0);
54632 if (BCSrc.getValueType() == MVT::x86mmx)
54633 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
54634 }
54635
54636 return SDValue();
54637}
54638
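// Fold saturation patterns feeding a VTRUNC into VTRUNCS/VTRUNCUS and simplify
// the demanded bits of the result.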
54639 static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
54640 TargetLowering::DAGCombinerInfo &DCI) {
54641 EVT VT = N->getValueType(0);
54642 SDValue In = N->getOperand(0);
54643 SDLoc DL(N);
54644
54645 if (SDValue SSatVal = detectSSatPattern(In, VT))
54646 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
54647 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
54648 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
54649
54650 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54651 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
54652 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54653 return SDValue(N, 0);
54654
54655 return SDValue();
54656}
54657
54658/// Returns the negated value if the node \p N flips sign of FP value.
54659///
54660/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
54661/// or FSUB(0, x)
54662/// AVX512F does not have FXOR, so FNEG is lowered as
54663/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
54664/// In this case we go though all bitcasts.
54665/// This also recognizes splat of a negated value and returns the splat of that
54666/// value.
54667static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
54668 if (N->getOpcode() == ISD::FNEG)
54669 return N->getOperand(0);
54670
54671 // Don't recurse exponentially.
54672 if (Depth >= SelectionDAG::MaxRecursionDepth)
54673 return SDValue();
54674
54675 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
54676
54678 EVT VT = Op->getValueType(0);
54679
54680 // Make sure the element size doesn't change.
54681 if (VT.getScalarSizeInBits() != ScalarSize)
54682 return SDValue();
54683
54684 unsigned Opc = Op.getOpcode();
54685 switch (Opc) {
54686 case ISD::VECTOR_SHUFFLE: {
54687 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
54688 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
54689 if (!Op.getOperand(1).isUndef())
54690 return SDValue();
54691 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
54692 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
54693 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
54694 cast<ShuffleVectorSDNode>(Op)->getMask());
54695 break;
54696 }
54697 case ISD::INSERT_VECTOR_ELT: {
54698 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
54699 // -V, INDEX).
54700 SDValue InsVector = Op.getOperand(0);
54701 SDValue InsVal = Op.getOperand(1);
54702 if (!InsVector.isUndef())
54703 return SDValue();
54704 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
54705 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
54706 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
54707 NegInsVal, Op.getOperand(2));
54708 break;
54709 }
54710 case ISD::FSUB:
54711 case ISD::XOR:
54712 case X86ISD::FXOR: {
54713 SDValue Op1 = Op.getOperand(1);
54714 SDValue Op0 = Op.getOperand(0);
54715
54716 // For XOR and FXOR, we want to check if constant
54717 // bits of Op1 are sign bit masks. For FSUB, we
54718 // have to check if constant bits of Op0 are sign
54719 // bit masks and hence we swap the operands.
54720 if (Opc == ISD::FSUB)
54721 std::swap(Op0, Op1);
54722
54723 APInt UndefElts;
54724 SmallVector<APInt, 16> EltBits;
54725 // Extract constant bits and see if they are all
54726 // sign bit masks. Ignore the undef elements.
54727 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
54728 /* AllowWholeUndefs */ true,
54729 /* AllowPartialUndefs */ false)) {
54730 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
54731 if (!UndefElts[I] && !EltBits[I].isSignMask())
54732 return SDValue();
54733
54734 // Only allow bitcast from correctly-sized constant.
54735 Op0 = peekThroughBitcasts(Op0);
54736 if (Op0.getScalarValueSizeInBits() == ScalarSize)
54737 return Op0;
54738 }
54739 break;
54740 } // case
54741 } // switch
54742
54743 return SDValue();
54744}
54745
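// Return the FMA-family opcode that results from negating the multiply result
// (NegMul), the accumulator (NegAcc) and/or the final result (NegRes).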
54746static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
54747 bool NegRes) {
54748 if (NegMul) {
54749 switch (Opcode) {
54750 // clang-format off
54751 default: llvm_unreachable("Unexpected opcode");
54752 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
54753 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
54754 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
54755 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
54756 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
54757 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
54758 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
54759 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
54760 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
54761 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
54762 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
54763 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
54764 // clang-format on
54765 }
54766 }
54767
54768 if (NegAcc) {
54769 switch (Opcode) {
54770 // clang-format off
54771 default: llvm_unreachable("Unexpected opcode");
54772 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
54773 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
54774 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54775 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
54776 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
54777 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54778 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
54779 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
54780 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54781 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
54782 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
54783 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54784 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
54785 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
54786 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
54787 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
54788 // clang-format on
54789 }
54790 }
54791
54792 if (NegRes) {
54793 switch (Opcode) {
54794 // For accuracy reasons, we never combine fneg and fma under strict FP.
54795 // clang-format off
54796 default: llvm_unreachable("Unexpected opcode");
54797 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
54798 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54799 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
54800 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54801 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
54802 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54803 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
54804 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54805 // clang-format on
54806 }
54807 }
54808
54809 return Opcode;
54810}
54811
54812/// Do target-specific dag combines on floating point negations.
54813 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
54814 TargetLowering::DAGCombinerInfo &DCI,
54815 const X86Subtarget &Subtarget) {
54816 EVT OrigVT = N->getValueType(0);
54817 SDValue Arg = isFNEG(DAG, N);
54818 if (!Arg)
54819 return SDValue();
54820
54821 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54822 EVT VT = Arg.getValueType();
54823 EVT SVT = VT.getScalarType();
54824 SDLoc DL(N);
54825
54826 // Let legalize expand this if it isn't a legal type yet.
54827 if (!TLI.isTypeLegal(VT))
54828 return SDValue();
54829
54830 // If we're negating a FMUL node on a target with FMA, then we can avoid the
54831 // use of a constant by performing (-0 - A*B) instead.
54832 // FIXME: Check rounding control flags as well once it becomes available.
54833 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
54834 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
54835 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
54836 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
54837 Arg.getOperand(1), Zero);
54838 return DAG.getBitcast(OrigVT, NewNode);
54839 }
54840
54841 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54842 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54843 if (SDValue NegArg =
54844 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
54845 return DAG.getBitcast(OrigVT, NegArg);
54846
54847 return SDValue();
54848}
54849
54850 SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
54851 bool LegalOperations,
54852 bool ForCodeSize,
54853 NegatibleCost &Cost,
54854 unsigned Depth) const {
54855 // fneg patterns are removable even if they have multiple uses.
54856 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
54857 Cost = NegatibleCost::Cheaper;
54858 return DAG.getBitcast(Op.getValueType(), Arg);
54859 }
54860
54861 EVT VT = Op.getValueType();
54862 EVT SVT = VT.getScalarType();
54863 unsigned Opc = Op.getOpcode();
54864 SDNodeFlags Flags = Op.getNode()->getFlags();
54865 switch (Opc) {
54866 case ISD::FMA:
54867 case X86ISD::FMSUB:
54868 case X86ISD::FNMADD:
54869 case X86ISD::FNMSUB:
54870 case X86ISD::FMADD_RND:
54871 case X86ISD::FMSUB_RND:
54872 case X86ISD::FNMADD_RND:
54873 case X86ISD::FNMSUB_RND: {
54874 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
54875 !(SVT == MVT::f32 || SVT == MVT::f64) ||
54876 !isOperationLegal(ISD::FMA, VT))
54877 break;
54878
54879 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
54880 // if it may have signed zeros.
54881 if (!Flags.hasNoSignedZeros())
54882 break;
54883
54884 // Because getCheaperNegatedExpression can delete nodes we need a handle to
54885 // keep temporary nodes alive.
54886 std::list<HandleSDNode> Handles;
54887
54888 // This is always negatible for free but we might be able to remove some
54889 // extra operand negations as well.
54890 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
54891 for (int i = 0; i != 3; ++i) {
54892 NewOps[i] = getCheaperNegatedExpression(
54893 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
54894 if (!!NewOps[i])
54895 Handles.emplace_back(NewOps[i]);
54896 }
54897
54898 bool NegA = !!NewOps[0];
54899 bool NegB = !!NewOps[1];
54900 bool NegC = !!NewOps[2];
54901 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
54902
54903 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
54904 : NegatibleCost::Neutral;
54905
54906 // Fill in the non-negated ops with the original values.
54907 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
54908 if (!NewOps[i])
54909 NewOps[i] = Op.getOperand(i);
54910 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
54911 }
54912 case X86ISD::FRCP:
54913 if (SDValue NegOp0 =
54914 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
54915 ForCodeSize, Cost, Depth + 1))
54916 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
54917 break;
54918 }
54919
54920 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
54921 ForCodeSize, Cost, Depth);
54922}
54923
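// Replace an X86 FP logic node (FAND/FANDN/FOR/FXOR) on vector types with the
// equivalent integer logic op on bitcasted operands,
// e.g. (v4f32 FXOR X, Y) -> (bitcast (xor (bitcast X), (bitcast Y))).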
54924 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
54925 const X86Subtarget &Subtarget) {
54926 MVT VT = N->getSimpleValueType(0);
54927 // If we have integer vector types available, use the integer opcodes.
54928 if (!VT.isVector() || !Subtarget.hasSSE2())
54929 return SDValue();
54930
54931 SDLoc dl(N);
54932 MVT IntVT = VT.changeVectorElementTypeToInteger();
54933 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
54934 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
54935 unsigned IntOpcode;
54936 switch (N->getOpcode()) {
54937 // clang-format off
54938 default: llvm_unreachable("Unexpected FP logic op");
54939 case X86ISD::FOR: IntOpcode = ISD::OR; break;
54940 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
54941 case X86ISD::FAND: IntOpcode = ISD::AND; break;
54942 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
54943 // clang-format on
54944 }
54945 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
54946 return DAG.getBitcast(VT, IntOp);
54947}
54948
54949/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
54950 static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) {
54951 if (N->getOpcode() != ISD::XOR)
54952 return SDValue();
54953
54954 SDValue LHS = N->getOperand(0);
54955 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
54956 return SDValue();
54957
54958 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
54959 X86::CondCode(LHS->getConstantOperandVal(0)));
54960 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
54961}
54962
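// Fold (xor (ctlz_zero_undef X), BitWidth - 1), and the matching SUB form, into
// X86ISD::BSR, which already yields BitWidth - 1 - ctlz(X).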
54963 static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
54964 const X86Subtarget &Subtarget) {
54965 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
54966 "Invalid opcode for combining with CTLZ");
54967 if (Subtarget.hasFastLZCNT())
54968 return SDValue();
54969
54970 EVT VT = N->getValueType(0);
54971 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
54972 (VT != MVT::i64 || !Subtarget.is64Bit()))
54973 return SDValue();
54974
54975 SDValue N0 = N->getOperand(0);
54976 SDValue N1 = N->getOperand(1);
54977
54978 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
54979 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
54980 return SDValue();
54981
54982 SDValue OpCTLZ;
54983 SDValue OpSizeTM1;
54984
54985 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
54986 OpCTLZ = N1;
54987 OpSizeTM1 = N0;
54988 } else if (N->getOpcode() == ISD::SUB) {
54989 return SDValue();
54990 } else {
54991 OpCTLZ = N0;
54992 OpSizeTM1 = N1;
54993 }
54994
54995 if (!OpCTLZ.hasOneUse())
54996 return SDValue();
54997 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
54998 if (!C)
54999 return SDValue();
55000
55001 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
55002 return SDValue();
55003 EVT OpVT = VT;
55004 SDValue Op = OpCTLZ.getOperand(0);
55005 if (VT == MVT::i8) {
55006 // Zero extend to i32 since there is not an i8 bsr.
55007 OpVT = MVT::i32;
55008 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
55009 }
55010
55011 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
55012 Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
55013 if (VT == MVT::i8)
55014 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
55015
55016 return Op;
55017}
55018
55019 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
55020 TargetLowering::DAGCombinerInfo &DCI,
55021 const X86Subtarget &Subtarget) {
55022 SDValue N0 = N->getOperand(0);
55023 SDValue N1 = N->getOperand(1);
55024 EVT VT = N->getValueType(0);
55025 SDLoc DL(N);
55026
55027 // If this is SSE1 only convert to FXOR to avoid scalarization.
55028 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
55029 return DAG.getBitcast(MVT::v4i32,
55030 DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
55031 DAG.getBitcast(MVT::v4f32, N0),
55032 DAG.getBitcast(MVT::v4f32, N1)));
55033 }
55034
55035 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
55036 return Cmp;
55037
55038 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), DL, N0, N1, DAG))
55039 return R;
55040
55041 if (SDValue R = combineBitOpWithShift(N->getOpcode(), DL, VT, N0, N1, DAG))
55042 return R;
55043
55044 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), DL, VT, N0, N1, DAG))
55045 return R;
55046
55047 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), DL, VT, N0, N1,
55048 DAG, DCI, Subtarget))
55049 return FPLogic;
55050
55051 if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
55052 return R;
55053
55054 if (DCI.isBeforeLegalizeOps())
55055 return SDValue();
55056
55057 if (SDValue SetCC = foldXor1SetCC(N, DL, DAG))
55058 return SetCC;
55059
55060 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), DL, VT, N0, N1, DAG))
55061 return R;
55062
55063 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DL, DAG))
55064 return RV;
55065
55066 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
55067 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55068 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
55069 N0.getOperand(0).getValueType().isVector() &&
55070 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55071 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
55072 return DAG.getBitcast(
55073 VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
55074 }
55075
55076 // Handle AVX512 mask widening.
55077 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
55078 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
55079 VT.getVectorElementType() == MVT::i1 &&
55080 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
55081 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
55082 return DAG.getNode(
55083 ISD::INSERT_SUBVECTOR, DL, VT, N0.getOperand(0),
55084 DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
55085 N0.getOperand(2));
55086 }
55087
55088 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
55089 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
55090 // TODO: Under what circumstances could this be performed in DAGCombine?
55091 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
55092 N0.getOperand(0).getOpcode() == N->getOpcode()) {
55093 SDValue TruncExtSrc = N0.getOperand(0);
55094 auto *N1C = dyn_cast<ConstantSDNode>(N1);
55095 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
55096 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
55097 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
55098 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
55099 return DAG.getNode(ISD::XOR, DL, VT, LHS,
55100 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
55101 }
55102 }
55103
55104 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
55105 return R;
55106
55107 return combineFneg(N, DAG, DCI, Subtarget);
55108}
55109
55110static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
55111 TargetLowering::DAGCombinerInfo &DCI,
55112 const X86Subtarget &Subtarget) {
55113 SDValue N0 = N->getOperand(0);
55114 EVT VT = N->getValueType(0);
55115
55116 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
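  // For example, assuming v16i1 is a legal mask type here:
  //   (i16 bitreverse (i16 bitcast (v16i1 X)))
  //     -> (i16 bitcast (v16i1 shuffle X, X, <15,14,...,1,0>))
  // Reversing the bits of the scalar is the same as reversing the order of the
  // mask elements.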
55117 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
55118 SDValue Src = N0.getOperand(0);
55119 EVT SrcVT = Src.getValueType();
55120 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
55121 (DCI.isBeforeLegalize() ||
55122 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
55123 Subtarget.hasSSSE3()) {
55124 unsigned NumElts = SrcVT.getVectorNumElements();
55125 SmallVector<int, 32> ReverseMask(NumElts);
55126 for (unsigned I = 0; I != NumElts; ++I)
55127 ReverseMask[I] = (NumElts - 1) - I;
55128 SDValue Rev =
55129 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
55130 return DAG.getBitcast(VT, Rev);
55131 }
55132 }
55133
55134 return SDValue();
55135}
55136
55137// Various combines to try to convert to avgceilu.
55138static SDValue combineAVG(SDNode *N, SelectionDAG &DAG,
55139 TargetLowering::DAGCombinerInfo &DCI,
55140 const X86Subtarget &Subtarget) {
55141 unsigned Opcode = N->getOpcode();
55142 SDValue N0 = N->getOperand(0);
55143 SDValue N1 = N->getOperand(1);
55144 EVT VT = N->getValueType(0);
55145 EVT SVT = VT.getScalarType();
55146 SDLoc DL(N);
55147
55148 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
55149 // Only useful on vXi8 which doesn't have good SRA handling.
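  // For example, with i8 elements, xor-ing with the 0x80 sign mask maps the
  // signed range onto the unsigned range, e.g.:
  //   avgceils(-1, 1) -> xor(avgceilu(0x7F, 0x81), 0x80) = xor(0x80, 0x80) = 0
  // which matches ceil((-1 + 1) / 2) = 0 without needing a vXi8 arithmetic
  // shift.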
55150 if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
55151 APInt SignBit = APInt::getSignMask(8);
55152 SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
55153 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
55154 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
55155 return DAG.getNode(ISD::XOR, DL, VT,
55156 DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
55157 }
55158
55159 return SDValue();
55160}
55161
55164 const X86Subtarget &Subtarget) {
55165 EVT VT = N->getValueType(0);
55166 unsigned NumBits = VT.getSizeInBits();
55167
55168 // TODO - Constant Folding.
55169
55170 // Simplify the inputs.
55171 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55172 APInt DemandedMask(APInt::getAllOnes(NumBits));
55173 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55174 return SDValue(N, 0);
55175
55176 return SDValue();
55177}
55178
55179static bool isNullFPScalarOrVectorConst(SDValue V) {
55180 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
55181}
55182
55183/// If a value is a scalar FP zero or a vector FP zero (potentially including
55184/// undefined elements), return a zero constant that may be used to fold away
55185/// that value. In the case of a vector, the returned constant will not contain
55186/// undefined elements even if the input parameter does. This makes it suitable
55187/// to be used as a replacement operand with operations (eg, bitwise-and) where
55188/// an undef should not propagate.
55189static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
55190 const X86Subtarget &Subtarget) {
55191 if (!isNullFPScalarOrVectorConst(V))
55192 return SDValue();
55193
55194 if (V.getValueType().isVector())
55195 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
55196
55197 return V;
55198}
55199
55200static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
55201 const X86Subtarget &Subtarget) {
55202 SDValue N0 = N->getOperand(0);
55203 SDValue N1 = N->getOperand(1);
55204 EVT VT = N->getValueType(0);
55205 SDLoc DL(N);
55206
55207 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
55208 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
55209 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
55210 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
55211 return SDValue();
55212
55213 auto isAllOnesConstantFP = [](SDValue V) {
55214 if (V.getSimpleValueType().isVector())
55215 return ISD::isBuildVectorAllOnes(V.getNode());
55216 auto *C = dyn_cast<ConstantFPSDNode>(V);
55217 return C && C->getConstantFPValue()->isAllOnesValue();
55218 };
55219
55220 // fand (fxor X, -1), Y --> fandn X, Y
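  // (here -1 means an all-ones bit pattern, i.e. a NOT of an FP-typed mask),
  // so a NOT+AND pair on FP-typed values becomes a single ANDNPS/ANDNPD.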
55221 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
55222 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
55223
55224 // fand X, (fxor Y, -1) --> fandn Y, X
55225 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
55226 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
55227
55228 return SDValue();
55229}
55230
55231/// Do target-specific dag combines on X86ISD::FAND nodes.
55232static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
55233 const X86Subtarget &Subtarget) {
55234 // FAND(0.0, x) -> 0.0
55235 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
55236 return V;
55237
55238 // FAND(x, 0.0) -> 0.0
55239 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55240 return V;
55241
55242 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
55243 return V;
55244
55245 return lowerX86FPLogicOp(N, DAG, Subtarget);
55246}
55247
55248/// Do target-specific dag combines on X86ISD::FANDN nodes.
55249static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
55250 const X86Subtarget &Subtarget) {
55251 // FANDN(0.0, x) -> x
55252 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55253 return N->getOperand(1);
55254
55255 // FANDN(x, 0.0) -> 0.0
55256 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55257 return V;
55258
55259 return lowerX86FPLogicOp(N, DAG, Subtarget);
55260}
55261
55262/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
55263static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
55264 TargetLowering::DAGCombinerInfo &DCI,
55265 const X86Subtarget &Subtarget) {
55266 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
55267
55268 // F[X]OR(0.0, x) -> x
55269 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55270 return N->getOperand(1);
55271
55272 // F[X]OR(x, 0.0) -> x
55273 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
55274 return N->getOperand(0);
55275
55276 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
55277 return NewVal;
55278
55279 return lowerX86FPLogicOp(N, DAG, Subtarget);
55280}
55281
55282/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
55283static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
55284 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
55285
55286 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
55287 if (!DAG.getTarget().Options.NoNaNsFPMath ||
55288 !DAG.getTarget().Options.NoSignedZerosFPMath)
55289 return SDValue();
55290
55291 // If we run in unsafe-math mode, then convert the FMIN and FMAX nodes
55292 // into FMINC and FMAXC, which are commutative operations.
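  // For example, when both NoNaNsFPMath and NoSignedZerosFPMath are set:
  //   (X86ISD::FMIN x, y) -> (X86ISD::FMINC x, y)
  // FMINC/FMAXC are freely commutable, so isel can swap the operands (e.g. to
  // fold a memory operand), which the order-sensitive FMIN/FMAX do not allow.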
55293 unsigned NewOp = 0;
55294 switch (N->getOpcode()) {
55295 default: llvm_unreachable("unknown opcode");
55296 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
55297 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
55298 }
55299
55300 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
55301 N->getOperand(0), N->getOperand(1));
55302}
55303
55304static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
55305 const X86Subtarget &Subtarget) {
55306 EVT VT = N->getValueType(0);
55307 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
55308 return SDValue();
55309
55310 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55311
55312 auto IsMinMaxLegal = [&](EVT VT) {
55313 if (!TLI.isTypeLegal(VT))
55314 return false;
55315 return VT.getScalarType() != MVT::f16 ||
55316 (Subtarget.hasFP16() && (VT == MVT::v32f16 || Subtarget.hasVLX()));
55317 };
55318
55319 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
55320 (Subtarget.hasSSE2() && VT == MVT::f64) ||
55321 (Subtarget.hasFP16() && VT == MVT::f16) ||
55322 (VT.isVector() && IsMinMaxLegal(VT))))
55323 return SDValue();
55324
55325 SDValue Op0 = N->getOperand(0);
55326 SDValue Op1 = N->getOperand(1);
55327 SDLoc DL(N);
55328 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
55329
55330 // If we don't have to respect NaN inputs, this is a direct translation to x86
55331 // min/max instructions.
55332 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
55333 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55334
55335 // If one of the operands is known non-NaN use the native min/max instructions
55336 // with the non-NaN input as second operand.
55337 if (DAG.isKnownNeverNaN(Op1))
55338 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55339 if (DAG.isKnownNeverNaN(Op0))
55340 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
55341
55342 // If we have to respect NaN inputs, this takes at least 3 instructions.
55343 // Favor a library call when operating on a scalar and minimizing code size.
55344 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
55345 return SDValue();
55346
55347 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
55348 VT);
55349
55350 // There are 4 possibilities involving NaN inputs, and these are the required
55351 // outputs:
55352 // Op1
55353 // Num NaN
55354 // ----------------
55355 // Num | Max | Op0 |
55356 // Op0 ----------------
55357 // NaN | Op1 | NaN |
55358 // ----------------
55359 //
55360 // The SSE FP max/min instructions were not designed for this case, but rather
55361 // to implement:
55362 // Min = Op1 < Op0 ? Op1 : Op0
55363 // Max = Op1 > Op0 ? Op1 : Op0
55364 //
55365 // So they always return Op0 if either input is a NaN. However, we can still
55366 // use those instructions for fmaxnum by selecting away a NaN input.
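  // In DAG terms, the sequence built below is:
  //   MinOrMax = FMIN/FMAX(Op1, Op0)      // passes Op0 through if either is NaN
  //   IsOp0Nan = SETCC(Op0, Op0, SETUO)   // true iff Op0 is NaN
  //   Result   = SELECT(IsOp0Nan, Op1, MinOrMax)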
55367
55368 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
55369 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
55370 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
55371
55372 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
55373 // are NaN, the NaN value of Op1 is the result.
55374 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
55375}
55376
55377static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
55378 TargetLowering::DAGCombinerInfo &DCI) {
55379 EVT VT = N->getValueType(0);
55380 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55381
55382 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
55383 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
55384 return SDValue(N, 0);
55385
55386 // Convert a full vector load into vzload when not all bits are needed.
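  // For example (a sketch, assuming a cvtdq2pd-style node that only reads the
  // low half of its input):
  //   (v2f64 CVTSI2P (v4i32 load m))
  //     -> (v2f64 CVTSI2P (v4i32 bitcast (VZEXT_LOAD<i64> m)))
  // so only the 64 bits that are actually converted are loaded, and the old
  // load's chain users are rewired to the narrower load.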
55387 SDValue In = N->getOperand(0);
55388 MVT InVT = In.getSimpleValueType();
55389 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55390 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55391 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55392 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
55393 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55394 MVT MemVT = MVT::getIntegerVT(NumBits);
55395 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55396 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55397 SDLoc dl(N);
55398 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
55399 DAG.getBitcast(InVT, VZLoad));
55400 DCI.CombineTo(N, Convert);
55401 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55402 DCI.recursivelyDeleteUnusedNodes(LN);
55403 return SDValue(N, 0);
55404 }
55405 }
55406
55407 return SDValue();
55408}
55409
55413 bool IsStrict = TSI.isTargetStrictFPOpcode(N->getOpcode());
55414 EVT VT = N->getValueType(0);
55415
55416 // Convert a full vector load into vzload when not all bits are needed.
55417 SDValue In = N->getOperand(IsStrict ? 1 : 0);
55418 MVT InVT = In.getSimpleValueType();
55419 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55420 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55421 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55422 LoadSDNode *LN = cast<LoadSDNode>(In);
55423 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55424 MVT MemVT = MVT::getFloatingPointVT(NumBits);
55425 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55426 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55427 SDLoc dl(N);
55428 if (IsStrict) {
55429 SDValue Convert =
55430 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
55431 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
55432 DCI.CombineTo(N, Convert, Convert.getValue(1));
55433 } else {
55434 SDValue Convert =
55435 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
55436 DCI.CombineTo(N, Convert);
55437 }
55438 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55439 DCI.recursivelyDeleteUnusedNodes(LN);
55440 return SDValue(N, 0);
55441 }
55442 }
55443
55444 return SDValue();
55445}
55446
55447/// Do target-specific dag combines on X86ISD::ANDNP nodes.
55448static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
55449 TargetLowering::DAGCombinerInfo &DCI,
55450 const X86Subtarget &Subtarget) {
55451 SDValue N0 = N->getOperand(0);
55452 SDValue N1 = N->getOperand(1);
55453 MVT VT = N->getSimpleValueType(0);
55454 int NumElts = VT.getVectorNumElements();
55455 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55456 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55457 SDLoc DL(N);
55458
55459 // ANDNP(undef, x) -> 0
55460 // ANDNP(x, undef) -> 0
55461 if (N0.isUndef() || N1.isUndef())
55462 return DAG.getConstant(0, DL, VT);
55463
55464 // ANDNP(0, x) -> x
55465 if (ISD::isBuildVectorAllZeros(N0.getNode()))
55466 return N1;
55467
55468 // ANDNP(x, 0) -> 0
55469 if (ISD::isBuildVectorAllZeros(N1.getNode()))
55470 return DAG.getConstant(0, DL, VT);
55471
55472 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
55473 if (ISD::isBuildVectorAllOnes(N1.getNode()))
55474 return DAG.getNOT(DL, N0, VT);
55475
55476 // Turn ANDNP back to AND if input is inverted.
55477 if (SDValue Not = IsNOT(N0, DAG))
55478 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
55479
55480 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask
55481 // to make use of predicated selects.
55482 // ANDN(SEXT(SETCC()),X) -> SELECT(NOT(SETCC()),X,0)
55483 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::SIGN_EXTEND) {
55484 SDValue Src = N0.getOperand(0);
55485 EVT SrcVT = Src.getValueType();
55486 if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
55487 (VT.is512BitVector() || Subtarget.hasVLX()) &&
55488 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
55489 TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
55490 return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
55491 getZeroVector(VT, Subtarget, DAG, DL));
55492 }
55493
55494 // Constant Folding
55495 APInt Undefs0, Undefs1;
55496 SmallVector<APInt> EltBits0, EltBits1;
55497 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
55498 /*AllowWholeUndefs*/ true,
55499 /*AllowPartialUndefs*/ true)) {
55500 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
55501 /*AllowWholeUndefs*/ true,
55502 /*AllowPartialUndefs*/ true)) {
55503 SmallVector<APInt> ResultBits;
55504 for (int I = 0; I != NumElts; ++I)
55505 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
55506 return getConstVector(ResultBits, VT, DAG, DL);
55507 }
55508
55509 // Constant fold NOT(N0) to allow us to use AND.
55510 // Ensure this is only performed if we can confirm that the bitcasted source
55511 // has oneuse to prevent an infinite loop with canonicalizeBitSelect.
55512 if (N0->hasOneUse()) {
55513 SDValue BC0 = peekThroughOneUseBitcasts(N0);
55514 if (BC0.getOpcode() != ISD::BITCAST) {
55515 for (APInt &Elt : EltBits0)
55516 Elt = ~Elt;
55517 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
55518 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
55519 }
55520 }
55521 }
55522
55523 // Attempt to recursively combine a bitmask ANDNP with shuffles.
55524 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
55525 SDValue Op(N, 0);
55526 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
55527 return Res;
55528
55529 // If either operand is a constant mask, then only the elements that aren't
55530 // zero are actually demanded by the other operand.
55531 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
55532 APInt UndefElts;
55533 SmallVector<APInt> EltBits;
55534 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
55535 APInt DemandedElts = APInt::getAllOnes(NumElts);
55536 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
55537 EltBits)) {
55538 DemandedBits.clearAllBits();
55539 DemandedElts.clearAllBits();
55540 for (int I = 0; I != NumElts; ++I) {
55541 if (UndefElts[I]) {
55542 // We can't assume an undef src element gives an undef dst - the
55543 // other src might be zero.
55544 DemandedBits.setAllBits();
55545 DemandedElts.setBit(I);
55546 } else if ((Invert && !EltBits[I].isAllOnes()) ||
55547 (!Invert && !EltBits[I].isZero())) {
55548 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
55549 DemandedElts.setBit(I);
55550 }
55551 }
55552 }
55553 return std::make_pair(DemandedBits, DemandedElts);
55554 };
55555 APInt Bits0, Elts0;
55556 APInt Bits1, Elts1;
55557 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
55558 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
55559
55560 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
55561 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
55562 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
55563 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
55564 if (N->getOpcode() != ISD::DELETED_NODE)
55565 DCI.AddToWorklist(N);
55566 return SDValue(N, 0);
55567 }
55568 }
55569
55570 // Folds for better commutativity:
55571 if (N1->hasOneUse()) {
55572 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
55573 if (SDValue Not = IsNOT(N1, DAG))
55574 return DAG.getNOT(
55575 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
55576
55577 // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
55578 // Zero out elements by setting the PSHUFB mask value to 0xFF.
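    // For example, if N0 is a sign-splatted mask (each byte all-ones or zero):
    //   ANDNP(M, PSHUFB(y, z)) -> PSHUFB(y, OR(z, M))
    // OR-ing the mask into the shuffle control sets bit 7 of every index in a
    // masked-off lane, and PSHUFB zeroes exactly those lanes.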
55579 if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
55580 SDValue BC1 = peekThroughOneUseBitcasts(N1);
55581 if (BC1.getOpcode() == X86ISD::PSHUFB) {
55582 EVT ShufVT = BC1.getValueType();
55583 SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
55584 DAG.getBitcast(ShufVT, N0));
55585 SDValue NewShuf =
55586 DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
55587 return DAG.getBitcast(VT, NewShuf);
55588 }
55589 }
55590 }
55591
55592 return SDValue();
55593}
55594
55595static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
55596 TargetLowering::DAGCombinerInfo &DCI) {
55597 SDValue N1 = N->getOperand(1);
55598
55599 // BT ignores high bits in the bit index operand.
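  // For example, for (X86ISD::BT (i32 x), n) only the low 5 bits of n are
  // demanded, so a mask such as (and n, 31) feeding the bit index can usually
  // be simplified away.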
55600 unsigned BitWidth = N1.getValueSizeInBits();
55601 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
55602 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
55603 if (N->getOpcode() != ISD::DELETED_NODE)
55604 DCI.AddToWorklist(N);
55605 return SDValue(N, 0);
55606 }
55607
55608 return SDValue();
55609}
55610
55611static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
55612 TargetLowering::DAGCombinerInfo &DCI) {
55613 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
55614 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
55615
55616 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
55617 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55618 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
55619 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
55620 if (N->getOpcode() != ISD::DELETED_NODE)
55621 DCI.AddToWorklist(N);
55622 return SDValue(N, 0);
55623 }
55624
55625 // Convert a full vector load into vzload when not all bits are needed.
55626 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
55627 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
55628 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
55629 SDLoc dl(N);
55630 if (IsStrict) {
55631 SDValue Convert = DAG.getNode(
55632 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
55633 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
55634 DCI.CombineTo(N, Convert, Convert.getValue(1));
55635 } else {
55636 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
55637 DAG.getBitcast(MVT::v8i16, VZLoad));
55638 DCI.CombineTo(N, Convert);
55639 }
55640
55641 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55642 DCI.recursivelyDeleteUnusedNodes(LN);
55643 return SDValue(N, 0);
55644 }
55645 }
55646 }
55647
55648 return SDValue();
55649}
55650
55651// Try to combine sext_in_reg of a cmov of constants by extending the constants.
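// For example, for an i16 result:
//   (sext_inreg (cmov C0, C1, cc), i8)
//     -> (trunc (i32 cmov (sext_inreg C0, i8), (sext_inreg C1, i8), cc))
// The extension constant-folds into C0/C1, and the cmov itself is performed at
// i32 to avoid a 16-bit cmov.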
55652static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
55653 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55654
55655 EVT DstVT = N->getValueType(0);
55656
55657 SDValue N0 = N->getOperand(0);
55658 SDValue N1 = N->getOperand(1);
55659 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55660
55661 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
55662 return SDValue();
55663
55664 // Look through single use any_extends / truncs.
55665 SDValue IntermediateBitwidthOp;
55666 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
55667 N0.hasOneUse()) {
55668 IntermediateBitwidthOp = N0;
55669 N0 = N0.getOperand(0);
55670 }
55671
55672 // See if we have a single use cmov.
55673 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
55674 return SDValue();
55675
55676 SDValue CMovOp0 = N0.getOperand(0);
55677 SDValue CMovOp1 = N0.getOperand(1);
55678
55679 // Make sure both operands are constants.
55680 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55681 !isa<ConstantSDNode>(CMovOp1.getNode()))
55682 return SDValue();
55683
55684 SDLoc DL(N);
55685
55686 // If we looked through an any_extend/trunc above, add one to the constants.
55687 if (IntermediateBitwidthOp) {
55688 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
55689 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
55690 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
55691 }
55692
55693 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
55694 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
55695
55696 EVT CMovVT = DstVT;
55697 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
55698 if (DstVT == MVT::i16) {
55699 CMovVT = MVT::i32;
55700 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
55701 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
55702 }
55703
55704 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
55705 N0.getOperand(2), N0.getOperand(3));
55706
55707 if (CMovVT != DstVT)
55708 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
55709
55710 return CMov;
55711}
55712
55713static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
55714 const X86Subtarget &Subtarget) {
55715 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55716
55717 if (SDValue V = combineSextInRegCmov(N, DAG))
55718 return V;
55719
55720 EVT VT = N->getValueType(0);
55721 SDValue N0 = N->getOperand(0);
55722 SDValue N1 = N->getOperand(1);
55723 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55724 SDLoc dl(N);
55725
55726 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE
55727 // and AVX2 since there is no sign-extended shift right operation on a
55728 // vector with 64-bit elements.
55729 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
55730 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
55731 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
55732 N0.getOpcode() == ISD::SIGN_EXTEND)) {
55733 SDValue N00 = N0.getOperand(0);
55734
55735 // EXTLOAD has a better solution on AVX2:
55736 // it may be replaced with an X86ISD::VSEXT node.
55737 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
55738 if (!ISD::isNormalLoad(N00.getNode()))
55739 return SDValue();
55740
55741 // Attempt to promote any comparison mask ops before the
55742 // SIGN_EXTEND_INREG gets in the way.
55743 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
55744 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
55745
55746 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
55747 SDValue Tmp =
55748 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
55749 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
55750 }
55751 }
55752 return SDValue();
55753}
55754
55755/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
55756/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
55757/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
55758/// opportunities to combine math ops, use an LEA, or use a complex addressing
55759/// mode. This can eliminate extend, add, and shift instructions.
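/// For example:
///   (i64 sext (add nsw (i32 x), 17)) --> (add nsw (i64 sext x), 17)
/// The widened constant is free, and if a user is e.g. (shl ..., 3) the add
/// and shift can then fold into a single LEA addressing mode.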
55760static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
55761 const X86Subtarget &Subtarget) {
55762 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
55763 Ext->getOpcode() != ISD::ZERO_EXTEND)
55764 return SDValue();
55765
55766 // TODO: This should be valid for other integer types.
55767 EVT VT = Ext->getValueType(0);
55768 if (VT != MVT::i64)
55769 return SDValue();
55770
55771 SDValue Add = Ext->getOperand(0);
55772 if (Add.getOpcode() != ISD::ADD)
55773 return SDValue();
55774
55775 SDValue AddOp0 = Add.getOperand(0);
55776 SDValue AddOp1 = Add.getOperand(1);
55777 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
55778 bool NSW = Add->getFlags().hasNoSignedWrap();
55779 bool NUW = Add->getFlags().hasNoUnsignedWrap();
55780 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
55781 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
55782
55783 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
55784 // into the 'zext'
55785 if ((Sext && !NSW) || (!Sext && !NUW))
55786 return SDValue();
55787
55788 // Having a constant operand to the 'add' ensures that we are not increasing
55789 // the instruction count because the constant is extended for free below.
55790 // A constant operand can also become the displacement field of an LEA.
55791 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
55792 if (!AddOp1C)
55793 return SDValue();
55794
55795 // Don't make the 'add' bigger if there's no hope of combining it with some
55796 // other 'add' or 'shl' instruction.
55797 // TODO: It may be profitable to generate simpler LEA instructions in place
55798 // of single 'add' instructions, but the cost model for selecting an LEA
55799 // currently has a high threshold.
55800 bool HasLEAPotential = false;
55801 for (auto *User : Ext->users()) {
55802 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
55803 HasLEAPotential = true;
55804 break;
55805 }
55806 }
55807 if (!HasLEAPotential)
55808 return SDValue();
55809
55810 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
55811 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
55812 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
55813 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
55814
55815 // The wider add is guaranteed to not wrap because both operands are
55816 // sign-extended.
55817 SDNodeFlags Flags;
55818 Flags.setNoSignedWrap(NSW);
55819 Flags.setNoUnsignedWrap(NUW);
55820 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
55821}
55822
55823// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
55824// operands and the result of CMOV is not used anywhere else - promote CMOV
55825// itself instead of promoting its result. This could be beneficial, because:
55826// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
55827// (or more) pseudo-CMOVs only when they go one-after-another and
55828// getting rid of result extension code after CMOV will help that.
55829// 2) Promotion of constant CMOV arguments is free, hence the
55830// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
55831// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
55832// promotion is also good in terms of code size.
55833// (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
55834// promotion).
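// For example:
//   (i32 zext (i16 cmov C0, C1, cc)) -> (i32 cmov (zext C0), (zext C1), cc)
// The extend of each constant folds away, and the cmov is encoded at the
// cheaper 32-bit width.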
55835static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
55836 SDValue CMovN = Extend->getOperand(0);
55837 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
55838 return SDValue();
55839
55840 EVT TargetVT = Extend->getValueType(0);
55841 unsigned ExtendOpcode = Extend->getOpcode();
55842 SDLoc DL(Extend);
55843
55844 EVT VT = CMovN.getValueType();
55845 SDValue CMovOp0 = CMovN.getOperand(0);
55846 SDValue CMovOp1 = CMovN.getOperand(1);
55847
55848 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55849 !isa<ConstantSDNode>(CMovOp1.getNode()))
55850 return SDValue();
55851
55852 // Only extend to i32 or i64.
55853 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
55854 return SDValue();
55855
55856 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from
55857 // i32 are free.
55858 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
55859 return SDValue();
55860
55861 // If this is a zero extend to i64, we should only extend to i32 and use a
55862 // free zero extend to finish.
55863 EVT ExtendVT = TargetVT;
55864 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
55865 ExtendVT = MVT::i32;
55866
55867 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
55868 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
55869
55870 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
55871 CMovN.getOperand(2), CMovN.getOperand(3));
55872
55873 // Finish extending if needed.
55874 if (ExtendVT != TargetVT)
55875 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
55876
55877 return Res;
55878}
55879
55880// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
55881// result type.
55882static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
55883 const X86Subtarget &Subtarget) {
55884 SDValue N0 = N->getOperand(0);
55885 EVT VT = N->getValueType(0);
55886 SDLoc dl(N);
55887
55888 // Only do this combine with AVX512 for vector extends.
55889 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
55890 return SDValue();
55891
55892 // Only combine legal element types.
55893 EVT SVT = VT.getVectorElementType();
55894 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
55895 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
55896 return SDValue();
55897
55898 // We don't have CMPP Instruction for vxf16
55899 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
55900 return SDValue();
55901 // We can only do this if the vector size is 256 bits or less.
55902 unsigned Size = VT.getSizeInBits();
55903 if (Size > 256 && Subtarget.useAVX512Regs())
55904 return SDValue();
55905
55906 EVT N00VT = N0.getOperand(0).getValueType();
55907
55908 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
55909 // those are the only integer compares we have.
55910 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
55911 if (N00VT.isInteger() && ISD::isUnsignedIntSetCC(CC))
55912 return SDValue();
55913
55914 // Only do this combine if the extension will be fully consumed by the setcc.
55915 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
55916 if (Size != MatchingVecType.getSizeInBits())
55917 return SDValue();
55918
55919 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
55920
55921 if (N->getOpcode() == ISD::ZERO_EXTEND)
55922 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
55923
55924 return Res;
55925}
55926
55927static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
55928 TargetLowering::DAGCombinerInfo &DCI,
55929 const X86Subtarget &Subtarget) {
55930 SDValue N0 = N->getOperand(0);
55931 EVT VT = N->getValueType(0);
55932 SDLoc DL(N);
55933
55934 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55935 if (!DCI.isBeforeLegalizeOps() &&
55936 N0.getOpcode() == X86ISD::SETCC_CARRY) {
55937 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
55938 N0->getOperand(1));
55939 bool ReplaceOtherUses = !N0.hasOneUse();
55940 DCI.CombineTo(N, Setcc);
55941 // Replace other uses with a truncate of the widened setcc_carry.
55942 if (ReplaceOtherUses) {
55943 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
55944 N0.getValueType(), Setcc);
55945 DCI.CombineTo(N0.getNode(), Trunc);
55946 }
55947
55948 return SDValue(N, 0);
55949 }
55950
55951 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
55952 return NewCMov;
55953
55954 if (!DCI.isBeforeLegalizeOps())
55955 return SDValue();
55956
55957 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
55958 return V;
55959
55960 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
55961 DAG, DCI, Subtarget))
55962 return V;
55963
55964 if (VT.isVector()) {
55965 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
55966 return R;
55967
55968 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
55969 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
55970 }
55971
55972 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
55973 return NewAdd;
55974
55975 return SDValue();
55976}
55977
55978// Inverting a constant vector is profitable if it can be eliminated and the
55979// inverted vector is already present in DAG. Otherwise, it will be loaded
55980// anyway.
55981//
55982// We determine which of the values can be completely eliminated and invert it.
55983// If both are eliminable, select a vector with the first negative element.
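// For example, if both <+2.0, -3.0> and its negation <-2.0, +3.0> already
// exist in the DAG and each is used only by FMAs, the one whose first
// non-undef element is negative is the one that is kept.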
55986 "ConstantFP build vector expected");
55987 // Check if we can eliminate V. We assume that if a value is used only in
55988 // FMAs we can eliminate it, since this function is invoked for each FMA
55989 // that uses this vector.
55990 auto IsNotFMA = [](SDNode *User) {
55991 return User->getOpcode() != ISD::FMA &&
55992 User->getOpcode() != ISD::STRICT_FMA;
55993 };
55994 if (llvm::any_of(V->users(), IsNotFMA))
55995 return SDValue();
55996
55997 SmallVector<SDValue, 8> Ops;
55998 EVT VT = V.getValueType();
55999 EVT EltVT = VT.getVectorElementType();
56000 for (const SDValue &Op : V->op_values()) {
56001 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
56002 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
56003 } else {
56004 assert(Op.isUndef());
56005 Ops.push_back(DAG.getUNDEF(EltVT));
56006 }
56007 }
56008
56009 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
56010 if (!NV)
56011 return SDValue();
56012
56013 // If an inverted version cannot be eliminated, choose it instead of the
56014 // original version.
56015 if (llvm::any_of(NV->users(), IsNotFMA))
56016 return SDValue(NV, 0);
56017
56018 // If the inverted version also can be eliminated, we have to consistently
56019 // prefer one of the values. We prefer the constant whose first non-undef
56020 // element is negative.
56021 // N.B. We need to skip undefs that may precede a value.
56022 for (const SDValue &Op : V->op_values()) {
56023 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
56024 if (Cst->isNegative())
56025 return SDValue();
56026 break;
56027 }
56028 }
56029 return SDValue(NV, 0);
56030}
56031
56032static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
56033 TargetLowering::DAGCombinerInfo &DCI,
56034 const X86Subtarget &Subtarget) {
56035 SDLoc dl(N);
56036 EVT VT = N->getValueType(0);
56038 bool IsStrict = N->isTargetOpcode()
56039 ? TSI.isTargetStrictFPOpcode(N->getOpcode())
56040 : N->isStrictFPOpcode();
56041
56042 // Let legalize expand this if it isn't a legal type yet.
56043 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56044 if (!TLI.isTypeLegal(VT))
56045 return SDValue();
56046
56047 SDValue A = N->getOperand(IsStrict ? 1 : 0);
56048 SDValue B = N->getOperand(IsStrict ? 2 : 1);
56049 SDValue C = N->getOperand(IsStrict ? 3 : 2);
56050
56051 // If the operation allows fast-math and the target does not support FMA,
56052 // split this into mul+add to avoid libcall(s).
56053 SDNodeFlags Flags = N->getFlags();
56054 if (!IsStrict && Flags.hasAllowReassociation() &&
56055 TLI.isOperationExpand(ISD::FMA, VT)) {
56056 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
56057 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
56058 }
56059
56060 EVT ScalarVT = VT.getScalarType();
56061 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
56062 !Subtarget.hasAnyFMA()) &&
56063 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()) &&
56064 !(ScalarVT == MVT::bf16 && Subtarget.hasAVX10_2()))
56065 return SDValue();
56066
56067 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
56068 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56069 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56070 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
56071 CodeSize)) {
56072 V = NegV;
56073 return true;
56074 }
56075 // Look through extract_vector_elts. If it comes from an FNEG, create a
56076 // new extract from the FNEG input.
56077 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56078 isNullConstant(V.getOperand(1))) {
56079 SDValue Vec = V.getOperand(0);
56080 if (SDValue NegV = TLI.getCheaperNegatedExpression(
56081 Vec, DAG, LegalOperations, CodeSize)) {
56082 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
56083 NegV, V.getOperand(1));
56084 return true;
56085 }
56086 }
56087 // Lookup if there is an inverted version of constant vector V in DAG.
56088 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
56089 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
56090 V = NegV;
56091 return true;
56092 }
56093 }
56094 return false;
56095 };
56096
56097 // Do not convert the passthru input of scalar intrinsics.
56098 // FIXME: We could allow negations of the lower element only.
56099 bool NegA = invertIfNegative(A);
56100 // Create a dummy use for A so that in the process of negating B or C
56101 // recursively, it is not deleted.
56102 HandleSDNode NegAHandle(A);
56103 bool NegB = invertIfNegative(B);
56104 // Similar to A, get a handle on B.
56105 HandleSDNode NegBHandle(B);
56106 bool NegC = invertIfNegative(C);
56107
56108 if (!NegA && !NegB && !NegC)
56109 return SDValue();
56110
56111 unsigned NewOpcode =
56112 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
56113
56114 // Propagate fast-math-flags to new FMA node.
56115 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
56116 if (IsStrict) {
56117 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
56118 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
56119 {N->getOperand(0), A, B, C});
56120 } else {
56121 if (N->getNumOperands() == 4)
56122 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
56123 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
56124 }
56125}
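// The negation folding above turns, for example:
//   (fma (fneg a), b, c) -> (X86ISD::FNMADD a, b, c)
//   (fma a, b, (fneg c)) -> (X86ISD::FMSUB a, b, c)
// so explicit FNEGs disappear into the fused opcode (vfnmadd / vfmsub etc.),
// assuming a suitable FMA feature is available.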
56126
56127// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
56128// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
56129static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
56130 TargetLowering::DAGCombinerInfo &DCI) {
56131 SDLoc dl(N);
56132 EVT VT = N->getValueType(0);
56133 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56134 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56135 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56136
56137 SDValue N2 = N->getOperand(2);
56138
56139 SDValue NegN2 =
56140 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
56141 if (!NegN2)
56142 return SDValue();
56143 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
56144
56145 if (N->getNumOperands() == 4)
56146 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56147 NegN2, N->getOperand(3));
56148 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56149 NegN2);
56150}
56151
56152// Try to widen the build vector and bitcast it to the type of the zext.
56153// This is a special case for the 128-bit vector types. The intention is to
56154// remove the zext and replace it with a bitcast to the wider type. While
56155// lowering, the bitcast is removed and the extra shuffling caused by the zext
56156// is avoided. For example:
56157// zext v4i16 (v4i8 build_vector (x, y, z, w)) -> bitcast v4i16 (v8i8
56158// build_vector (x, 0, y, 0, z, 0, w, 0))
56159static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
56160
56161 if (Extend->getOpcode() != ISD::ZERO_EXTEND)
56162 return SDValue();
56163
56164 EVT ExtendVT = Extend->getValueType(0);
56165
56166 SDValue BV = Extend->getOperand(0);
56167 if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
56168 return SDValue();
56169
56170 if (any_of(BV->op_values(), [](SDValue Op) { return Op.isUndef(); })) {
56171 // If the build vector has undef elements, we cannot widen it.
56172 // The widening would create a vector with more undef elements, which
56173 // is not valid.
56174 return SDValue();
56175 }
56176
56177 if (!all_of(BV->op_values(),
56178 [](SDValue Op) { return Op.getOpcode() == ISD::LOAD; })) {
56179 // If the build vector has any element other than an ISD::LOAD, we cannot
56180 // widen it.
56181 return SDValue();
56182 }
56183
56184 SDLoc dl(BV);
56185 EVT VT = BV.getValueType();
56186 EVT EltVT = BV.getOperand(0).getValueType();
56187 unsigned NumElts = VT.getVectorNumElements();
56188
56189 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56190
56191 if (TLI.getTypeAction(*DAG.getContext(), VT) !=
56192 TargetLowering::TypeWidenVector)
56193 return SDValue();
56194
56195 EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
56196 unsigned WidenNumElts = WidenVT.getVectorNumElements();
56197
56198 SmallVector<SDValue, 16> NewOps(BV->op_begin(), BV->op_end());
56199 assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
56200 // Fill the new elements with Zero.
56201 NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
56202 // Compute the step to place the elements in the right place and control the
56203 // iteration.
56204 unsigned step = WidenNumElts / NumElts;
56205 if (WidenVT.is128BitVector()) {
56206 if (step > 1 && Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
56207 for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
56208 i--, j -= step) {
56209 SDValue temp = NewOps[i];
56210 NewOps[i] = NewOps[j];
56211 NewOps[j] = temp;
56212 }
56213 // Create new build vector with WidenVT and NewOps
56214 SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
56215 // Replace the old build vector with the new one. Bitcast the
56216 // new build vector to the type of the zext.
56217 SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
56218 DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
56219 return NewBV;
56220 }
56221 }
56222 return SDValue();
56223}
56224
56225static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
56226 TargetLowering::DAGCombinerInfo &DCI,
56227 const X86Subtarget &Subtarget) {
56228 SDLoc dl(N);
56229 SDValue N0 = N->getOperand(0);
56230 EVT VT = N->getValueType(0);
56231
56232 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
56233 // FIXME: Is this needed? We don't seem to have any tests for it.
56234 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
56235 N0.getOpcode() == X86ISD::SETCC_CARRY) {
56236 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
56237 N0->getOperand(1));
56238 bool ReplaceOtherUses = !N0.hasOneUse();
56239 DCI.CombineTo(N, Setcc);
56240 // Replace other uses with a truncate of the widened setcc_carry.
56241 if (ReplaceOtherUses) {
56242 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
56243 N0.getValueType(), Setcc);
56244 DCI.CombineTo(N0.getNode(), Trunc);
56245 }
56246
56247 return SDValue(N, 0);
56248 }
56249
56250 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
56251 return NewCMov;
56252
56253 if (DCI.isBeforeLegalizeOps())
56254 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
56255 return V;
56256
56257 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
56258 DAG, DCI, Subtarget))
56259 return V;
56260
56261 if (VT.isVector())
56262 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
56263 return R;
56264
56265 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
56266 return NewAdd;
56267
56268 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
56269 return R;
56270
56271 // TODO: Combine with any target/faux shuffle.
56272 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
56274 SDValue N00 = N0.getOperand(0);
56275 SDValue N01 = N0.getOperand(1);
56276 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
56277 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
56278 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
56279 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
56280 return concatSubVectors(N00, N01, DAG, dl);
56281 }
56282 }
56283
56284 if (SDValue V = widenBuildVec(N, DAG))
56285 return V;
56286
56287 return SDValue();
56288}
56289
56290/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
56291/// pre-promote its result type since vXi1 vectors don't get promoted
56292/// during type legalization.
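/// For example, on AVX512F without BWI:
///   (v16i1 setcc (v16i16 a), (v16i16 b), cc)
///     -> (v16i1 trunc (v16i16 setcc a, b, cc))
/// so the compare happens in the promoted vXi16 domain rather than forcing an
/// illegal vXi1-producing compare through type legalization.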
56293static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
56294 SDValue RHS, ISD::CondCode CC,
56295 const SDLoc &DL, SelectionDAG &DAG,
56296 const X86Subtarget &Subtarget) {
56297 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
56298 VT.getVectorElementType() == MVT::i1 &&
56299 (OpVT.getVectorElementType() == MVT::i8 ||
56300 OpVT.getVectorElementType() == MVT::i16)) {
56301 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
56302 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
56303 }
56304 return SDValue();
56305}
56306
56307// The pattern (setcc (and (broadcast x), (2^n, 2^{n+1}, ...)), (0, 0, ...),
56308// eq/ne) is generated when using an integer as a mask. Instead of generating a
56309// broadcast + vptest, we can directly move the integer to a mask register.
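// For example, with an i8 value x broadcast to 8 lanes and the constant table
// <1,2,4,...,128> (so N = 0):
//   (setcc (and (vbroadcast x), <1,2,...,128>), 0, ne)
// lane i simply tests bit i of x, so the whole pattern becomes (roughly) a
// KMOV of x into a mask register (widened to v16i1 and narrowed back below).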
56310static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
56311 const SDLoc &DL, SelectionDAG &DAG,
56312 const X86Subtarget &Subtarget) {
56313 if (CC != ISD::SETNE && CC != ISD::SETEQ)
56314 return SDValue();
56315
56316 if (!Subtarget.hasAVX512())
56317 return SDValue();
56318
56319 if (Op0.getOpcode() != ISD::AND)
56320 return SDValue();
56321
56322 SDValue Broadcast = Op0.getOperand(0);
56323 if (Broadcast.getOpcode() != X86ISD::VBROADCAST &&
56324 Broadcast.getOpcode() != X86ISD::VBROADCAST_LOAD)
56325 return SDValue();
56326
56327 SDValue Load = Op0.getOperand(1);
56328 EVT LoadVT = Load.getSimpleValueType();
56329
56330 APInt UndefElts;
56331 SmallVector<APInt, 32> EltBits;
56333 UndefElts, EltBits,
56334 /*AllowWholeUndefs*/ true,
56335 /*AllowPartialUndefs*/ false) ||
56336 UndefElts[0] || !EltBits[0].isPowerOf2() || UndefElts.getBitWidth() > 16)
56337 return SDValue();
56338
56339 // Check if the constant pool contains only powers of 2 starting from some
56340 // 2^N. The table may also contain undefs because of widening of vector
56341 // operands.
56342 unsigned N = EltBits[0].logBase2();
56343 unsigned Len = UndefElts.getBitWidth();
56344 for (unsigned I = 1; I != Len; ++I) {
56345 if (UndefElts[I]) {
56346 if (!UndefElts.extractBits(Len - (I + 1), I + 1).isAllOnes())
56347 return SDValue();
56348 break;
56349 }
56350
56351 if (EltBits[I].getBitWidth() <= N + I || !EltBits[I].isOneBitSet(N + I))
56352 return SDValue();
56353 }
56354
56355 MVT BroadcastOpVT = Broadcast.getSimpleValueType().getVectorElementType();
56356 SDValue BroadcastOp;
56357 if (Broadcast.getOpcode() != X86ISD::VBROADCAST) {
56358 BroadcastOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, BroadcastOpVT,
56359 Broadcast, DAG.getVectorIdxConstant(0, DL));
56360 } else {
56361 BroadcastOp = Broadcast.getOperand(0);
56362 if (BroadcastOp.getValueType().isVector())
56363 return SDValue();
56364 }
56365
56366 SDValue Masked = BroadcastOp;
56367 if (N != 0) {
56368 unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits();
56369 unsigned NumDefinedElts = UndefElts.countTrailingZeros();
56370
56371 if (NumDefinedElts > BroadcastOpBitWidth)
56372 return SDValue();
56373
56374 APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts);
56375 SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp,
56376 DAG.getConstant(N, DL, BroadcastOpVT));
56377 Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue,
56378 DAG.getConstant(Mask, DL, BroadcastOpVT));
56379 }
56380 // We can't extract more than 16 bits using this pattern, because 2^{17} will
56381 // not fit in an i16 and a vXi32 where X > 16 is more than 512 bits.
56382 SDValue Trunc = DAG.getAnyExtOrTrunc(Masked, DL, MVT::i16);
56383 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, MVT::v16i1, Trunc);
56384
56385 if (CC == ISD::SETEQ)
56386 Bitcast = DAG.getNOT(DL, Bitcast, MVT::v16i1);
56387
56388 if (VT != MVT::v16i1)
56389 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Bitcast,
56390 DAG.getVectorIdxConstant(0, DL));
56391
56392 return Bitcast;
56393}
56394
56395static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
56396 TargetLowering::DAGCombinerInfo &DCI,
56397 const X86Subtarget &Subtarget) {
56398 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
56399 const SDValue LHS = N->getOperand(0);
56400 const SDValue RHS = N->getOperand(1);
56401 EVT VT = N->getValueType(0);
56402 EVT OpVT = LHS.getValueType();
56403 SDLoc DL(N);
56404
56405 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56406 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
56407 Subtarget))
56408 return V;
56409 }
56410
56411 if (VT == MVT::i1) {
56412 X86::CondCode X86CC;
56413 if (SDValue V =
56414 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
56415 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
56416 }
56417
56418 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56419 if (OpVT.isScalarInteger()) {
56420 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
56421 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
56422 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
56423 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
56424 if (N0.getOperand(0) == N1)
56425 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56426 N0.getOperand(1));
56427 if (N0.getOperand(1) == N1)
56428 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56429 N0.getOperand(0));
56430 }
56431 return SDValue();
56432 };
56433 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
56434 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56435 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
56436 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56437
56438 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
56439 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
56440 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
56441 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
56442 if (N0.getOperand(0) == N1)
56443 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56444 DAG.getNOT(DL, N0.getOperand(1), OpVT));
56445 if (N0.getOperand(1) == N1)
56446 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56447 DAG.getNOT(DL, N0.getOperand(0), OpVT));
56448 }
56449 return SDValue();
56450 };
56451 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
56452 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56453 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
56454 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56455
56456 // cmpeq(trunc(x),C) --> cmpeq(x,C)
56457 // cmpne(trunc(x),C) --> cmpne(x,C)
56458 // iff x upper bits are zero.
56459 if (LHS.getOpcode() == ISD::TRUNCATE &&
56460 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
56462 EVT SrcVT = LHS.getOperand(0).getValueType();
56463 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
56464 OpVT.getScalarSizeInBits());
56465 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56466 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
56467 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
56468 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
56469 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
56470 }
56471
56472 // With C as a power of 2 and C != 0 and C != INT_MIN:
56473 // icmp eq Abs(X) C ->
56474 // (icmp eq A, C) | (icmp eq A, -C)
56475 // icmp ne Abs(X) C ->
56476 // (icmp ne A, C) & (icmp ne A, -C)
56477 // Both of these patterns can be better optimized in
56478 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
56479 // integers which is checked above.
56480 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
56481 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
56482 const APInt &CInt = C->getAPIntValue();
56483 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
56484 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
56485 SDValue BaseOp = LHS.getOperand(0);
56486 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
56487 SDValue SETCC1 = DAG.getSetCC(
56488 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
56489 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
56490 SETCC0, SETCC1);
56491 }
56492 }
56493 }
56494 }
56495 }
56496
56497 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
56498 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
56499 // Using temporaries to avoid messing up operand ordering for later
56500 // transformations if this doesn't work.
56501 SDValue Op0 = LHS;
56502 SDValue Op1 = RHS;
56503 ISD::CondCode TmpCC = CC;
56504 // Put build_vector on the right.
56505 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
56506 std::swap(Op0, Op1);
56507 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
56508 }
56509
56510 bool IsSEXT0 =
56511 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
56512 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
56513 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
56514
56515 if (IsSEXT0 && IsVZero1) {
56516 assert(VT == Op0.getOperand(0).getValueType() &&
56517 "Unexpected operand type");
56518 if (TmpCC == ISD::SETGT)
56519 return DAG.getConstant(0, DL, VT);
56520 if (TmpCC == ISD::SETLE)
56521 return DAG.getConstant(1, DL, VT);
56522 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
56523 return DAG.getNOT(DL, Op0.getOperand(0), VT);
56524
56525 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
56526 "Unexpected condition code!");
56527 return Op0.getOperand(0);
56528 }
56529
56530 if (IsVZero1)
56531 if (SDValue V =
56532 combineAVX512SetCCToKMOV(VT, Op0, TmpCC, DL, DAG, Subtarget))
56533 return V;
56534 }
56535
56536 // Try and make unsigned vector comparisons signed. On pre-AVX512 targets
56537 // there are only signed comparisons (`PCMPGT`), and on AVX512 it's often
56538 // better to use `PCMPGT` if the result is meant to stay in a vector (and if
56539 // it's going to a mask, there are signed AVX512 comparisons).
56540 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
56541 bool CanMakeSigned = false;
56542 if (ISD::isUnsignedIntSetCC(CC)) {
56543 KnownBits CmpKnown =
56545 // If we know LHS/RHS share the same sign bit at each element we can
56546 // make this signed.
56547 // NOTE: `computeKnownBits` on a vector type aggregates common bits
56548 // across all lanes. So a pattern where the sign varies from lane to
56549 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
56550 // missed. We could get around this by demanding each lane
56551 // independently, but this isn't the most important optimization and
56552 // that may eat into compile time.
56553 CanMakeSigned =
56554 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
56555 }
56556 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
56557 SDValue LHSOut = LHS;
56558 SDValue RHSOut = RHS;
56559 ISD::CondCode NewCC = CC;
56560 switch (CC) {
56561 case ISD::SETGE:
56562 case ISD::SETUGE:
56563 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
56564 /*NSW*/ true))
56565 LHSOut = NewLHS;
56566 else if (SDValue NewRHS = incDecVectorConstant(
56567 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
56568 RHSOut = NewRHS;
56569 else
56570 break;
56571
56572 [[fallthrough]];
56573 case ISD::SETUGT:
56574 NewCC = ISD::SETGT;
56575 break;
56576
56577 case ISD::SETLE:
56578 case ISD::SETULE:
56579 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
56580 /*NSW*/ true))
56581 LHSOut = NewLHS;
56582 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
56583 /*NSW*/ true))
56584 RHSOut = NewRHS;
56585 else
56586 break;
56587
56588 [[fallthrough]];
56589 case ISD::SETULT:
56590 // Will be swapped to SETGT in LowerVSETCC*.
56591 NewCC = ISD::SETLT;
56592 break;
56593 default:
56594 break;
56595 }
56596 if (NewCC != CC) {
56597 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
56598 NewCC, DL, DAG, Subtarget))
56599 return R;
56600 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
56601 }
56602 }
56603 }
56604
56605 if (SDValue R =
56606 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
56607 return R;
56608
56609 // In the middle end transforms:
56610 // `(or (icmp eq X, C), (icmp eq X, C+1))`
56611 // -> `(icmp ult (add x, -C), 2)`
56612 // Likewise inverted cases with `ugt`.
56613 //
56614 // Since x86, pre-AVX512, doesn't have unsigned vector compares, this results
56615 // in worse codegen. So, undo the middle-end transform and go back to the
56616 // `(or (icmp eq), (icmp eq))` form.
56617 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
56618 // the xmm approach.
56619 //
56620 // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
56621 // ne))` as it doesn't end up reducing the instruction count.
56622 // TODO: We might want to do this for avx512 as well if we `sext` the result.
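  // For example, with v4i32 on an SSE/AVX2 target:
  //   (setcc (add x, -5), 2, ult)           ; came from (x == 5) || (x == 6)
  //     -> (or (setcc x, 5, eq), (setcc x, 6, eq))
  // which maps onto two PCMPEQDs and a POR instead of an unsigned compare that
  // the vector ISA cannot express directly.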
56623 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
56624 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
56625 !Subtarget.hasAVX512() &&
56626 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
56627 Subtarget.hasAVX2()) &&
56628 LHS.hasOneUse()) {
56629
56630 APInt CmpC;
56631 SDValue AddC = LHS.getOperand(1);
56632 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
56634 // See which form we have depending on the constant/condition.
56635 SDValue C0 = SDValue();
56636 SDValue C1 = SDValue();
56637
56638 // If we had `(add x, -1)` and can lower with `umin`, don't transform, as
56639 // we would end up generating an additional constant. Keeping it in the
56640 // current form has a slight latency cost, but it's probably worth saving a
56641 // constant.
56644 // Pass
56645 }
56646 // Normal Cases
56647 else if ((CC == ISD::SETULT && CmpC == 2) ||
56648 (CC == ISD::SETULE && CmpC == 1)) {
56649 // These will constant fold.
56650 C0 = DAG.getNegative(AddC, DL, OpVT);
56651 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
56652 DAG.getAllOnesConstant(DL, OpVT));
56653 }
56654 // Inverted Cases
56655 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
56656 (CC == ISD::SETUGE && (-CmpC) == 2)) {
56657 // These will constant fold.
56658 C0 = DAG.getNOT(DL, AddC, OpVT);
56659 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
56660 DAG.getAllOnesConstant(DL, OpVT));
56661 }
56662 if (C0 && C1) {
56663 SDValue NewLHS =
56664 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
56665 SDValue NewRHS =
56666 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
56667 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
56668 }
56669 }
56670 }
56671
56672 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
56673 // to avoid scalarization via legalization because v4i32 is not a legal type.
56674 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
56675 LHS.getValueType() == MVT::v4f32)
56676 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
56677
56678 // X pred 0.0 --> X pred -X
56679 // If the negation of X already exists, use it in the comparison. This removes
56680 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
56681 // instructions in patterns with a 'select' node.
56683 SDVTList FNegVT = DAG.getVTList(OpVT);
56684 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
56685 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
56686 }
56687
56688 return SDValue();
56689}
56690
56691 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
56692 TargetLowering::DAGCombinerInfo &DCI,
56693 const X86Subtarget &Subtarget) {
56694 SDValue Src = N->getOperand(0);
56695 MVT SrcVT = Src.getSimpleValueType();
56696 MVT VT = N->getSimpleValueType(0);
56697 unsigned NumBits = VT.getScalarSizeInBits();
56698 unsigned NumElts = SrcVT.getVectorNumElements();
56699 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
56700 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
56701
56702 // Perform constant folding.
56703 APInt UndefElts;
56704 SmallVector<APInt, 32> EltBits;
56705 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
56706 /*AllowWholeUndefs*/ true,
56707 /*AllowPartialUndefs*/ true)) {
56708 APInt Imm(32, 0);
56709 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
56710 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56711 Imm.setBit(Idx);
56712
56713 return DAG.getConstant(Imm, SDLoc(N), VT);
56714 }
56715
56716 // Look through int->fp bitcasts that don't change the element width.
56717 unsigned EltWidth = SrcVT.getScalarSizeInBits();
56718 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
56719 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
56720 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
56721
56722 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
56723 // with scalar comparisons.
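// The NOT only needs to invert the low NumElts bits of the scalar result,
// e.g. for a v4i32 source the fold emits (xor (movmsk x), 0xf).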
56724 if (SDValue NotSrc = IsNOT(Src, DAG)) {
56725 SDLoc DL(N);
56726 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56727 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
56728 return DAG.getNode(ISD::XOR, DL, VT,
56729 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
56730 DAG.getConstant(NotMask, DL, VT));
56731 }
56732
56733 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
56734 // results with scalar comparisons.
56735 if (Src.getOpcode() == X86ISD::PCMPGT &&
56736 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
56737 SDLoc DL(N);
56738 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56739 return DAG.getNode(ISD::XOR, DL, VT,
56740 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
56741 DAG.getConstant(NotMask, DL, VT));
56742 }
56743
56744 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
56745 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
56746 // iff pow2splat(c1).
56747 // Use KnownBits to determine if only a single bit is non-zero
56748 // in each element (pow2 or zero), and shift that bit to the msb.
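// e.g. with c1 == splat(0x00000010) (only bit 4 can be set), ShiftAmt is 27,
// so bit 4 of each operand is shifted up to the sign bit; the sign bit of
// NOT(XOR(shl x, shl c1)) is then set exactly where the PCMPEQ was true.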
56749 if (Src.getOpcode() == X86ISD::PCMPEQ) {
56750 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
56751 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
56752 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
56753 if (KnownLHS.countMaxPopulation() == 1 &&
56754 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
56755 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
56756 SDLoc DL(N);
56757 MVT ShiftVT = SrcVT;
56758 SDValue ShiftLHS = Src.getOperand(0);
56759 SDValue ShiftRHS = Src.getOperand(1);
56760 if (ShiftVT.getScalarType() == MVT::i8) {
56761 // vXi8 shifts - we only care about the signbit so we can use PSLLW.
56762 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
56763 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
56764 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
56765 }
56766 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56767 ShiftLHS, ShiftAmt, DAG);
56768 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56769 ShiftRHS, ShiftAmt, DAG);
56770 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
56771 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
56772 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
56773 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
56774 }
56775 }
56776
56777 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
56778 if (N->isOnlyUserOf(Src.getNode())) {
56779 SDValue SrcBC = peekThroughOneUseBitcasts(Src);
56780 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
56781 APInt UndefElts;
56782 SmallVector<APInt, 32> EltBits;
56783 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
56784 UndefElts, EltBits)) {
56785 APInt Mask = APInt::getZero(NumBits);
56786 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
56787 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56788 Mask.setBit(Idx);
56789 }
56790 SDLoc DL(N);
56791 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
56792 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
56793 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
56794 DAG.getConstant(Mask, DL, VT));
56795 }
56796 }
56797 }
56798
56799 // Simplify the inputs.
56800 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56801 APInt DemandedMask(APInt::getAllOnes(NumBits));
56802 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56803 return SDValue(N, 0);
56804
56805 return SDValue();
56806}
56807
56810 const X86Subtarget &Subtarget) {
56811 MVT VT = N->getSimpleValueType(0);
56812 unsigned NumBits = VT.getScalarSizeInBits();
56813
56814 // Simplify the inputs.
56815 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56816 APInt DemandedMask(APInt::getAllOnes(NumBits));
56817 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56818 return SDValue(N, 0);
56819
56820 return SDValue();
56821}
56822
56826 SDValue Mask = MemOp->getMask();
56827
56828 // With vector masks we only demand the upper bit of the mask.
56829 if (Mask.getScalarValueSizeInBits() != 1) {
56830 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56831 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56832 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56833 if (N->getOpcode() != ISD::DELETED_NODE)
56834 DCI.AddToWorklist(N);
56835 return SDValue(N, 0);
56836 }
56837 }
56838
56839 return SDValue();
56840}
56841
56842 static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
56843 SDValue Index, SDValue Base, SDValue Scale,
56844 SelectionDAG &DAG) {
56845 SDLoc DL(GorS);
56846
56847 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
56848 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
56849 Gather->getMask(), Base, Index, Scale } ;
56850 return DAG.getMaskedGather(Gather->getVTList(),
56851 Gather->getMemoryVT(), DL, Ops,
56852 Gather->getMemOperand(),
56853 Gather->getIndexType(),
56854 Gather->getExtensionType());
56855 }
56856 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
56857 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
56858 Scatter->getMask(), Base, Index, Scale };
56859 return DAG.getMaskedScatter(Scatter->getVTList(),
56860 Scatter->getMemoryVT(), DL,
56861 Ops, Scatter->getMemOperand(),
56862 Scatter->getIndexType(),
56863 Scatter->isTruncatingStore());
56864}
56865
56866 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
56867 TargetLowering::DAGCombinerInfo &DCI) {
56868 SDLoc DL(N);
56869 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
56870 SDValue Index = GorS->getIndex();
56871 SDValue Base = GorS->getBasePtr();
56872 SDValue Scale = GorS->getScale();
56873 EVT IndexVT = Index.getValueType();
56874 EVT IndexSVT = IndexVT.getVectorElementType();
56875 unsigned IndexWidth = Index.getScalarValueSizeInBits();
56876 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56877 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
56878
56879 if (DCI.isBeforeLegalize()) {
56880 // Attempt to move shifted index into the address scale, allows further
56881 // index truncation below.
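// e.g. an index of (shl %idx, 1) with scale 4 can trade one bit of the shift
// for a doubled scale of 8, as long as the new scale is still a legal
// 1/2/4/8 value (Log2ScaleAmt < 3 below).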
56882 if (Index.getOpcode() == ISD::SHL && IndexSVT == PtrVT &&
56883 isa<ConstantSDNode>(Scale)) {
56884 unsigned ScaleAmt = Scale->getAsZExtVal();
56885 assert(isPowerOf2_32(ScaleAmt) && "Scale must be a power of 2");
56886 unsigned Log2ScaleAmt = Log2_32(ScaleAmt);
56887 unsigned MaskBits = IndexWidth - Log2ScaleAmt;
56888 APInt DemandedBits = APInt::getLowBitsSet(IndexWidth, MaskBits);
56889 if (TLI.SimplifyDemandedBits(Index, DemandedBits, DCI)) {
56890 if (N->getOpcode() != ISD::DELETED_NODE)
56891 DCI.AddToWorklist(N);
56892 return SDValue(N, 0);
56893 }
56894 if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
56895 if (*MinShAmt >= 1 && Log2ScaleAmt < 3 &&
56896 DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
56897 SDValue ShAmt = Index.getOperand(1);
56898 SDValue NewShAmt =
56899 DAG.getNode(ISD::SUB, DL, ShAmt.getValueType(), ShAmt,
56900 DAG.getConstant(1, DL, ShAmt.getValueType()));
56901 SDValue NewIndex = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
56902 Index.getOperand(0), NewShAmt);
56903 SDValue NewScale =
56904 DAG.getConstant(ScaleAmt * 2, DL, Scale.getValueType());
56905 return rebuildGatherScatter(GorS, NewIndex, Base, NewScale, DAG);
56906 }
56907 }
56908 }
56909
56910 // Shrink indices if they are larger than 32-bits.
56911 // Only do this before legalize types since v2i64 could become v2i32.
56912 // FIXME: We could check that the type is legal if we're after legalize
56913 // types, but then we would need to construct test cases where that happens.
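// e.g. a v2i64 index whose elements have at least 33 sign bits is really a
// sign-extended 32-bit value, so it can be truncated to v2i32 and the
// gather/scatter can use 32-bit indices.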
56914 if (IndexWidth > 32 && DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
56915 EVT NewVT = IndexVT.changeVectorElementType(MVT::i32);
56916
56917 // FIXME: We could support more than just constant fold, but we need to be
56918 // careful with costing. A truncate that can be optimized out would be
56919 // fine. Otherwise we might only want to create a truncate if it avoids
56920 // a split.
56921 if (SDValue TruncIndex =
56922 DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, NewVT, Index))
56923 return rebuildGatherScatter(GorS, TruncIndex, Base, Scale, DAG);
56924
56925 // Shrink any sign/zero extends from a type 32 bits or smaller to a type
56926 // larger than 32 bits if there are sufficient sign bits. Only do this
56927 // before type legalization to avoid creating illegal types in the truncate.
56928 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
56929 Index.getOpcode() == ISD::ZERO_EXTEND) &&
56930 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
56931 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56932 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56933 }
56934
56935 // Shrink if we remove an illegal type.
56936 if (!TLI.isTypeLegal(Index.getValueType()) && TLI.isTypeLegal(NewVT)) {
56937 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56938 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56939 }
56940 }
56941 }
56942
56943 // Try to move splat adders from the index operand to the base
56944 // pointer operand. Taking care to multiply by the scale. We can only do
56945 // this when index element type is the same as the pointer type.
56946 // Otherwise we need to be sure the math doesn't wrap before the scale.
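// e.g. an index of (add %v, splat(16)) with scale 4 becomes index %v with
// the base pointer increased by 16 * 4 = 64.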
56947 if (Index.getOpcode() == ISD::ADD && IndexSVT == PtrVT &&
56948 isa<ConstantSDNode>(Scale)) {
56949 uint64_t ScaleAmt = Scale->getAsZExtVal();
56950
56951 for (unsigned I = 0; I != 2; ++I)
56952 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(I))) {
56953 BitVector UndefElts;
56954 if (SDValue Splat = BV->getSplatValue(&UndefElts)) {
56955 if (UndefElts.none()) {
56956 // If the splat value is constant we can add the scaled splat value
56957 // to the existing base.
56958 if (auto *C = dyn_cast<ConstantSDNode>(Splat)) {
56959 APInt Adder = C->getAPIntValue() * ScaleAmt;
56960 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
56961 DAG.getConstant(Adder, DL, PtrVT));
56962 SDValue NewIndex = Index.getOperand(1 - I);
56963 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56964 }
56965 // For non-constant cases, limit this to non-scaled cases.
56966 if (ScaleAmt == 1) {
56967 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base, Splat);
56968 SDValue NewIndex = Index.getOperand(1 - I);
56969 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56970 }
56971 }
56972 }
56973 // It's also possible the base is just a constant. In that case, just
56974 // replace it with 0 and move the displacement into the index.
56975 if (ScaleAmt == 1 && BV->isConstant() && isa<ConstantSDNode>(Base)) {
56976 SDValue Splat = DAG.getSplatBuildVector(IndexVT, DL, Base);
56977 // Combine the constant build_vector and the constant base.
56978 Splat =
56979 DAG.getNode(ISD::ADD, DL, IndexVT, Index.getOperand(I), Splat);
56980 // Add to the other half of the original Index add.
56981 SDValue NewIndex = DAG.getNode(ISD::ADD, DL, IndexVT,
56982 Index.getOperand(1 - I), Splat);
56983 SDValue NewBase = DAG.getConstant(0, DL, PtrVT);
56984 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56985 }
56986 }
56987 }
56988
56989 if (DCI.isBeforeLegalizeOps()) {
56990 // Make sure the index is either i32 or i64
56991 if (IndexWidth != 32 && IndexWidth != 64) {
56992 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
56993 IndexVT = IndexVT.changeVectorElementType(EltVT);
56994 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
56995 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56996 }
56997 }
56998
56999 // With vector masks we only demand the upper bit of the mask.
57000 SDValue Mask = GorS->getMask();
57001 if (Mask.getScalarValueSizeInBits() != 1) {
57002 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
57003 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
57004 if (N->getOpcode() != ISD::DELETED_NODE)
57005 DCI.AddToWorklist(N);
57006 return SDValue(N, 0);
57007 }
57008 }
57009
57010 return SDValue();
57011}
57012
57013// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
57014 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
57015 const X86Subtarget &Subtarget) {
57016 SDLoc DL(N);
57017 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
57018 SDValue EFLAGS = N->getOperand(1);
57019
57020 // Try to simplify the EFLAGS and condition code operands.
57021 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
57022 return getSETCC(CC, Flags, DL, DAG);
57023
57024 return SDValue();
57025}
57026
57027/// Optimize branch condition evaluation.
57028 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
57029 const X86Subtarget &Subtarget) {
57030 SDLoc DL(N);
57031 SDValue EFLAGS = N->getOperand(3);
57032 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
57033
57034 // Try to simplify the EFLAGS and condition code operands.
57035 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
57036 // RAUW them under us.
57037 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
57038 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
57039 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
57040 N->getOperand(1), Cond, Flags);
57041 }
57042
57043 return SDValue();
57044}
57045
57046// TODO: Could we move this to DAGCombine?
57047 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
57048 SelectionDAG &DAG) {
57049 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
57050 // to optimize away operation when it's from a constant.
57051 //
57052 // The general transformation is:
57053 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
57054 // AND(VECTOR_CMP(x,y), constant2)
57055 // constant2 = UNARYOP(constant)
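// e.g. for sint_to_fp:
//   sint_to_fp (and (vector_cmp x, y), splat(1))
//     --> and (vector_cmp x, y), bitcast(splat(1.0))
// since every compare lane is 0 or -1, each result lane is either 0.0 or the
// pre-converted constant.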
57056
57057 // Early exit if this isn't a vector operation, the operand of the
57058 // unary operation isn't a bitwise AND, or if the sizes of the operations
57059 // aren't the same.
57060 EVT VT = N->getValueType(0);
57061 bool IsStrict = N->isStrictFPOpcode();
57062 unsigned NumEltBits = VT.getScalarSizeInBits();
57063 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57064 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
57065 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
57066 VT.getSizeInBits() != Op0.getValueSizeInBits())
57067 return SDValue();
57068
57069 // Now check that the other operand of the AND is a constant. We could
57070 // make the transformation for non-constant splats as well, but it's unclear
57071 // that would be a benefit as it would not eliminate any operations, just
57072 // perform one more step in scalar code before moving to the vector unit.
57073 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
57074 // Bail out if the vector isn't a constant.
57075 if (!BV->isConstant())
57076 return SDValue();
57077
57078 // Everything checks out. Build up the new and improved node.
57079 SDLoc DL(N);
57080 EVT IntVT = BV->getValueType(0);
57081 // Create a new constant of the appropriate type for the transformed
57082 // DAG.
57083 SDValue SourceConst;
57084 if (IsStrict)
57085 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
57086 {N->getOperand(0), SDValue(BV, 0)});
57087 else
57088 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
57089 // The AND node needs bitcasts to/from an integer vector type around it.
57090 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
57091 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
57092 MaskConst);
57093 SDValue Res = DAG.getBitcast(VT, NewAnd);
57094 if (IsStrict)
57095 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
57096 return Res;
57097 }
57098
57099 return SDValue();
57100}
57101
57102/// If we are converting a value to floating-point, try to replace scalar
57103/// truncate of an extracted vector element with a bitcast. This tries to keep
57104/// the sequence on XMM registers rather than moving between vector and GPRs.
57105 static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
57106 // TODO: This is currently only used by combineSIntToFP, but it is generalized
57107 // to allow being called by any similar cast opcode.
57108 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
57109 SDValue Trunc = N->getOperand(0);
57110 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
57111 return SDValue();
57112
57113 SDValue ExtElt = Trunc.getOperand(0);
57114 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
57115 !isNullConstant(ExtElt.getOperand(1)))
57116 return SDValue();
57117
57118 EVT TruncVT = Trunc.getValueType();
57119 EVT SrcVT = ExtElt.getValueType();
57120 unsigned DestWidth = TruncVT.getSizeInBits();
57121 unsigned SrcWidth = SrcVT.getSizeInBits();
57122 if (SrcWidth % DestWidth != 0)
57123 return SDValue();
57124
57125 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
57126 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
57127 unsigned VecWidth = SrcVecVT.getSizeInBits();
57128 unsigned NumElts = VecWidth / DestWidth;
57129 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
57130 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
57131 SDLoc DL(N);
57132 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
57133 BitcastVec, ExtElt.getOperand(1));
57134 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
57135}
57136
57137 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
57138 const X86Subtarget &Subtarget) {
57139 bool IsStrict = N->isStrictFPOpcode();
57140 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57141 EVT VT = N->getValueType(0);
57142 EVT InVT = Op0.getValueType();
57143
57144 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57145 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
57146 // if hasFP16 support:
57147 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
57148 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
57149 // else
57150 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57151 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
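// e.g. (v8f32 uint_to_fp (v8i8 x)) becomes
//      (v8f32 sint_to_fp (v8i32 zero_extend (v8i8 x)))
// since the zero-extended value can never be negative.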
57152 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57153 unsigned ScalarSize = InVT.getScalarSizeInBits();
57154 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57155 ScalarSize >= 64)
57156 return SDValue();
57157 SDLoc dl(N);
57158 EVT DstVT =
57159 EVT::getVectorVT(*DAG.getContext(),
57160 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57161 : ScalarSize < 32 ? MVT::i32
57162 : MVT::i64,
57163 InVT.getVectorNumElements());
57164 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57165 if (IsStrict)
57166 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57167 {N->getOperand(0), P});
57168 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57169 }
57170
57171 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
57172 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
57173 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
57174 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57175 VT.getScalarType() != MVT::f16) {
57176 SDLoc dl(N);
57177 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57178 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57179
57180 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
57181 if (IsStrict)
57182 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57183 {N->getOperand(0), P});
57184 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57185 }
57186
57187 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
57188 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
57189 // the optimization here.
57190 SDNodeFlags Flags = N->getFlags();
57191 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
57192 if (IsStrict)
57193 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
57194 {N->getOperand(0), Op0});
57195 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
57196 }
57197
57198 return SDValue();
57199}
57200
57201 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
57202 TargetLowering::DAGCombinerInfo &DCI,
57203 const X86Subtarget &Subtarget) {
57204 // First try to optimize away the conversion entirely when it's
57205 // conditionally from a constant. Vectors only.
57206 bool IsStrict = N->isStrictFPOpcode();
57207 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
57208 return Res;
57209
57210 // Now move on to more general possibilities.
57211 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57212 EVT VT = N->getValueType(0);
57213 EVT InVT = Op0.getValueType();
57214
57215 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57216 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
57217 // if hasFP16 support:
57218 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
57219 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
57220 // else
57221 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(SEXT(vXi1~31 to vXi32))
57222 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
57223 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57224 unsigned ScalarSize = InVT.getScalarSizeInBits();
57225 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57226 ScalarSize >= 64)
57227 return SDValue();
57228 SDLoc dl(N);
57229 EVT DstVT =
57230 EVT::getVectorVT(*DAG.getContext(),
57231 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57232 : ScalarSize < 32 ? MVT::i32
57233 : MVT::i64,
57234 InVT.getVectorNumElements());
57235 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57236 if (IsStrict)
57237 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57238 {N->getOperand(0), P});
57239 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57240 }
57241
57242 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
57243 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
57244 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
57245 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57246 VT.getScalarType() != MVT::f16) {
57247 SDLoc dl(N);
57248 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57249 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57250 if (IsStrict)
57251 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57252 {N->getOperand(0), P});
57253 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57254 }
57255
57256 // Without AVX512DQ we only support i64 to float scalar conversion. For both
57257 // vectors and scalars, see if we know that the upper bits are all the sign
57258 // bit, in which case we can truncate the input to i32 and convert from that.
57259 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
57260 unsigned BitWidth = InVT.getScalarSizeInBits();
57261 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
57262 if (NumSignBits >= (BitWidth - 31)) {
57263 EVT TruncVT = MVT::i32;
57264 if (InVT.isVector())
57265 TruncVT = InVT.changeVectorElementType(TruncVT);
57266 SDLoc dl(N);
57267 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
57268 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
57269 if (IsStrict)
57270 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57271 {N->getOperand(0), Trunc});
57272 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
57273 }
57274 // If we're after legalize and the type is v2i32 we need to shuffle and
57275 // use CVTSI2P.
57276 assert(InVT == MVT::v2i64 && "Unexpected VT!");
57277 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
57278 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
57279 { 0, 2, -1, -1 });
57280 if (IsStrict)
57281 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
57282 {N->getOperand(0), Shuf});
57283 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
57284 }
57285 }
57286
57287 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
57288 // a 32-bit target where SSE doesn't support i64->FP operations.
57289 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
57290 Op0.getOpcode() == ISD::LOAD) {
57291 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
57292
57293 // This transformation is not supported if the result type is f16 or f128.
57294 if (VT == MVT::f16 || VT == MVT::f128)
57295 return SDValue();
57296
57297 // If we have AVX512DQ we can use packed conversion instructions unless
57298 // the VT is f80.
57299 if (Subtarget.hasDQI() && VT != MVT::f80)
57300 return SDValue();
57301
57302 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
57303 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
57304 std::pair<SDValue, SDValue> Tmp =
57305 Subtarget.getTargetLowering()->BuildFILD(
57306 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
57307 Ld->getPointerInfo(), Ld->getBaseAlign(), DAG);
57308 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
57309 return Tmp.first;
57310 }
57311 }
57312
57313 if (IsStrict)
57314 return SDValue();
57315
57316 if (SDValue V = combineToFPTruncExtElt(N, DAG))
57317 return V;
57318
57319 return SDValue();
57320}
57321
57323 const X86Subtarget &Subtarget) {
57324 EVT VT = N->getValueType(0);
57325 SDValue Src = N->getOperand(0);
57326 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::FRINT &&
57327 VT.getScalarType() == MVT::i32 && Src.hasOneUse())
57328 return DAG.getNode(ISD::LRINT, SDLoc(N), VT, Src.getOperand(0));
57329
57330 return SDValue();
57331}
57332
57333// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS
57335 const X86Subtarget &Subtarget) {
57336 if (!Subtarget.hasAVX10_2())
57337 return SDValue();
57338
57339 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
57340 EVT SrcVT = N->getOperand(0).getValueType();
57341 EVT DstVT = N->getValueType(0);
57342 SDLoc dl(N);
57343
57344 if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) {
57345 SDValue V2F32Value = DAG.getUNDEF(SrcVT);
57346
57347 // Concatenate the original v2f32 input and V2F32Value to create v4f32
57348 SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
57349 N->getOperand(0), V2F32Value);
57350
57351 // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node
57352 if (IsSigned)
57353 return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc);
57354
57355 return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc);
57356 }
57357 return SDValue();
57358}
57359
57360 static bool needCarryOrOverflowFlag(SDValue Flags) {
57361 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57362
57363 for (const SDNode *User : Flags->users()) {
57364 X86::CondCode CC;
57365 switch (User->getOpcode()) {
57366 default:
57367 // Be conservative.
57368 return true;
57369 case X86ISD::SETCC:
57370 case X86ISD::SETCC_CARRY:
57371 CC = (X86::CondCode)User->getConstantOperandVal(0);
57372 break;
57373 case X86ISD::BRCOND:
57374 case X86ISD::CMOV:
57375 CC = (X86::CondCode)User->getConstantOperandVal(2);
57376 break;
57377 }
57378
57379 switch (CC) {
57380 // clang-format off
57381 default: break;
57382 case X86::COND_A: case X86::COND_AE:
57383 case X86::COND_B: case X86::COND_BE:
57384 case X86::COND_O: case X86::COND_NO:
57385 case X86::COND_G: case X86::COND_GE:
57386 case X86::COND_L: case X86::COND_LE:
57387 return true;
57388 // clang-format on
57389 }
57390 }
57391
57392 return false;
57393}
57394
57395static bool onlyZeroFlagUsed(SDValue Flags) {
57396 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57397
57398 for (const SDNode *User : Flags->users()) {
57399 unsigned CCOpNo;
57400 switch (User->getOpcode()) {
57401 default:
57402 // Be conservative.
57403 return false;
57404 case X86ISD::SETCC:
57405 case X86ISD::SETCC_CARRY:
57406 CCOpNo = 0;
57407 break;
57408 case X86ISD::BRCOND:
57409 case X86ISD::CMOV:
57410 CCOpNo = 2;
57411 break;
57412 }
57413
57414 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
57415 if (CC != X86::COND_E && CC != X86::COND_NE)
57416 return false;
57417 }
57418
57419 return true;
57420}
57421
57422 static SDValue combineCMP(SDNode *N, SelectionDAG &DAG,
57423 TargetLowering::DAGCombinerInfo &DCI,
57424 const X86Subtarget &Subtarget) {
57425 // Only handle test patterns.
57426 if (!isNullConstant(N->getOperand(1)))
57427 return SDValue();
57428
57429 // If we have a CMP of a truncated binop, see if we can make a smaller binop
57430 // and use its flags directly.
57431 // TODO: Maybe we should try promoting compares that only use the zero flag
57432 // first if we can prove the upper bits with computeKnownBits?
57433 SDLoc dl(N);
57434 SDValue Op = N->getOperand(0);
57435 EVT VT = Op.getValueType();
57436 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57437
57438 if (SDValue CMP =
57439 combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget))
57440 return CMP;
57441
57442 // If we have a constant logical shift that's only used in a comparison
57443 // against zero turn it into an equivalent AND. This allows turning it into
57444 // a TEST instruction later.
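// e.g. (cmp (srl x, 16), 0) only tests whether any of the upper 16 bits are
// set, so it becomes (cmp (and x, 0xffff0000), 0), which isel can select as
// a TEST instruction.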
57445 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
57446 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
57447 onlyZeroFlagUsed(SDValue(N, 0))) {
57448 unsigned BitWidth = VT.getSizeInBits();
57449 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
57450 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
57451 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
57452 APInt Mask = Op.getOpcode() == ISD::SRL
57453 ? APInt::getHighBitsSet(BitWidth, MaskBits)
57454 : APInt::getLowBitsSet(BitWidth, MaskBits);
57455 if (Mask.isSignedIntN(32)) {
57456 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
57457 DAG.getConstant(Mask, dl, VT));
57458 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57459 DAG.getConstant(0, dl, VT));
57460 }
57461 }
57462 }
57463
57464 // If we're extracting from an AVX512 bool vector and comparing against zero,
57465 // then try to just bitcast the vector to an integer to use TEST/BT directly.
57466 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
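// e.g. testing bit 3 of a v16i1 mask becomes (and (bitcast k to i16), 8),
// which isel can select as TEST or BT.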
57467 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
57468 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
57469 SDValue Src = Op.getOperand(0);
57470 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
57471 isNullConstant(Src.getOperand(1)) &&
57472 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
57473 SDValue BoolVec = Src.getOperand(0);
57474 unsigned ShAmt = 0;
57475 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
57476 ShAmt = BoolVec.getConstantOperandVal(1);
57477 BoolVec = BoolVec.getOperand(0);
57478 }
57479 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
57480 EVT VecVT = BoolVec.getValueType();
57481 unsigned BitWidth = VecVT.getVectorNumElements();
57482 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
57483 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
57484 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
57485 Op = DAG.getBitcast(BCVT, BoolVec);
57486 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
57487 DAG.getConstant(Mask, dl, BCVT));
57488 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57489 DAG.getConstant(0, dl, BCVT));
57490 }
57491 }
57492 }
57493
57494 // Peek through any zero-extend if we're only testing for a zero result.
57495 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
57496 SDValue Src = Op.getOperand(0);
57497 EVT SrcVT = Src.getValueType();
57498 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
57499 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
57500 DAG.getConstant(0, dl, SrcVT));
57501 }
57502
57503 // Look for a truncate.
57504 if (Op.getOpcode() != ISD::TRUNCATE)
57505 return SDValue();
57506
57507 SDValue Trunc = Op;
57508 Op = Op.getOperand(0);
57509
57510 // See if we can compare with zero against the truncation source,
57511 // which should help using the Z flag from many ops. Only do this for
57512 // i32 truncated op to prevent partial-reg compares of promoted ops.
57513 EVT OpVT = Op.getValueType();
57514 APInt UpperBits =
57515 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
57516 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
57517 onlyZeroFlagUsed(SDValue(N, 0))) {
57518 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57519 DAG.getConstant(0, dl, OpVT));
57520 }
57521
57522 // After this the truncate and arithmetic op must have a single use.
57523 if (!Trunc.hasOneUse() || !Op.hasOneUse())
57524 return SDValue();
57525
57526 unsigned NewOpc;
57527 switch (Op.getOpcode()) {
57528 default: return SDValue();
57529 case ISD::AND:
57530 // Skip and with constant. We have special handling for and with immediate
57531 // during isel to generate test instructions.
57532 if (isa<ConstantSDNode>(Op.getOperand(1)))
57533 return SDValue();
57534 NewOpc = X86ISD::AND;
57535 break;
57536 case ISD::OR: NewOpc = X86ISD::OR; break;
57537 case ISD::XOR: NewOpc = X86ISD::XOR; break;
57538 case ISD::ADD:
57539 // If the carry or overflow flag is used, we can't truncate.
57540 if (needCarryOrOverflowFlag(SDValue(N, 0)))
57541 return SDValue();
57542 NewOpc = X86ISD::ADD;
57543 break;
57544 case ISD::SUB:
57545 // If the carry or overflow flag is used, we can't truncate.
57546 if (needCarryOrOverflowFlag(SDValue(N, 0)))
57547 return SDValue();
57548 NewOpc = X86ISD::SUB;
57549 break;
57550 }
57551
57552 // We found an op we can narrow. Truncate its inputs.
57553 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
57554 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
57555
57556 // Use a X86 specific opcode to avoid DAG combine messing with it.
57557 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57558 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
57559
57560 // For AND, keep a CMP so that we can match the test pattern.
57561 if (NewOpc == X86ISD::AND)
57562 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57563 DAG.getConstant(0, dl, VT));
57564
57565 // Return the flags.
57566 return Op.getValue(1);
57567}
57568
57569 static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
57570 TargetLowering::DAGCombinerInfo &DCI,
57571 const X86Subtarget &ST) {
57572 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
57573 "Expected X86ISD::ADD or X86ISD::SUB");
57574
57575 SDLoc DL(N);
57576 SDValue LHS = N->getOperand(0);
57577 SDValue RHS = N->getOperand(1);
57578 MVT VT = LHS.getSimpleValueType();
57579 bool IsSub = X86ISD::SUB == N->getOpcode();
57580 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
57581
57582 if (IsSub && isOneConstant(RHS) && !N->hasAnyUseOfValue(0))
57583 if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST))
57584 return CMP;
57585
57586 // If we don't use the flag result, simplify back to a generic ADD/SUB.
57587 if (!N->hasAnyUseOfValue(1)) {
57588 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
57589 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
57590 }
57591
57592 // Fold any similar generic ADD/SUB opcodes to reuse this node.
57593 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
57594 SDValue Ops[] = {N0, N1};
57595 SDVTList VTs = DAG.getVTList(N->getValueType(0));
57596 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
57597 SDValue Op(N, 0);
57598 if (Negate) {
57599 // Bail if this is only used by a user of the x86 add/sub.
57600 if (GenericAddSub->hasOneUse() &&
57601 GenericAddSub->user_begin()->isOnlyUserOf(N))
57602 return;
57603 Op = DAG.getNegative(Op, DL, VT);
57604 }
57605 DCI.CombineTo(GenericAddSub, Op);
57606 }
57607 };
57608 MatchGeneric(LHS, RHS, false);
57609 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
57610
57611 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
57612 // EFLAGS result doesn't change.
57613 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
57614 /*ZeroSecondOpOnly*/ true);
57615}
57616
57617 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
57618 SDValue LHS = N->getOperand(0);
57619 SDValue RHS = N->getOperand(1);
57620 SDValue BorrowIn = N->getOperand(2);
57621
57622 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
57623 MVT VT = N->getSimpleValueType(0);
57624 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57625 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
57626 }
57627
57628 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
57629 // iff the flag result is dead.
57630 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
57631 !N->hasAnyUseOfValue(1))
57632 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57633 LHS.getOperand(1), BorrowIn);
57634
57635 return SDValue();
57636}
57637
57638// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
57639 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
57640 TargetLowering::DAGCombinerInfo &DCI) {
57641 SDValue LHS = N->getOperand(0);
57642 SDValue RHS = N->getOperand(1);
57643 SDValue CarryIn = N->getOperand(2);
57644 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
57645 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
57646
57647 // Canonicalize constant to RHS.
57648 if (LHSC && !RHSC)
57649 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
57650 CarryIn);
57651
57652 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
57653 // the result is either zero or one (depending on the input carry bit).
57654 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
57655 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
57656 // We don't have a good way to replace an EFLAGS use, so only do this when
57657 // dead right now.
57658 SDValue(N, 1).use_empty()) {
57659 SDLoc DL(N);
57660 EVT VT = N->getValueType(0);
57661 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
57662 SDValue Res1 = DAG.getNode(
57663 ISD::AND, DL, VT,
57664 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
57665 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
57666 DAG.getConstant(1, DL, VT));
57667 return DCI.CombineTo(N, Res1, CarryOut);
57668 }
57669
57670 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
57671 // iff the flag result is dead.
57672 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
57673 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
57674 SDLoc DL(N);
57675 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
57676 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
57677 DAG.getConstant(0, DL, LHS.getValueType()),
57678 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
57679 }
57680
57681 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
57682 MVT VT = N->getSimpleValueType(0);
57683 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57684 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
57685 }
57686
57687 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
57688 // iff the flag result is dead.
57689 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
57690 !N->hasAnyUseOfValue(1))
57691 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57692 LHS.getOperand(1), CarryIn);
57693
57694 return SDValue();
57695}
57696
57697 static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
57698 const SDLoc &DL, EVT VT,
57699 const X86Subtarget &Subtarget) {
57700 using namespace SDPatternMatch;
57701
57702 // Example of pattern we try to detect:
57703 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
57704 //(add (build_vector (extract_elt t, 0),
57705 // (extract_elt t, 2),
57706 // (extract_elt t, 4),
57707 // (extract_elt t, 6)),
57708 // (build_vector (extract_elt t, 1),
57709 // (extract_elt t, 3),
57710 // (extract_elt t, 5),
57711 // (extract_elt t, 7)))
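// i.e. each i32 result lane is x0[2*i]*x1[2*i] + x0[2*i+1]*x1[2*i+1], which
// is exactly what (X86ISD::VPMADDWD x0, x1) computes once the inputs are
// truncated back to vXi16.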
57712
57713 if (!Subtarget.hasSSE2())
57714 return SDValue();
57715
57716 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57717 VT.getVectorNumElements() < 4 ||
57718 !isPowerOf2_32(VT.getVectorNumElements()))
57719 return SDValue();
57720
57721 SDValue Op0, Op1, Accum;
57726 m_Value(Op1))))))
57727 return SDValue();
57728
57729 // Check if one of Op0,Op1 is of the form:
57730 // (build_vector (extract_elt Mul, 0),
57731 // (extract_elt Mul, 2),
57732 // (extract_elt Mul, 4),
57733 // ...
57734 // the other is of the form:
57735 // (build_vector (extract_elt Mul, 1),
57736 // (extract_elt Mul, 3),
57737 // (extract_elt Mul, 5),
57738 // ...
57739 // and identify Mul.
57740 SDValue Mul;
57741 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
57742 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
57743 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
57744 // TODO: Be more tolerant to undefs.
57745 APInt Idx0L, Idx0H, Idx1L, Idx1H;
57746 SDValue Vec0L, Vec0H, Vec1L, Vec1H;
57747 if (!sd_match(Op0L, m_ExtractElt(m_Value(Vec0L), m_ConstInt(Idx0L))) ||
57748 !sd_match(Op0H, m_ExtractElt(m_Value(Vec0H), m_ConstInt(Idx0H))) ||
57749 !sd_match(Op1L, m_ExtractElt(m_Value(Vec1L), m_ConstInt(Idx1L))) ||
57750 !sd_match(Op1H, m_ExtractElt(m_Value(Vec1H), m_ConstInt(Idx1H))))
57751 return SDValue();
57752 // Commutativity of mul allows factors of a product to reorder.
57753 if (Idx0L.getZExtValue() > Idx1L.getZExtValue())
57754 std::swap(Idx0L, Idx1L);
57755 if (Idx0H.getZExtValue() > Idx1H.getZExtValue())
57756 std::swap(Idx0H, Idx1H);
57757 // Commutativity of add allows pairs of factors to reorder.
57758 if (Idx0L.getZExtValue() > Idx0H.getZExtValue()) {
57759 std::swap(Idx0L, Idx0H);
57760 std::swap(Idx1L, Idx1H);
57761 }
57762 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
57763 Idx1H != 2 * i + 3)
57764 return SDValue();
57765 if (!Mul) {
57766 // First time an extract_elt's source vector is visited. Must be a MUL
57767 // with 2X number of vector elements than the BUILD_VECTOR.
57768 // Both extracts must be from same MUL.
57769 Mul = Vec0L;
57770 if (Mul.getOpcode() != ISD::MUL ||
57771 Mul.getValueType().getVectorNumElements() != 2 * e)
57772 return SDValue();
57773 }
57774 // Check that the extract is from the same MUL previously seen.
57775 if (Mul != Vec0L || Mul != Vec1L || Mul != Vec0H || Mul != Vec1H)
57776 return SDValue();
57777 }
57778
57779 // Check if the Mul source can be safely shrunk.
57780 ShrinkMode Mode;
57781 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
57782 Mode == ShrinkMode::MULU16)
57783 return SDValue();
57784
57785 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57786 VT.getVectorNumElements() * 2);
57787 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
57788 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
57789
57790 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57791 ArrayRef<SDValue> Ops) {
57792 EVT InVT = Ops[0].getValueType();
57793 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
57794 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57795 InVT.getVectorNumElements() / 2);
57796 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57797 };
57798 SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDBuilder);
57799 if (Accum)
57800 R = DAG.getNode(ISD::ADD, DL, VT, R, Accum);
57801 return R;
57802}
57803
57804// Attempt to turn this pattern into PMADDWD.
57805// (add (mul (sext (build_vector)), (sext (build_vector))),
57806// (mul (sext (build_vector)), (sext (build_vector)))
57807 static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N,
57808 const SDLoc &DL, EVT VT,
57809 const X86Subtarget &Subtarget) {
57810 using namespace SDPatternMatch;
57811
57812 if (!Subtarget.hasSSE2())
57813 return SDValue();
57814
57815 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57816 VT.getVectorNumElements() < 4 ||
57817 !isPowerOf2_32(VT.getVectorNumElements()))
57818 return SDValue();
57819
57820 // All inputs need to be sign extends.
57821 // TODO: Support ZERO_EXTEND from known positive?
57822 SDValue N00, N01, N10, N11;
57823 if (!sd_match(N, m_Add(m_Mul(m_SExt(m_Value(N00)), m_SExt(m_Value(N01))),
57824 m_Mul(m_SExt(m_Value(N10)), m_SExt(m_Value(N11))))))
57825 return SDValue();
57826
57827 // Must be extending from vXi16.
57828 EVT InVT = N00.getValueType();
57829 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
57830 N10.getValueType() != InVT || N11.getValueType() != InVT)
57831 return SDValue();
57832
57833 // All inputs should be build_vectors.
57834 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
57835 N01.getOpcode() != ISD::BUILD_VECTOR ||
57836 N10.getOpcode() != ISD::BUILD_VECTOR ||
57837 N11.getOpcode() != ISD::BUILD_VECTOR)
57838 return SDValue();
57839
57840 // For each element, we need to ensure we have an odd element from one vector
57841 // multiplied by the odd element of another vector and the even element from
57842 // one of the same vectors being multiplied by the even element from the
57843 // other vector. So, for each element i, we need to make sure this operation
57844 // is being performed:
57845 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
57846 SDValue In0, In1;
57847 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
57848 SDValue N00Elt = N00.getOperand(i);
57849 SDValue N01Elt = N01.getOperand(i);
57850 SDValue N10Elt = N10.getOperand(i);
57851 SDValue N11Elt = N11.getOperand(i);
57852 // TODO: Be more tolerant to undefs.
57853 SDValue N00In, N01In, N10In, N11In;
57854 APInt IdxN00, IdxN01, IdxN10, IdxN11;
57855 if (!sd_match(N00Elt, m_ExtractElt(m_Value(N00In), m_ConstInt(IdxN00))) ||
57856 !sd_match(N01Elt, m_ExtractElt(m_Value(N01In), m_ConstInt(IdxN01))) ||
57857 !sd_match(N10Elt, m_ExtractElt(m_Value(N10In), m_ConstInt(IdxN10))) ||
57858 !sd_match(N11Elt, m_ExtractElt(m_Value(N11In), m_ConstInt(IdxN11))))
57859 return SDValue();
57860 // Add is commutative so indices can be reordered.
57861 if (IdxN00.getZExtValue() > IdxN10.getZExtValue()) {
57862 std::swap(IdxN00, IdxN10);
57863 std::swap(IdxN01, IdxN11);
57864 }
57865 // N0 indices must be the even elements. N1 indices must be the next odd elements.
57866 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
57867 IdxN11 != 2 * i + 1)
57868 return SDValue();
57869
57870 // First time we find an input capture it.
57871 if (!In0) {
57872 In0 = N00In;
57873 In1 = N01In;
57874
57875 // The input vectors must be at least as wide as the output.
57876 // If they are larger than the output, we extract a subvector below.
57877 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
57878 In1.getValueSizeInBits() < VT.getSizeInBits())
57879 return SDValue();
57880 }
57881 // Mul is commutative so the input vectors can be in any order.
57882 // Canonicalize to make the compares easier.
57883 if (In0 != N00In)
57884 std::swap(N00In, N01In);
57885 if (In0 != N10In)
57886 std::swap(N10In, N11In);
57887 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
57888 return SDValue();
57889 }
57890
57891 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57892 ArrayRef<SDValue> Ops) {
57893 EVT OpVT = Ops[0].getValueType();
57894 assert(OpVT.getScalarType() == MVT::i16 &&
57895 "Unexpected scalar element type");
57896 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
57897 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57898 OpVT.getVectorNumElements() / 2);
57899 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57900 };
57901
57902 // If the output is narrower than an input, extract the low part of the input
57903 // vector.
57904 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57905 VT.getVectorNumElements() * 2);
57906 if (OutVT16.bitsLT(In0.getValueType())) {
57907 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
57908 DAG.getVectorIdxConstant(0, DL));
57909 }
57910 if (OutVT16.bitsLT(In1.getValueType())) {
57911 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
57912 DAG.getVectorIdxConstant(0, DL));
57913 }
57914 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
57915 PMADDBuilder);
57916}
57917
57918// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
57919 // If the upper element in each pair of both VPMADDWDs is zero, then we can
57920 // merge the operand elements and use the implicit add of VPMADDWD.
57921// TODO: Add support for VPMADDUBSW (which isn't commutable).
57922 static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
57923 const SDLoc &DL, EVT VT) {
57924 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
57925 return SDValue();
57926
57927 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
57928 if (VT.getSizeInBits() > 128)
57929 return SDValue();
57930
57931 unsigned NumElts = VT.getVectorNumElements();
57932 MVT OpVT = N0.getOperand(0).getSimpleValueType();
57933 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
57934 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
57935
57936 bool Op0HiZero =
57937 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
57938 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
57939 bool Op1HiZero =
57940 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
57941 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
57942
57943 // TODO: Check for zero lower elements once we have actual codegen that
57944 // creates them.
57945 if (!Op0HiZero || !Op1HiZero)
57946 return SDValue();
57947
57948 // Create a shuffle mask packing the lower elements from each VPMADDWD.
57949 SmallVector<int> Mask;
57950 for (int i = 0; i != (int)NumElts; ++i) {
57951 Mask.push_back(2 * i);
57952 Mask.push_back(2 * (i + NumElts));
57953 }
57954
57955 SDValue LHS =
57956 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
57957 SDValue RHS =
57958 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
57959 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
57960}
57961
57962/// CMOV of constants requires materializing constant operands in registers.
57963/// Try to fold those constants into an 'add' instruction to reduce instruction
57964 /// count. We do this with CMOV rather than the generic 'select' because there are
57965/// earlier folds that may be used to turn select-of-constants into logic hacks.
57966 static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
57967 SelectionDAG &DAG,
57968 const X86Subtarget &Subtarget) {
57969 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
57970 // better because we eliminate 1-2 instructions. This transform is still
57971 // an improvement without zero operands because we trade 2 constant moves and
57972 // 1 add for 2 adds (LEAs) as long as the constants can be represented as
57973 // immediate asm operands (fit in 32-bits).
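// e.g. (add (cmov 3, 42), %x) becomes (cmov (add %x, 3), (add %x, 42)); the
// two constant materializations disappear and both adds can be selected as
// LEAs.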
57974 auto isSuitableCmov = [](SDValue V) {
57975 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
57976 return false;
57977 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
57978 !isa<ConstantSDNode>(V.getOperand(1)))
57979 return false;
57980 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
57981 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
57982 V.getConstantOperandAPInt(1).isSignedIntN(32));
57983 };
57984
57985 // Match an appropriate CMOV as the first operand of the add.
57986 SDValue Cmov = N->getOperand(0);
57987 SDValue OtherOp = N->getOperand(1);
57988 if (!isSuitableCmov(Cmov))
57989 std::swap(Cmov, OtherOp);
57990 if (!isSuitableCmov(Cmov))
57991 return SDValue();
57992
57993 // Don't remove a load folding opportunity for the add. That would neutralize
57994 // any improvements from removing constant materializations.
57995 if (X86::mayFoldLoad(OtherOp, Subtarget))
57996 return SDValue();
57997
57998 EVT VT = N->getValueType(0);
57999 SDValue FalseOp = Cmov.getOperand(0);
58000 SDValue TrueOp = Cmov.getOperand(1);
58001
58002 // We will push the add through the select, but we can potentially do better
58003 // if we know there is another add in the sequence and this is pointer math.
58004 // In that case, we can absorb an add into the trailing memory op and avoid
58005 // a 3-operand LEA which is likely slower than a 2-operand LEA.
58006 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
58007 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
58008 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
58009 all_of(N->users(), [&](SDNode *Use) {
58010 auto *MemNode = dyn_cast<MemSDNode>(Use);
58011 return MemNode && MemNode->getBasePtr().getNode() == N;
58012 })) {
58013 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
58014 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
58015 // it is possible that choosing op1 might be better.
58016 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
58017 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
58018 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
58019 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
58020 Cmov.getOperand(2), Cmov.getOperand(3));
58021 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
58022 }
58023
58024 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
58025 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
58026 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
58027 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
58028 Cmov.getOperand(3));
58029}
58030
58031 // Attempt to turn ADD(MUL(x, y), acc) -> VPMADD52L
58032 // when the upper 12 bits of x, y and MUL(x, y) are known to be 0.
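// VPMADD52L adds the low 52 bits of the 104-bit product of each 64-bit lane
// to the accumulator. If x, y and x*y all have their upper 12 bits clear, the
// product fits in 52 bits, so this matches the plain 64-bit mul+add.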
58033 static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
58034 EVT VT, const X86Subtarget &Subtarget) {
58035 using namespace SDPatternMatch;
58036 if (!VT.isVector() || VT.getScalarSizeInBits() != 64 ||
58037 (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()))
58038 return SDValue();
58039
58040 // Need AVX-512VL vector length extensions if operating on XMM/YMM registers
58041 if (!Subtarget.hasAVXIFMA() && !Subtarget.hasVLX() &&
58042 VT.getSizeInBits() < 512)
58043 return SDValue();
58044
58045 const auto TotalSize = VT.getSizeInBits();
58046 if (TotalSize < 128 || !isPowerOf2_64(TotalSize))
58047 return SDValue();
58048
58049 SDValue X, Y, Acc;
58050 if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
58051 return SDValue();
58052
58053 KnownBits KnownX = DAG.computeKnownBits(X);
58054 if (KnownX.countMinLeadingZeros() < 12)
58055 return SDValue();
58056 KnownBits KnownY = DAG.computeKnownBits(Y);
58057 if (KnownY.countMinLeadingZeros() < 12)
58058 return SDValue();
58059 KnownBits KnownMul = KnownBits::mul(KnownX, KnownY);
58060 if (KnownMul.countMinLeadingZeros() < 12)
58061 return SDValue();
58062
58063 auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL,
58064 ArrayRef<SDValue> SubOps) {
58065 EVT SubVT = SubOps[0].getValueType();
58066 assert(SubVT.getScalarSizeInBits() == 64 &&
58067 "Unexpected element size, only supports 64bit size");
58068 return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[1] /*X*/,
58069 SubOps[2] /*Y*/, SubOps[0] /*Acc*/);
58070 };
58071
58072 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder,
58073 /*CheckBWI*/ false);
58074}
58075
58076 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
58077 TargetLowering::DAGCombinerInfo &DCI,
58078 const X86Subtarget &Subtarget) {
58079 using namespace SDPatternMatch;
58080 EVT VT = N->getValueType(0);
58081 SDValue Op0 = N->getOperand(0);
58082 SDValue Op1 = N->getOperand(1);
58083 SDLoc DL(N);
58084
58085 if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
58086 return Select;
58087
58088 if (SDValue MAdd = matchPMADDWD(DAG, N, DL, VT, Subtarget))
58089 return MAdd;
58090 if (SDValue MAdd = matchPMADDWD_2(DAG, N, DL, VT, Subtarget))
58091 return MAdd;
58092 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
58093 return MAdd;
58094
58095 // Try to synthesize horizontal adds from adds of shuffles.
58096 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58097 return V;
58098
58099 // Canonicalize hidden LEA pattern:
58100 // Fold (add (sub (shl x, c), y), z) -> (sub (add (shl x, c), z), y)
58101 // iff c < 4
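// e.g. (add (sub (shl %x, 2), %y), %z) -> (sub (add (shl %x, 2), %z), %y),
// so the shl+add half can be selected as a single LEA and only the subtract
// remains as a separate instruction.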
58102 if (VT == MVT::i32 || VT == MVT::i64) {
58103 SDValue Y, Z, Shift;
58104 APInt Amt;
58105 if (sd_match(
58107 m_Shl(m_Value(), m_ConstInt(Amt))),
58108 m_Value(Y))),
58109 m_Value(Z))) &&
58110 Amt.ult(4) && !isa<ConstantSDNode>(Z)) {
58111 return DAG.getNode(ISD::SUB, DL, VT,
58112 DAG.getNode(ISD::ADD, DL, VT, Shift, Z), Y);
58113 }
58114 }
58115
58116 SDValue X, Y;
58117
58118 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
58119 // iff X and Y won't overflow.
58120 if (sd_match(Op0, m_c_BinOp(X86ISD::PSADBW, m_Value(X), m_Zero())) &&
58122 DAG.willNotOverflowAdd(/*IsSigned=*/false, X, Y)) {
58123 MVT OpVT = X.getSimpleValueType();
58124 SDValue Sum = DAG.getNode(ISD::ADD, DL, OpVT, X, Y);
58125 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
58126 getZeroVector(OpVT, Subtarget, DAG, DL));
58127 }
58128
58129 if (VT.isVector()) {
58130 EVT BoolVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
58132
58133 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
58134 // (sub Y, (sext (vXi1 X))).
58135 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y)
58136 // in generic DAG combine without a legal type check, but adding this there
58137 // caused regressions.
58138 if (DAG.getTargetLoweringInfo().isTypeLegal(BoolVT) &&
58140 m_Value(Y)))) {
58141 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, X);
58142 return DAG.getNode(ISD::SUB, DL, VT, Y, SExt);
58143 }
58144
58145 // Fold (add X, (srl Y, 7)) -> (sub X, (icmp_sgt 0, Y)) to undo instcombine
58146 // canonicalisation as we don't have good vXi8 shifts.
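// In each i8 lane, (srl Y, 7) is 1 exactly when Y is negative, and (0 > Y)
// sign-extends to all-ones exactly when Y is negative, so X + (Y >> 7) is the
// same as X - (0 > Y).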
58147 if (VT.getScalarType() == MVT::i8 &&
58149 SDValue Cmp =
58150 DAG.getSetCC(DL, BoolVT, DAG.getConstant(0, DL, VT), Y, ISD::SETGT);
58151 return DAG.getNode(ISD::SUB, DL, VT, X, DAG.getSExtOrTrunc(Cmp, DL, VT));
58152 }
58153 }
58154
58155 // Peephole for 512-bit VPDPWSSD on non-VLX targets.
58156 // TODO: Should this be part of matchPMADDWD/matchPMADDWD_2?
58157 if (Subtarget.hasVNNI() && Subtarget.useAVX512Regs() && VT == MVT::v16i32) {
58158 SDValue Accum, Lo0, Lo1, Hi0, Hi1;
58159 if (sd_match(N, m_Add(m_Value(Accum),
58162 m_Value(Lo1)),
58164 m_Value(Hi1)))))) {
58165 return DAG.getNode(X86ISD::VPDPWSSD, DL, VT, Accum,
58166 concatSubVectors(Lo0, Hi0, DAG, DL),
58167 concatSubVectors(Lo1, Hi1, DAG, DL));
58168 }
58169 }
58170
58171 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
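// ADC(Y,0,W) computes Y + 0 + carry(W), so folding X into it yields
// X + Y + carry(W), provided the inner ADC's carry output is unused
// (asserted below).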
58172 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
58173 X86::isZeroNode(Op0.getOperand(1))) {
58174 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
58175 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
58176 Op0.getOperand(0), Op0.getOperand(2));
58177 }
58178
58179 if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget))
58180 return IFMA52;
58181
58182 return combineAddOrSubToADCOrSBB(N, DL, DAG);
58183}
58184
58185// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
58186// condition comes from the subtract node that produced -X. This matches the
58187// cmov expansion for absolute value. By swapping the operands we convert abs
58188// to nabs.
58189static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1,
58190 SelectionDAG &DAG) {
58191 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
58192 return SDValue();
58193
58194 SDValue Cond = N1.getOperand(3);
58195 if (Cond.getOpcode() != X86ISD::SUB)
58196 return SDValue();
58197 assert(Cond.getResNo() == 1 && "Unexpected result number");
58198
58199 SDValue FalseOp = N1.getOperand(0);
58200 SDValue TrueOp = N1.getOperand(1);
58202
58203 // ABS condition should come from a negate operation.
58204 if ((CC == X86::COND_S || CC == X86::COND_NS) &&
58205 isNullConstant(Cond.getOperand(0))) {
58206 // Get the X and -X from the negate.
58207 SDValue NegX = Cond.getValue(0);
58208 SDValue X = Cond.getOperand(1);
58209
58210 // Cmov operands should be X and NegX. Order doesn't matter.
58211 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
58212 return SDValue();
58213
58214 // Build a new CMOV with the operands swapped.
58215 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
58216 N1.getOperand(2), Cond);
58217 // Convert sub to add.
58218 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
58219 }
58220
58221 // Handle ABD special case:
58222 // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
58223 // ABD condition should come from a pair of matching subtracts.
58224 if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) &&
58225 (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) &&
58226 (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) &&
58227 (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) &&
58228 (TrueOp.getOperand(0) == FalseOp.getOperand(1)) &&
58229 (TrueOp.getOperand(1) == FalseOp.getOperand(0))) {
58230 // Build a new CMOV with the operands swapped.
58231 return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2),
58232 Cond);
58233 }
58234
58235 return SDValue();
58236}
58237
58238 static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
58239 SDValue Op0 = N->getOperand(0);
58240 SDValue Op1 = N->getOperand(1);
58241
58242 // (sub C (zero_extend (setcc)))
58243 // =>
58244 // (add (zero_extend (setcc inverted)) C-1) if C is a nonzero immediate
58245 // Don't disturb (sub 0 setcc), which is easily done with neg.
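// For example, (sub 5, (zext (setcc cc))) becomes (add (zext (setcc !cc)), 4):
// both produce 5 when cc is false and 4 when cc is true.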
58246 EVT VT = N->getValueType(0);
58247 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
58248 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
58249 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
58250 Op1.getOperand(0).hasOneUse()) {
58251 SDValue SetCC = Op1.getOperand(0);
58254 APInt NewImm = Op0C->getAPIntValue() - 1;
58255 SDLoc DL(Op1);
58256 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
58257 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
58258 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
58259 DAG.getConstant(NewImm, DL, VT));
58260 }
58261
58262 return SDValue();
58263}
58264
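// Simplify the flag operand of X86 conditional load/store ('cload'/'cstore')
// nodes: a COND_NE test of (sub 0, X) can often be re-expressed directly in
// terms of X, as handled case by case below.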
58265 static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) {
58266 if (N->getConstantOperandVal(3) != X86::COND_NE)
58267 return SDValue();
58268
58269 SDValue Sub = N->getOperand(4);
58270 if (Sub.getOpcode() != X86ISD::SUB)
58271 return SDValue();
58272
58273 SDValue Op1 = Sub.getOperand(1);
58274
58275 if (!X86::isZeroNode(Sub.getOperand(0)))
58276 return SDValue();
58277
58278 SDLoc DL(N);
58279 SmallVector<SDValue, 5> Ops(N->op_values());
58280 if (Op1.getOpcode() == X86ISD::SETCC) {
58281 // res, flags2 = sub 0, (setcc cc, flag)
58282 // cload/cstore ..., cond_ne, flag2
58283 // ->
58284 // cload/cstore cc, flag
58285 Ops[3] = Op1.getOperand(0);
58286 Ops[4] = Op1.getOperand(1);
58287 } else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) {
58288 SDValue Src = Op1;
58289 SDValue Op10 = Op1.getOperand(0);
58290 if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1))) {
58291 // res, flags2 = sub 0, (and (xor X, -1), Y)
58292 // cload/cstore ..., cond_ne, flag2
58293 // ->
58294 // res, flags2 = sub 0, (and X, Y)
58295 // cload/cstore ..., cond_e, flag2
58296 Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0),
58297 Op1.getOperand(1));
58298 Ops[3] = DAG.getTargetConstant(X86::COND_E, DL, MVT::i8);
58299 }
58300 // res, flags2 = sub 0, (and X, Y)
58301 // cload/cstore ..., cc, flag2
58302 // ->
58303 // res, flags2 = cmp (and X, Y), 0
58304 // cload/cstore ..., cc, flag2
58305 Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Src, Sub.getOperand(0));
58306 } else {
58307 return SDValue();
58308 }
58309
58310 return DAG.getMemIntrinsicNode(N->getOpcode(), DL, N->getVTList(), Ops,
58311 cast<MemSDNode>(N)->getMemoryVT(),
58312 cast<MemSDNode>(N)->getMemOperand());
58313}
58314
58315 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
58316 TargetLowering::DAGCombinerInfo &DCI,
58317 const X86Subtarget &Subtarget) {
58318 EVT VT = N->getValueType(0);
58319 SDValue Op0 = N->getOperand(0);
58320 SDValue Op1 = N->getOperand(1);
58321 SDLoc DL(N);
58322
58323 auto IsNonOpaqueConstant = [&](SDValue Op) {
58325 /*AllowOpaques*/ false);
58326 };
58327
58328 // X86 can't encode an immediate LHS of a sub. See if we can push the
58329 // negation into a preceding instruction. If the RHS of the sub is an XOR with
58330 // one use and a constant, invert the immediate, saving one register.
58331 // However, ignore cases where C1 is 0, as those will become a NEG.
58332 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
58333 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
58334 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
58335 Op1->hasOneUse()) {
58336 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
58337 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
58338 SDValue NewAdd =
58339 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
58340 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
58341 }
58342
58343 if (SDValue V = combineSubABS(VT, DL, Op0, Op1, DAG))
58344 return V;
58345
58346 // Try to synthesize horizontal subs from subs of shuffles.
58347 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58348 return V;
58349
58350 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
58351 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
58352 X86::isZeroNode(Op1.getOperand(1))) {
58353 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58354 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
58355 Op1.getOperand(0), Op1.getOperand(2));
58356 }
58357
58358 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
58359 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
58360 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
58361 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
58362 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58363 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
58364 Op1.getOperand(1), Op1.getOperand(2));
58365 return DAG.getNode(ISD::SUB, DL, VT, ADC.getValue(0), Op1.getOperand(0));
58366 }
58367
58368 if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
58369 return V;
58370
58371 if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
58372 return V;
58373
58374 return combineSubSetcc(N, DAG);
58375}
58376
58377 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
58378 const X86Subtarget &Subtarget) {
58379 unsigned Opcode = N->getOpcode();
58380 assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) &&
58381 "Unknown PCMP opcode");
58382
58383 SDValue LHS = N->getOperand(0);
58384 SDValue RHS = N->getOperand(1);
58385 MVT VT = N->getSimpleValueType(0);
58386 unsigned EltBits = VT.getScalarSizeInBits();
58387 unsigned NumElts = VT.getVectorNumElements();
58388 SDLoc DL(N);
58389
58390 if (LHS == RHS)
58391 return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT)
58392 : DAG.getConstant(0, DL, VT);
58393
58394 // Constant Folding.
58395 // PCMPEQ(X,UNDEF) -> UNDEF
58396 // PCMPGT(X,UNDEF) -> 0
58397 // PCMPGT(UNDEF,X) -> 0
58398 APInt LHSUndefs, RHSUndefs;
58399 SmallVector<APInt> LHSBits, RHSBits;
58400 if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) &&
58401 getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) {
58402 APInt Ones = APInt::getAllOnes(EltBits);
58403 APInt Zero = APInt::getZero(EltBits);
58404 SmallVector<APInt> Results(NumElts);
58405 for (unsigned I = 0; I != NumElts; ++I) {
58406 if (Opcode == X86ISD::PCMPEQ) {
58407 Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero;
58408 } else {
58409 bool AnyUndef = LHSUndefs[I] || RHSUndefs[I];
58410 Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero;
58411 }
58412 }
58413 if (Opcode == X86ISD::PCMPEQ)
58414 return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL);
58415 return getConstVector(Results, VT, DAG, DL);
58416 }
58417
58418 return SDValue();
58419}
58420
58421// Helper to determine if we can convert an integer comparison to a float
58422 // comparison by casting the operands.
58423static std::optional<unsigned>
58424CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
58425 unsigned NumSignificantBitsRHS) {
58426 MVT SVT = VT.getScalarType();
58427 assert(SVT == MVT::f32 && "Only tested for float so far");
58428 const fltSemantics &Sem = SVT.getFltSemantics();
58429 assert((CC == ISD::SETEQ || CC == ISD::SETGT) &&
58430 "Only PCMPEQ/PCMPGT currently supported");
58431
58432 // TODO: Handle bitcastable integers.
58433
58434 // For cvt + signed compare we need lhs and rhs to be exactly representable as
58435 // an fp value.
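// For example, f32 has 24 bits of precision (including the implicit bit), so
// any integer with at most 24 significant bits converts exactly via sitofp and
// the FP compare then agrees with the integer compare.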
58436 unsigned FPPrec = APFloat::semanticsPrecision(Sem);
58437 if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS)
58438 return ISD::SINT_TO_FP;
58439
58440 return std::nullopt;
58441}
58442
58443/// Helper that combines an array of subvector ops as if they were the operands
58444 /// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
58445/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
58446 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
58447 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
58448 const X86Subtarget &Subtarget,
58449 unsigned Depth) {
58450 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
58451 unsigned EltSizeInBits = VT.getScalarSizeInBits();
58452
58453 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
58454 return DAG.getUNDEF(VT);
58455
58456 if (llvm::all_of(Ops, [](SDValue Op) {
58457 return Op.isUndef() || ISD::isBuildVectorAllZeros(Op.getNode());
58458 }))
58459 return getZeroVector(VT, Subtarget, DAG, DL);
58460
58462 return SDValue(); // Limit search depth.
58463
58464 SDValue Op0 = Ops[0];
58465 bool IsSplat = llvm::all_equal(Ops);
58466 unsigned NumOps = Ops.size();
58467 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58468 LLVMContext &Ctx = *DAG.getContext();
58469
58470 // Repeated subvectors.
58471 if (IsSplat &&
58472 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
58473 // If this broadcast is inserted into both halves, use a larger broadcast.
58474 if (Op0.getOpcode() == X86ISD::VBROADCAST)
58475 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
58476
58477 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
58478 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
58479 (Subtarget.hasAVX2() ||
58481 VT.getScalarType(), Subtarget)))
58482 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
58483 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
58484 Op0.getOperand(0),
58485 DAG.getVectorIdxConstant(0, DL)));
58486
58487 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
58488 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
58489 (Subtarget.hasAVX2() ||
58490 (EltSizeInBits >= 32 &&
58491 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
58492 Op0.getOperand(0).getValueType() == VT.getScalarType())
58493 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
58494
58495 // concat_vectors(extract_subvector(splat(x)),
58496 // extract_subvector(splat(x))) -> splat(x)
58497 // concat_vectors(extract_subvector(subv_broadcast(x)),
58498 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
58499 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58500 Op0.getOperand(0).getValueType() == VT) {
58501 SDValue SrcVec = Op0.getOperand(0);
58502 if (DAG.isSplatValue(SrcVec, /*AllowUndefs*/ false))
58503 return SrcVec;
58504 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
58505 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
58506 return SrcVec;
58507 }
58508
58509 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
58510 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
58511 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
58512 return DAG.getNode(Op0.getOpcode(), DL, VT,
58514 Op0.getOperand(0), Op0.getOperand(0)),
58515 Op0.getOperand(1));
58516 }
58517
58518 // TODO: This should go in combineX86ShufflesRecursively eventually.
58519 if (NumOps == 2) {
58520 SDValue Src0 = peekThroughBitcasts(Ops[0]);
58521 SDValue Src1 = peekThroughBitcasts(Ops[1]);
58522 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58524 EVT SrcVT0 = Src0.getOperand(0).getValueType();
58525 EVT SrcVT1 = Src1.getOperand(0).getValueType();
58526 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
58527 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
58528 const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
58529 const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
58530 // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
58531 // Only concat subvector high halves (which vperm2x128 is best at), or cases
58532 // where it should fold into a subvector broadcast.
58533 if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
58534 SrcVT1.is256BitVector()) {
58535 assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58536 (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
58537 "Bad subvector index");
58538 if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
58539 (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
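// The VPERM2X128 immediate selects one 128-bit lane per result half: bits
// [1:0] pick the low half and bits [5:4] the high half, with lanes 0-1 coming
// from the first source and lanes 2-3 from the second.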
58540 unsigned Index = 0;
58541 Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
58542 Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
58543 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58544 DAG.getBitcast(VT, Src0.getOperand(0)),
58545 DAG.getBitcast(VT, Src1.getOperand(0)),
58546 DAG.getTargetConstant(Index, DL, MVT::i8));
58547 }
58548 }
58549 // Widen extract_subvector
58550 // concat(extract_subvector(x,lo), extract_subvector(x,hi))
58551 // --> extract_subvector(x,lo)
58552 unsigned NumSubElts0 = Src0.getValueType().getVectorNumElements();
58553 if (Src0.getOperand(0) == Src1.getOperand(0) &&
58554 (SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58555 SrcIdx1 == (SrcIdx0 + NumSubElts0)) {
58556 return DAG.getBitcast(VT,
58558 Src0.getConstantOperandVal(1),
58559 DAG, DL, VT.getSizeInBits()));
58560 }
58561 }
58562 }
58563
58564 // Repeated opcode.
58565 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
58566 // but it currently struggles with different vector widths.
58567 if (llvm::all_of(Ops, [Op0](SDValue Op) {
58568 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
58569 })) {
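// Concatenate operand 'I' of each subvector op into one wide vector, peeking
// through bitcasts so the concatenation itself can be combined recursively.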
58570 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58572 for (SDValue SubOp : SubOps)
58573 Subs.push_back(SubOp.getOperand(I));
58574 // Attempt to peek through bitcasts and concat the original subvectors.
58575 EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
58576 if (SubVT.isSimple() && SubVT.isVector()) {
58577 MVT ConcatVT =
58579 SubVT.getVectorElementCount() * Subs.size());
58580 for (SDValue &Sub : Subs)
58581 Sub = DAG.getBitcast(SubVT, Sub);
58582 if (SDValue ConcatSrc = combineConcatVectorOps(DL, ConcatVT, Subs, DAG,
58583 Subtarget, Depth + 1))
58584 return DAG.getBitcast(VT, ConcatSrc);
58585 return DAG.getBitcast(
58586 VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
58587 }
58588 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58589 };
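// Returns true if concatenating operand 'Op' of every subvector op is free:
// all of them are constants, they all peek through to the same load, or they
// are consecutive extract_subvectors covering a single full-width source.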
58590 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
58591 bool AllConstants = true;
58592 bool AllSubs = true;
58593 unsigned VecSize = VT.getSizeInBits();
58594 SDValue BC0 = peekThroughBitcasts(SubOps[0].getOperand(Op));
58595 if (isa<LoadSDNode>(BC0) && all_of(SubOps, [&](SDValue SubOp) {
58596 return BC0 == peekThroughBitcasts(SubOp.getOperand(Op));
58597 }))
58598 return true;
58599 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
58600 SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op));
58601 unsigned SubSize = BC.getValueSizeInBits();
58602 unsigned EltSize = BC.getScalarValueSizeInBits();
58603 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58605 AllSubs &= BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58606 BC.getOperand(0).getValueSizeInBits() == VecSize &&
58607 (BC.getConstantOperandVal(1) * EltSize) == (I * SubSize);
58608 }
58609 return AllConstants || AllSubs;
58610 };
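// Attempt to concatenate operand 'I' of every subvector op, returning the
// concatenation if all the operands are constants or if the recursive combine
// finds a profitable form, and an empty SDValue otherwise.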
58611 auto CombineSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58612 bool AllConstants = true;
58614 for (SDValue SubOp : SubOps) {
58615 SDValue BC = peekThroughBitcasts(SubOp.getOperand(I));
58616 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58618 Subs.push_back(SubOp.getOperand(I));
58619 }
58620 if (AllConstants)
58621 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58622 return combineConcatVectorOps(DL, VT, Subs, DAG, Subtarget, Depth + 1);
58623 };
58624
58625 unsigned Opcode = Op0.getOpcode();
58626 switch (Opcode) {
58627 case ISD::BITCAST: {
58628 // TODO: Support AVX1/AVX2 bitcasts.
58630 for (SDValue SubOp : Ops)
58631 SubOps.push_back(peekThroughBitcasts(SubOp.getOperand(0)));
58632 EVT InnerVT = SubOps[0].getValueType();
58633 unsigned InnerSizeInBits = InnerVT.getScalarSizeInBits();
58634 if (!IsSplat && InnerVT.isSimple() && InnerVT.isVector() &&
58635 (Subtarget.hasBWI() ||
58636 (EltSizeInBits >= 32 && InnerSizeInBits >= 32)) &&
58637 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58638 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58639 llvm::all_of(SubOps, [InnerVT](SDValue Op) {
58640 return Op.getValueType() == InnerVT;
58641 })) {
58642 MVT ConcatSVT = InnerVT.getScalarType().getSimpleVT();
58643 MVT ConcatVT = MVT::getVectorVT(
58644 ConcatSVT, VT.getSizeInBits() / ConcatSVT.getSizeInBits());
58645 if (SDValue ConcatSrc = combineConcatVectorOps(
58646 DL, ConcatVT, SubOps, DAG, Subtarget, Depth + 1))
58647 return DAG.getBitcast(VT, ConcatSrc);
58648 }
58649 break;
58650 }
58651 case ISD::VECTOR_SHUFFLE: {
58652 // TODO: Generalize NumOps support.
58653 if (!IsSplat && NumOps == 2 &&
58654 ((VT.is256BitVector() &&
58655 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58656 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58657 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58658 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58659 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58660 if (Concat0 || Concat1 ||
58661 (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
58662 Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
58663 Subtarget.hasVBMI())) {
58664 int NumSubElts = Op0.getValueType().getVectorNumElements();
58665 SmallVector<int> NewMask;
58666 for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
58667 M = M >= NumSubElts ? M + NumSubElts : M;
58668 NewMask.push_back(M);
58669 }
58670 for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
58671 if (0 <= M)
58672 M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
58673 NewMask.push_back(M);
58674 }
58675 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
58676 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
58677 return DAG.getVectorShuffle(VT, DL, Concat0, Concat1, NewMask);
58678 }
58679 }
58680 break;
58681 }
58682 case X86ISD::VBROADCAST: {
58683 // TODO: 512-bit VBROADCAST concatenation.
58684 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
58685 return Op.getOperand(0).getValueType().is128BitVector();
58686 })) {
58687 if (VT == MVT::v4f64 || VT == MVT::v4i64)
58688 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
58689 ConcatSubOperand(VT, Ops, 0),
58690 ConcatSubOperand(VT, Ops, 0));
58691 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
58692 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
58693 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
58695 DL, VT, ConcatSubOperand(VT, Ops, 0),
58696 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
58697 }
58698 break;
58699 }
58700 case X86ISD::MOVDDUP:
58701 case X86ISD::MOVSHDUP:
58702 case X86ISD::MOVSLDUP: {
58703 if (!IsSplat && (VT.is256BitVector() ||
58704 (VT.is512BitVector() && Subtarget.useAVX512Regs())))
58705 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
58706 break;
58707 }
58708 case X86ISD::SHUFP: {
58709 if (!IsSplat &&
58710 (VT == MVT::v8f32 ||
58711 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) &&
58712 llvm::all_of(Ops, [Op0](SDValue Op) {
58713 return Op.getOperand(2) == Op0.getOperand(2);
58714 })) {
58715 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58716 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58717 if (Concat0 || Concat1)
58718 return DAG.getNode(Opcode, DL, VT,
58719 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58720 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
58721 Op0.getOperand(2));
58722 }
58723 break;
58724 }
58725 case X86ISD::UNPCKH:
58726 case X86ISD::UNPCKL: {
58727 // TODO: UNPCK should use CombineSubOperand
58728 // Don't concatenate build_vector patterns.
58729 if (!IsSplat &&
58730 ((VT.is256BitVector() &&
58731 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58732 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58733 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58734 none_of(Ops, [](SDValue Op) {
58735 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
58737 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
58739 })) {
58740 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58741 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58742 if (Concat0 || Concat1 ||
58743 (Subtarget.hasInt256() && EltSizeInBits == 64))
58744 return DAG.getNode(Opcode, DL, VT,
58745 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58746 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58747 }
58748 break;
58749 }
58750 case X86ISD::PSHUFHW:
58751 case X86ISD::PSHUFLW:
58752 case X86ISD::PSHUFD:
58753 if (!IsSplat &&
58754 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58755 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58756 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58757 llvm::all_of(Ops, [Op0](SDValue Op) {
58758 return Op.getOperand(1) == Op0.getOperand(1);
58759 })) {
58760 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58761 Op0.getOperand(1));
58762 }
58763 [[fallthrough]];
58764 case X86ISD::VPERMILPI:
58765 if (!IsSplat && EltSizeInBits == 32 &&
58766 (VT.is256BitVector() ||
58767 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58768 all_of(Ops, [&Op0](SDValue Op) {
58769 return Op0.getOperand(1) == Op.getOperand(1);
58770 })) {
58771 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
58772 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
58773 Res =
58774 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
58775 return DAG.getBitcast(VT, Res);
58776 }
58777 break;
58778 case X86ISD::VPERMILPV:
58779 if (!IsSplat && (VT.is256BitVector() ||
58780 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
58781 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58782 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58783 if (Concat0 || Concat1)
58784 return DAG.getNode(Opcode, DL, VT,
58785 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58786 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58787 }
58788 break;
58789 case X86ISD::PSHUFB:
58790 case X86ISD::PSADBW:
58791 case X86ISD::VPMADDUBSW:
58792 case X86ISD::VPMADDWD:
58793 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58794 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
58795 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
58796 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
58797 NumOps * SrcVT.getVectorNumElements());
58798 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
58799 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
58800 if (Concat0 || Concat1)
58801 return DAG.getNode(
58802 Opcode, DL, VT,
58803 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
58804 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
58805 }
58806 break;
58807 case X86ISD::VPERMV:
58808 // TODO: Handle 256-bit and NumOps == 4 cases.
58809 if (!IsSplat && NumOps == 2 &&
58810 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58811 MVT OpVT = Op0.getSimpleValueType();
58812 int NumSrcElts = OpVT.getVectorNumElements();
58813 SmallVector<int, 64> ConcatMask;
58814 for (unsigned i = 0; i != NumOps; ++i) {
58815 SmallVector<int, 64> SubMask;
58817 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58818 break;
58819 for (int M : SubMask) {
58820 if (0 <= M)
58821 M += i * NumSrcElts;
58822 ConcatMask.push_back(M);
58823 }
58824 }
58825 if (ConcatMask.size() == (NumOps * NumSrcElts))
58826 return lowerShuffleWithPERMV(DL, VT, ConcatMask,
58827 ConcatSubOperand(VT, Ops, 1),
58828 DAG.getUNDEF(VT), Subtarget, DAG);
58829 }
58830 break;
58831 case X86ISD::VPERMV3:
58832 // TODO: Handle 256-bit and NumOps == 4 cases.
58833 if (!IsSplat && NumOps == 2 &&
58834 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58835 MVT OpVT = Op0.getSimpleValueType();
58836 int NumSrcElts = OpVT.getVectorNumElements();
58837 SmallVector<int, 64> ConcatMask;
58838 for (unsigned i = 0; i != NumOps; ++i) {
58839 SmallVector<int, 64> SubMask;
58841 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58842 break;
58843 for (int M : SubMask) {
58844 if (0 <= M) {
58845 int Src = M < NumSrcElts ? 0 : 2;
58846 M += M < NumSrcElts ? 0 : NumSrcElts;
58847
58848 // Reference the lowest sub if the upper sub is the same.
58849 if (Ops[0].getOperand(Src) != Ops[i].getOperand(Src))
58850 M += i * NumSrcElts;
58851 }
58852 ConcatMask.push_back(M);
58853 }
58854 }
58855 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
58856 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58857 SDValue Concat1 = CombineSubOperand(VT, Ops, 2);
58858 if (Concat0 || Concat1)
58859 return lowerShuffleWithPERMV(
58860 DL, VT, ConcatMask,
58861 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58862 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 2), Subtarget,
58863 DAG);
58864 }
58865 }
58866 break;
58867 case X86ISD::VPERM2X128: {
58868 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
58869 assert(NumOps == 2 && "Bad concat_vectors operands");
58870 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58871 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58872 // TODO: Handle zero'd subvectors.
58873 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
58874 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3),
58875 (int)(Imm1 & 0x03), (int)((Imm1 >> 4) & 0x3)};
58876 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
58877 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58878 Ops[0].getOperand(1), DAG, DL);
58879 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58880 Ops[1].getOperand(1), DAG, DL);
58881 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
58882 DAG.getBitcast(ShuffleVT, LHS),
58883 DAG.getBitcast(ShuffleVT, RHS),
58884 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
58885 return DAG.getBitcast(VT, Res);
58886 }
58887 }
58888 break;
58889 }
58890 case X86ISD::SHUF128: {
58891 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
58892 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58893 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
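// The 512-bit SHUF128 immediate uses a 2-bit selector per result lane: lanes
// 0-1 read from the concatenated LHS and lanes 2-3 from the concatenated RHS,
// with the 0x08/0x80 bits steering lanes 1 and 3 to the upper half of their
// source (the original second operands).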
58894 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
58895 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
58896 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58897 Ops[0].getOperand(1), DAG, DL);
58898 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58899 Ops[1].getOperand(1), DAG, DL);
58900 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
58901 DAG.getTargetConstant(Imm, DL, MVT::i8));
58902 }
58903 break;
58904 }
58905 case ISD::TRUNCATE:
58906 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
58907 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58908 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
58909 SrcVT == Ops[1].getOperand(0).getValueType() &&
58910 Subtarget.useAVX512Regs() &&
58911 Subtarget.getPreferVectorWidth() >= 512 &&
58912 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
58913 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58914 return DAG.getNode(ISD::TRUNCATE, DL, VT,
58915 ConcatSubOperand(NewSrcVT, Ops, 0));
58916 }
58917 }
58918 break;
58919 case ISD::ANY_EXTEND:
58920 case ISD::SIGN_EXTEND:
58921 case ISD::ZERO_EXTEND:
58922 // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND.
58923 if (!IsSplat && NumOps == 2 &&
58924 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58925 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58926 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58927 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58928 if (SrcVT.isSimple() && SrcVT.is128BitVector() &&
58929 SrcVT == Ops[1].getOperand(0).getValueType()) {
58930 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58931 return DAG.getNode(Opcode, DL, VT,
58932 ConcatSubOperand(NewSrcVT, Ops, 0));
58933 }
58934 }
58935 break;
58939 // TODO: Handle ANY_EXTEND_INREG combos with SIGN/ZERO_EXTEND_INREG.
58940 if (!IsSplat && NumOps == 2 &&
58941 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58942 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58943 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58945 Op0.getOperand(0).getValueType() ==
58946 Ops[0].getOperand(0).getValueType()) {
58947 EVT SrcVT = Op0.getOperand(0).getValueType();
58948 unsigned NumElts = VT.getVectorNumElements();
58949 MVT UnpackSVT =
58950 MVT::getIntegerVT(SrcVT.getScalarSizeInBits() * (NumElts / 2));
58951 MVT UnpackVT =
58952 MVT::getVectorVT(UnpackSVT, 128 / UnpackSVT.getScalarSizeInBits());
58953 SDValue Unpack =
58954 DAG.getNode(X86ISD::UNPCKL, DL, UnpackVT,
58955 DAG.getBitcast(UnpackVT, Ops[0].getOperand(0)),
58956 DAG.getBitcast(UnpackVT, Ops[1].getOperand(0)));
58957 return getEXTEND_VECTOR_INREG(Opcode, DL, VT,
58958 DAG.getBitcast(SrcVT, Unpack), DAG);
58959 }
58960 break;
58961 }
58962 case X86ISD::VSHLI:
58963 case X86ISD::VSRLI:
58964 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
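// For example, (v4i64 shl X, 32) moves each low dword into the high dword and
// zeroes the low dword, which as a v8i32 shuffle with a zero vector is
// <8,0,8,2,8,4,8,6>; the srl case keeps the high dwords instead:
// <1,8,3,8,5,8,7,8>.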
58965 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
58966 llvm::all_of(Ops, [](SDValue Op) {
58967 return Op.getConstantOperandAPInt(1) == 32;
58968 })) {
58969 if (SDValue Res = CombineSubOperand(VT, Ops, 0)) {
58970 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
58971 Res = DAG.getBitcast(MVT::v8i32, Res);
58972 if (Opcode == X86ISD::VSHLI) {
58973 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58974 {8, 0, 8, 2, 8, 4, 8, 6});
58975 } else {
58976 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58977 {1, 8, 3, 8, 5, 8, 7, 8});
58978 }
58979 return DAG.getBitcast(VT, Res);
58980 }
58981 }
58982 [[fallthrough]];
58983 case X86ISD::VSRAI:
58984 case X86ISD::VSHL:
58985 case X86ISD::VSRL:
58986 case X86ISD::VSRA:
58987 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
58988 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58989 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58990 llvm::all_of(Ops, [Op0](SDValue Op) {
58991 return Op0.getOperand(1) == Op.getOperand(1);
58992 })) {
58993 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58994 Op0.getOperand(1));
58995 }
58996 break;
58997 case X86ISD::VPERMI:
58998 case X86ISD::VROTLI:
58999 case X86ISD::VROTRI:
59000 if (!IsSplat &&
59001 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
59002 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59003 llvm::all_of(Ops, [Op0](SDValue Op) {
59004 return Op0.getOperand(1) == Op.getOperand(1);
59005 })) {
59006 assert(!(Opcode == X86ISD::VPERMI &&
59007 Op0.getValueType().is128BitVector()) &&
59008 "Illegal 128-bit X86ISD::VPERMI nodes");
59009 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59010 Op0.getOperand(1));
59011 }
59012 break;
59013 case ISD::AND:
59014 case ISD::OR:
59015 case ISD::XOR:
59016 case X86ISD::ANDNP:
59017 // TODO: AVX512 targets should only use CombineSubOperand like AVX1/2.
59018 if (!IsSplat && (VT.is256BitVector() ||
59019 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59020 // Don't concatenate root AVX1 NOT patterns.
59021 // TODO: Allow NOT folding if Concat0 succeeds.
59022 if (Opcode == ISD::XOR && Depth == 0 && !Subtarget.hasInt256() &&
59023 llvm::all_of(Ops, [](SDValue X) {
59024 return ISD::isBuildVectorAllOnes(X.getOperand(1).getNode());
59025 }))
59026 break;
59027 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59028 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59029 if (Concat0 || Concat1 || Subtarget.useAVX512Regs())
59030 return DAG.getNode(Opcode, DL, VT,
59031 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59032 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59033 }
59034 break;
59035 case X86ISD::PCMPEQ:
59036 case X86ISD::PCMPGT:
59037 // TODO: 512-bit PCMPEQ/PCMPGT -> VPCMP+VPMOVM2 handling.
59038 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256()) {
59039 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59040 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59041 if (Concat0 || Concat1)
59042 return DAG.getNode(Opcode, DL, VT,
59043 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59044 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59045 break;
59046 }
59047
59048 if (!IsSplat && VT == MVT::v8i32) {
59049 // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
59050 // TODO: Handle v4f64 as well?
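// This is only done when CastIntSETCCtoFP (below) confirms every element is
// exactly representable in f32, i.e. has at most 24 significant bits, so the
// conversions are exact.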
59051 unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
59052 for (unsigned I = 0; I != NumOps; ++I) {
59053 MaxSigBitsLHS =
59054 std::max(MaxSigBitsLHS,
59055 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
59056 MaxSigBitsRHS =
59057 std::max(MaxSigBitsRHS,
59058 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
59059 if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
59060 break;
59061 }
59062
59063 ISD::CondCode ICC =
59064 Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
59065 ISD::CondCode FCC =
59067
59068 MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
59069 MVT FpVT = VT.changeVectorElementType(FpSVT);
59070
59071 if (std::optional<unsigned> CastOpc =
59072 CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
59073 SDValue LHS = CombineSubOperand(VT, Ops, 0);
59074 SDValue RHS = CombineSubOperand(VT, Ops, 1);
59075 LHS = LHS ? LHS : ConcatSubOperand(VT, Ops, 0);
59076 RHS = RHS ? RHS : ConcatSubOperand(VT, Ops, 1);
59077 LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
59078 RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);
59079
59080 bool IsAlwaysSignaling;
59081 unsigned FSETCC =
59082 translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
59083 return DAG.getBitcast(
59084 VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
59085 DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
59086 }
59087 }
59088 break;
59089 case ISD::CTPOP:
59090 case ISD::CTTZ:
59091 case ISD::CTLZ:
59094 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59095 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59096 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
59097 }
59098 break;
59100 // TODO: GF2P8AFFINEQB should use CombineSubOperand.
59101 if (!IsSplat &&
59102 (VT.is256BitVector() ||
59103 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59104 llvm::all_of(Ops, [Op0](SDValue Op) {
59105 return Op0.getOperand(2) == Op.getOperand(2);
59106 })) {
59107 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59108 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
59109 }
59110 break;
59111 case ISD::ADD:
59112 case ISD::SUB:
59113 case ISD::MUL:
59114 // TODO: Add more integer binops?
59115 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59116 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
59117 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
59118 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59119 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59120 if (Concat0 || Concat1 || llvm::all_of(Ops, [](SDValue Op) {
59121 return Op.getOperand(0) == Op.getOperand(1);
59122 }))
59123 return DAG.getNode(Opcode, DL, VT,
59124 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59125 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59126 }
59127 break;
59128 // VADD, VSUB and VMUL can execute on more ports than VINSERT and have short
59129 // latencies, so we don't concatenate them here unless doing so avoids
59130 // introducing extra VINSERTs.
59131 case ISD::FADD:
59132 case ISD::FSUB:
59133 case ISD::FMUL:
59134 if (!IsSplat && (VT.is256BitVector() ||
59135 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59136 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59137 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59138 if (Concat0 || Concat1)
59139 return DAG.getNode(Opcode, DL, VT,
59140 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59141 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59142 }
59143 break;
59144 // Always prefer to concatenate high latency FDIV instructions.
59145 case ISD::FDIV:
59146 if (!IsSplat && (VT.is256BitVector() ||
59147 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59148 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59149 ConcatSubOperand(VT, Ops, 1));
59150 }
59151 break;
59152 case X86ISD::HADD:
59153 case X86ISD::HSUB:
59154 case X86ISD::FHADD:
59155 case X86ISD::FHSUB:
59156 if (!IsSplat && VT.is256BitVector() &&
59157 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
59158 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59159 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59160 if (Concat0 || Concat1)
59161 return DAG.getNode(Opcode, DL, VT,
59162 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59163 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59164 }
59165 break;
59166 case X86ISD::PACKSS:
59167 case X86ISD::PACKUS:
59168 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59169 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59170 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
59171 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
59172 NumOps * SrcVT.getVectorNumElements());
59173 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
59174 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
59175 if (Concat0 || Concat1)
59176 return DAG.getNode(
59177 Opcode, DL, VT,
59178 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
59179 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
59180 }
59181 break;
59182 case X86ISD::VSHLD:
59183 case X86ISD::VSHRD:
59184 case X86ISD::PALIGNR:
59185 if (!IsSplat &&
59186 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59187 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
59188 llvm::all_of(Ops, [Op0](SDValue Op) {
59189 return Op0.getOperand(2) == Op.getOperand(2);
59190 })) {
59191 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59192 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59193 if (Concat0 || Concat1)
59194 return DAG.getNode(Opcode, DL, VT,
59195 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59196 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59197 Op0.getOperand(2));
59198 }
59199 break;
59200 case X86ISD::BLENDI:
59201 if (VT.is256BitVector() && NumOps == 2 &&
59202 (EltSizeInBits >= 32 ||
59203 (Subtarget.hasInt256() &&
59204 Ops[0].getOperand(2) == Ops[1].getOperand(2)))) {
59205 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59206 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59207 if (Concat0 || Concat1) {
59208 unsigned NumElts = VT.getVectorNumElements();
59209 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59210 Mask.insertBits(getBLENDIBlendMask(Ops[1]), NumElts / 2);
59211 Mask = Mask.zextOrTrunc(8);
59212 return DAG.getNode(Opcode, DL, VT,
59213 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59214 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59215 DAG.getTargetConstant(Mask, DL, MVT::i8));
59216 }
59217 }
59218 // TODO: BWI targets should only use CombineSubOperand.
59219 if (((VT.is256BitVector() && Subtarget.hasVLX()) ||
59220 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59221 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())) {
59222 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59223 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59224 if (Concat0 || Concat1 || Subtarget.useBWIRegs()) {
59225 unsigned NumElts = VT.getVectorNumElements();
59226 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59227 for (unsigned I = 1; I != NumOps; ++I)
59228 Mask.insertBits(getBLENDIBlendMask(Ops[I]), I * (NumElts / NumOps));
59229 unsigned NumMaskBits = NumElts >= 8 ? NumElts : 8;
59230 Mask = Mask.zextOrTrunc(NumMaskBits);
59231 MVT MaskSVT = MVT::getIntegerVT(NumMaskBits);
59232 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumMaskBits);
59233 SDValue Sel =
59234 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
59235 Sel = extractSubVector(Sel, 0, DAG, DL, NumElts);
59236 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
59237 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
59238 return DAG.getSelect(DL, VT, Sel, Concat1, Concat0);
59239 }
59240 }
59241 break;
59242 case ISD::VSELECT:
59243 // TODO: VSELECT should use CombineSubOperand.
59244 if (!IsSplat && Subtarget.hasAVX512() &&
59245 (VT.is256BitVector() ||
59246 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59247 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
59248 EVT SelVT = Ops[0].getOperand(0).getValueType();
59249 if (SelVT.getVectorElementType() == MVT::i1) {
59250 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
59251 NumOps * SelVT.getVectorNumElements());
59252 if (TLI.isTypeLegal(SelVT))
59253 return DAG.getNode(
59254 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59255 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59256 }
59257 }
59258 [[fallthrough]];
59259 case X86ISD::BLENDV:
59260 // TODO: BLENDV should use CombineSubOperand.
59261 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
59262 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
59263 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
59264 EVT SelVT = Ops[0].getOperand(0).getValueType();
59265 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
59266 if (TLI.isTypeLegal(SelVT))
59267 return DAG.getNode(
59268 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59269 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59270 }
59271 break;
59272 }
59273 }
59274
59275 // Fold subvector loads into one.
59276 // If needed, look through bitcasts to get to the load.
59277 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
59278 unsigned Fast;
59279 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
59280 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
59281 *FirstLd->getMemOperand(), &Fast) &&
59282 Fast) {
59283 if (SDValue Ld =
59284 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
59285 return Ld;
59286 }
59287 }
59288
59289 // Attempt to fold target constant loads.
59290 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
59291 SmallVector<APInt> EltBits;
59292 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
59293 for (unsigned I = 0; I != NumOps; ++I) {
59294 APInt OpUndefElts;
59295 SmallVector<APInt> OpEltBits;
59296 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
59297 OpEltBits, /*AllowWholeUndefs*/ true,
59298 /*AllowPartialUndefs*/ false))
59299 break;
59300 EltBits.append(OpEltBits);
59301 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
59302 }
59303 if (EltBits.size() == VT.getVectorNumElements()) {
59304 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
59305 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
59306 SDValue CV = DAG.getConstantPool(C, PVT);
59309 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
59310 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
59312 return Ld;
59313 }
59314 }
59315
59316 // If this simple subvector or scalar/subvector broadcast_load is inserted
59317 // into both halves, use a larger broadcast_load. Update other uses to use
59318 // an extracted subvector.
59319 if (IsSplat &&
59320 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
59321 if (ISD::isNormalLoad(Op0.getNode()) ||
59324 auto *Mem = cast<MemSDNode>(Op0);
59325 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
59328 if (SDValue BcastLd =
59329 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
59330 SDValue BcastSrc =
59331 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
59332 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
59333 return BcastLd;
59334 }
59335 }
59336 }
59337
59338 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
59339 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
59340 Subtarget.useAVX512Regs()) {
59341 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
59342 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
59343 Res = DAG.getBitcast(ShuffleVT, Res);
59344 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
59345 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
59346 return DAG.getBitcast(VT, Res);
59347 }
59348
59349 // We can always convert per-lane vXf64 shuffles into VSHUFPD.
59350 if (!IsSplat &&
59351 ((NumOps == 2 && VT == MVT::v4f64) ||
59352 (NumOps == 4 && VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
59353 all_of(Ops, [](SDValue Op) { return Op.hasOneUse(); })) {
59354 // Collect the individual per-lane v2f64/v4f64 shuffles.
59355 MVT OpVT = Ops[0].getSimpleValueType();
59356 unsigned NumOpElts = OpVT.getVectorNumElements();
59359 if (all_of(seq<int>(NumOps), [&](int I) {
59360 return getTargetShuffleInputs(Ops[I], SrcOps[I], SrcMasks[I], DAG,
59361 Depth + 1) &&
59362 !is128BitLaneCrossingShuffleMask(OpVT, SrcMasks[I]) &&
59363 none_of(SrcMasks[I], isUndefOrZero) &&
59364 SrcMasks[I].size() == NumOpElts &&
59365 all_of(SrcOps[I], [&OpVT](SDValue V) {
59366 return V.getValueType() == OpVT;
59367 });
59368 })) {
59369 // Concatenate the shuffle masks into SHUFPD mask and collect subops.
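// The SHUFPD immediate holds one bit per f64 result element, selecting the
// low or high element of the corresponding 128-bit source lane.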
59370 bool Unary = true;
59371 unsigned SHUFPDMask = 0;
59373 for (unsigned I = 0; I != NumOps; ++I) {
59374 LHS[I] = SrcOps[I][SrcMasks[I][0] / NumOpElts];
59375 RHS[I] = SrcOps[I][SrcMasks[I][1] / NumOpElts];
59376 Unary &= LHS[I] == RHS[I];
59377 for (unsigned J = 0; J != NumOpElts; ++J)
59378 SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
59379 }
59380 // Concat SHUFPD LHS/RHS operands - if they match then it will become a
59381 // PERMILPD mask and we can always profitably concatenate them.
59382 SDValue Concat0 =
59383 combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
59384 SDValue Concat1 =
59385 combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
59386 if (Unary || Concat0 || Concat1) {
59387 Concat0 =
59388 Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
59389 Concat1 =
59390 Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
59391 return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
59392 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
59393 }
59394 }
59395 }
59396
59397 return SDValue();
59398}
59399
59400 static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
59401 TargetLowering::DAGCombinerInfo &DCI,
59402 const X86Subtarget &Subtarget) {
59403 EVT VT = N->getValueType(0);
59404 EVT SrcVT = N->getOperand(0).getValueType();
59405 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59407
59408 if (VT.getVectorElementType() == MVT::i1) {
59409 // Attempt to constant fold.
59410 unsigned SubSizeInBits = SrcVT.getSizeInBits();
59412 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
59414 if (!C) break;
59415 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
59416 if (I == (E - 1)) {
59417 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
59418 if (TLI.isTypeLegal(IntVT))
59419 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
59420 }
59421 }
59422
59423 // Don't do anything else for i1 vectors.
59424 return SDValue();
59425 }
59426
59427 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
59428 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
59429 Subtarget))
59430 return R;
59431 }
59432
59433 return SDValue();
59434}
59435
59436 static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59437 TargetLowering::DAGCombinerInfo &DCI,
59438 const X86Subtarget &Subtarget) {
59439 if (DCI.isBeforeLegalizeOps())
59440 return SDValue();
59441
59442 MVT OpVT = N->getSimpleValueType(0);
59443
59444 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
59445
59446 SDLoc dl(N);
59447 SDValue Vec = N->getOperand(0);
59448 SDValue SubVec = N->getOperand(1);
59449
59450 uint64_t IdxVal = N->getConstantOperandVal(2);
59451 MVT SubVecVT = SubVec.getSimpleValueType();
59452 int VecNumElts = OpVT.getVectorNumElements();
59453 int SubVecNumElts = SubVecVT.getVectorNumElements();
59454
59455 if (Vec.isUndef() && SubVec.isUndef())
59456 return DAG.getUNDEF(OpVT);
59457
59458 // Inserting undefs/zeros into zeros/undefs is a zero vector.
59459 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
59460 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
59461 return getZeroVector(OpVT, Subtarget, DAG, dl);
59462
59464 // If we're inserting into a zero vector and then into a larger zero vector,
59465 // just insert into the larger zero vector directly.
59466 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59468 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
59469 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59470 getZeroVector(OpVT, Subtarget, DAG, dl),
59471 SubVec.getOperand(1),
59472 DAG.getVectorIdxConstant(IdxVal + Idx2Val, dl));
59473 }
59474
59475 // If we're inserting into a zero vector and our input was extracted from an
59476 // insert into a zero vector of the same type and the extraction was at
59477 // least as large as the original insertion, just insert the original
59478 // subvector into a zero vector.
59479 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
59480 isNullConstant(SubVec.getOperand(1)) &&
59482 SDValue Ins = SubVec.getOperand(0);
59483 if (isNullConstant(Ins.getOperand(2)) &&
59484 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
59485 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
59486 SubVecVT.getFixedSizeInBits())
59487 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59488 getZeroVector(OpVT, Subtarget, DAG, dl),
59489 Ins.getOperand(1), N->getOperand(2));
59490 }
59491 }
59492
59493 // Stop here if this is an i1 vector.
59494 if (IsI1Vector)
59495 return SDValue();
59496
59497 // Eliminate an intermediate vector widening:
59498 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
59499 // insert_subvector X, Y, Idx
59500 // TODO: This is a more general version of a DAGCombiner fold, can we move it
59501 // there?
59502 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59503 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
59504 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
59505 SubVec.getOperand(1), N->getOperand(2));
59506
59507 // If this is an insert of an extract, combine to a shuffle. Don't do this
59508 // if the insert or extract can be represented with a subregister operation.
59509 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59510 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
59511 (IdxVal != 0 ||
59512 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
59513 SDValue ExtSrc = SubVec.getOperand(0);
59514 int ExtIdxVal = SubVec.getConstantOperandVal(1);
59515 // Create a shuffle mask matching the extraction and insertion.
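// For example, inserting a 4-element subvector extracted at index 4 of ExtSrc
// into index 0 of an 8-element Vec gives the mask <12,13,14,15,4,5,6,7>.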
59516 SmallVector<int, 64> Mask(VecNumElts);
59517 std::iota(Mask.begin(), Mask.end(), 0);
59518 std::iota(Mask.begin() + IdxVal, Mask.begin() + IdxVal + SubVecNumElts,
59519 ExtIdxVal + VecNumElts);
59520 if (ExtIdxVal != 0)
59521 return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask);
59522 // See if we can use a blend instead of extract/insert pair.
59523 SmallVector<int, 64> BlendMask(VecNumElts);
59524 std::iota(BlendMask.begin(), BlendMask.end(), 0);
59525 std::iota(BlendMask.begin() + IdxVal,
59526 BlendMask.begin() + IdxVal + SubVecNumElts, VecNumElts + IdxVal);
59527 if (isShuffleEquivalent(Mask, BlendMask, Vec, ExtSrc) &&
59528 VecNumElts == (2 * SubVecNumElts)) {
59529 assert((IdxVal % SubVecNumElts) == 0 && "Unaligned subvector insertion");
59530 if (OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
59531 SDValue Blend = DAG.getNode(
59532 X86ISD::BLENDI, dl, MVT::v8f32, DAG.getBitcast(MVT::v8f32, Vec),
59533 DAG.getBitcast(MVT::v8f32, ExtSrc),
59534 DAG.getTargetConstant(IdxVal == 0 ? 0x0F : 0xF0, dl, MVT::i8));
59535 return DAG.getBitcast(OpVT, Blend);
59536 } else if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) {
59537 MVT ShufVT = OpVT.isInteger() ? MVT::v8i64 : MVT::v8f64;
59538 SDValue Lo = DAG.getBitcast(ShufVT, IdxVal == 0 ? ExtSrc : Vec);
59539 SDValue Hi = DAG.getBitcast(ShufVT, IdxVal == 0 ? Vec : ExtSrc);
59540 SDValue Shuffle =
59541 DAG.getNode(X86ISD::SHUF128, dl, ShufVT, Lo, Hi,
59542 getV4X86ShuffleImm8ForMask({0, 1, 2, 3}, dl, DAG));
59543 return DAG.getBitcast(OpVT, Shuffle);
59544 }
59545 }
59546 }
59547
59548 // Match concat_vector style patterns.
59549 SmallVector<SDValue, 2> SubVectorOps;
59550 if (collectConcatOps(N, SubVectorOps, DAG)) {
59551 if (SDValue Fold =
59552 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, Subtarget))
59553 return Fold;
59554
59555 // If we're inserting all zeros into the upper half, change this to
59556 // a concat with zero. We will match this to a move
59557 // with implicit upper bit zeroing during isel.
59558 // We do this here because we don't want combineConcatVectorOps to
59559 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
59560 if (SubVectorOps.size() == 2 &&
59561 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
59562 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59563 getZeroVector(OpVT, Subtarget, DAG, dl),
59564 SubVectorOps[0], DAG.getVectorIdxConstant(0, dl));
59565
59566 // Attempt to recursively combine to a shuffle.
59567 if (all_of(SubVectorOps, [](SDValue SubOp) {
59569 })) {
59570 SDValue Op(N, 0);
59571 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59572 return Res;
59573 }
59574 }
59575
59576 // If this is a broadcast insert into an upper undef, use a larger broadcast.
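// e.g. (v8f32 insert_subvector undef, (v4f32 vbroadcast %s), 4)
//        --> (v8f32 vbroadcast %s)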
59577 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
59578 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
59579
59580 // If this is a broadcast load inserted into an upper undef, use a larger
59581 // broadcast load.
59582 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
59583 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
59584 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
59586 MemIntr->getMemoryVT(), MemIntr, 0, DAG);
59587 }
59588
59589 // If we're splatting the lower half subvector of a full vector load into the
59590 // upper half, attempt to create a subvector broadcast.
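// Both halves of the result would then be the same chunk of memory, so the whole
// node can become a single subvector broadcast load.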
59591 if ((int)IdxVal == (VecNumElts / 2) &&
59592 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
59593 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
59594 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
59595 if (VecLd && SubLd &&
59597 SubLd, VecLd, SubVec.getValueSizeInBits() / 8, 0)) {
59599 SubVecVT, SubLd, 0, DAG);
59600 SDValue NewSubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT,
59601 BcastLd, DAG.getVectorIdxConstant(0, dl));
59602 DCI.CombineTo(SubLd, NewSubVec, BcastLd.getValue(1));
59603 return BcastLd;
59604 }
59605 }
59606
59607 // Attempt to constant fold (if we're not widening).
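// If both Vec and SubVec have constant (or undef) elements, splice SubVec's bits
// and undef mask into Vec at IdxVal and emit a single constant vector.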
59608 if (!Vec.isUndef() && !ISD::isBuildVectorAllZeros(Vec.getNode())) {
59609 unsigned EltSizeInBits = OpVT.getScalarSizeInBits();
59610 APInt VecUndefElts, SubUndefElts;
59611 SmallVector<APInt, 16> VecEltBits, SubEltBits;
59612 if (getTargetConstantBitsFromNode(Vec, EltSizeInBits, VecUndefElts,
59613 VecEltBits) &&
59614 getTargetConstantBitsFromNode(SubVec, EltSizeInBits, SubUndefElts,
59615 SubEltBits)) {
59616 VecUndefElts.insertBits(SubUndefElts, IdxVal);
59617 llvm::copy(SubEltBits, VecEltBits.begin() + IdxVal);
59618 return getConstVector(VecEltBits, VecUndefElts, OpVT, DAG, dl);
59619 }
59620 }
59621
59622 // Attempt to recursively combine to a shuffle.
59625 SDValue Op(N, 0);
59626 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59627 return Res;
59628 }
59629
59630 // Match insertion of subvector load that perfectly aliases a base load.
59631 if ((IdxVal % SubVecNumElts) == 0 && ISD::isNormalLoad(Vec.getNode()) &&
59632 ISD::isNormalLoad(SubVec.getNode()) &&
59634 cast<LoadSDNode>(SubVec), cast<LoadSDNode>(Vec),
59635 SubVec.getValueSizeInBits() / 8, IdxVal / SubVecNumElts))
59636 return Vec;
59637
59638 return SDValue();
59639}
59640
59641/// If we are extracting a subvector of a vector select and the select condition
59642/// is composed of concatenated vectors, try to narrow the select width. This
59643/// is a common pattern for AVX1 integer code because 256-bit selects may be
59644/// legal, but there is almost no integer math/logic available for 256-bit.
59645/// This function should only be called with legal types (otherwise, the calls
59646/// to get simple value types will assert).
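/// For example, a 128-bit extract of (v8i32 vselect C, T, F) becomes a v4i32
/// vselect of the matching 128-bit pieces of C, T and F.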
59648 SelectionDAG &DAG) {
59649 SDValue Sel = Ext->getOperand(0);
59650 if (Sel.getOpcode() != ISD::VSELECT ||
59651 !isFreeToSplitVector(Sel.getOperand(0), DAG))
59652 return SDValue();
59653
59654 // Note: We assume simple value types because this should only be called with
59655 // legal operations/types.
59656 // TODO: This can be extended to handle extraction to 256-bits.
59657 MVT VT = Ext->getSimpleValueType(0);
59658 if (!VT.is128BitVector())
59659 return SDValue();
59660
59661 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
59662 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
59663 return SDValue();
59664
59665 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
59666 MVT SelVT = Sel.getSimpleValueType();
59667 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
59668 "Unexpected vector type with legal operations");
59669
59670 unsigned SelElts = SelVT.getVectorNumElements();
59671 unsigned CastedElts = WideVT.getVectorNumElements();
59672 unsigned ExtIdx = Ext->getConstantOperandVal(1);
59673 if (SelElts % CastedElts == 0) {
59674 // The select has the same or more (narrower) elements than the extract
59675 // operand. The extraction index gets scaled by that factor.
59676 ExtIdx *= (SelElts / CastedElts);
59677 } else if (CastedElts % SelElts == 0) {
59678 // The select has fewer (wider) elements than the extract operand. Make sure
59679 // that the extraction index can be divided evenly.
59680 unsigned IndexDivisor = CastedElts / SelElts;
59681 if (ExtIdx % IndexDivisor != 0)
59682 return SDValue();
59683 ExtIdx /= IndexDivisor;
59684 } else {
59685 llvm_unreachable("Element count of simple vector types are not divisible?");
59686 }
59687
59688 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
59689 unsigned NarrowElts = SelElts / NarrowingFactor;
59690 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
59691 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
59692 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
59693 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
59694 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
59695 return DAG.getBitcast(VT, NarrowSel);
59696}
59697
59700 const X86Subtarget &Subtarget) {
59701 if (!N->getValueType(0).isSimple())
59702 return SDValue();
59703
59704 MVT VT = N->getSimpleValueType(0);
59705 SDValue InVec = N->getOperand(0);
59706 unsigned IdxVal = N->getConstantOperandVal(1);
59707 EVT InVecVT = InVec.getValueType();
59708 unsigned SizeInBits = VT.getSizeInBits();
59709 unsigned InSizeInBits = InVecVT.getSizeInBits();
59710 unsigned NumSubElts = VT.getVectorNumElements();
59711 unsigned NumInElts = InVecVT.getVectorNumElements();
59712 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59713 SDLoc DL(N);
59714
59715 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
59716 // eventually get combined/lowered into ANDNP) with a concatenated operand,
59717 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
59718 // We let generic combining take over from there to simplify the
59719 // insert/extract and 'not'.
59720 // This pattern emerges during AVX1 legalization. We handle it before lowering
59721 // to avoid complications like splitting constant vector loads.
59722 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) &&
59723 InSizeInBits == 256 && InVec.getOpcode() == ISD::AND) {
59724 auto isConcatenatedNot = [](SDValue V) {
59725 V = peekThroughBitcasts(V);
59726 if (!isBitwiseNot(V))
59727 return false;
59728 SDValue NotOp = V->getOperand(0);
59730 };
59731 if (isConcatenatedNot(InVec.getOperand(0)) ||
59732 isConcatenatedNot(InVec.getOperand(1))) {
59733 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
59734 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
59735 splitVectorIntBinary(InVec, DAG, DL),
59736 N->getOperand(1));
59737 }
59738 }
59739
59740 if (DCI.isBeforeLegalizeOps())
59741 return SDValue();
59742
59743 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
59744 return V;
59745
59747 return getZeroVector(VT, Subtarget, DAG, DL);
59748
59749 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
59750 if (VT.getScalarType() == MVT::i1)
59751 return DAG.getConstant(1, DL, VT);
59752 return getOnesVector(VT, DAG, DL);
59753 }
59754
59755 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
59756 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
59757
59758 // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1),C2) --> EXTRACT_SUBVECTOR(V,C1+C2)
59759 if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59760 InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
59761 TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
59762 unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
59763 return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
59764 }
59765
59766 // EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2)
59767 // --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
59768 // iff SUB is entirely contained in the extraction.
59769 if (VT.getVectorElementType() != MVT::i1 && TLI.isTypeLegal(VT) &&
59770 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse()) {
59771 SDValue Src = InVec.getOperand(0);
59772 SDValue Sub = InVec.getOperand(1);
59773 EVT SubVT = Sub.getValueType();
59774 uint64_t InsIdx = InVec.getConstantOperandVal(2);
59775 if (IdxVal <= InsIdx &&
59776 (IdxVal + NumSubElts) >= (InsIdx + SubVT.getVectorNumElements())) {
59777 SDValue NewSrc = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
59778 DAG.getVectorIdxConstant(IdxVal, DL));
59779 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewSrc, Sub,
59780 DAG.getVectorIdxConstant(InsIdx - IdxVal, DL));
59781 }
59782 }
59783
59784 // If we're extracting an upper subvector, see if we'd get the same elements if
59785 // we extracted the lowest subvector instead, which should allow
59786 // SimplifyDemandedVectorElts to do more simplifications.
59787 if (IdxVal != 0) {
59788 bool AllEquiv = all_of(seq<unsigned>(NumSubElts), [&](unsigned I) {
59789 return IsElementEquivalent(NumInElts, InVec, InVec, I, I + IdxVal);
59790 });
59791 if (AllEquiv)
59792 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
59793 }
59794
59795 // Check if we're extracting a whole broadcasted subvector.
59796 if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59797 auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
59798 EVT MemVT = MemIntr->getMemoryVT();
59799 if (MemVT == VT) {
59800 // If this is the only use, we can replace with a regular load (this may
59801 // have been missed by SimplifyDemandedVectorElts due to extra uses of the
59802 // memory chain).
59803 if (InVec.hasOneUse()) {
59804 SDValue Ld =
59805 DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(),
59806 MemIntr->getMemOperand());
59807 DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1));
59808 return Ld;
59809 }
59810 }
59811 }
59812
59813 // Attempt to extract from the source of a shuffle vector.
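// The shuffle mask is scaled so each mask element covers a whole subvector; if
// the chunk we want is undef or zero we can materialize that directly, otherwise
// extract the matching subvector from the shuffle's source input.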
59814 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
59815 SmallVector<int, 32> ShuffleMask;
59816 SmallVector<int, 32> ScaledMask;
59817 SmallVector<SDValue, 2> ShuffleInputs;
59818 unsigned NumSubVecs = InSizeInBits / SizeInBits;
59819 // Decode the shuffle mask and scale it so it's shuffling subvectors.
59820 if (getTargetShuffleInputs(InVec, ShuffleInputs, ShuffleMask, DAG) &&
59821 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
59822 unsigned SubVecIdx = IdxVal / NumSubElts;
59823 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
59824 return DAG.getUNDEF(VT);
59825 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
59826 return getZeroVector(VT, Subtarget, DAG, DL);
59827 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
59828 if (Src.getValueSizeInBits() == InSizeInBits) {
59829 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
59830 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
59831 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
59832 DL, SizeInBits);
59833 }
59834 }
59835 }
59836
59837 auto IsExtractFree = [](SDValue V) {
59838 if (V.hasOneUse()) {
59840 if (V.getOpcode() == ISD::LOAD)
59841 return true;
59842 }
59843 V = peekThroughBitcasts(V);
59844 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
59845 return true;
59847 return true;
59848 return V.isUndef();
59849 };
59850
59851 // If we're extracting the lowest subvector and we're the only user,
59852 // we may be able to perform this with a smaller vector width.
59853 unsigned InOpcode = InVec.getOpcode();
59854 if (InVec.hasOneUse()) {
59855 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
59856 // v2f64 CVTDQ2PD(v4i32).
59857 if (InOpcode == ISD::SINT_TO_FP &&
59858 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59859 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
59860 }
59861 // v2f64 CVTUDQ2PD(v4i32).
59862 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
59863 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59864 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
59865 }
59866 // v2f64 CVTPS2PD(v4f32).
59867 if (InOpcode == ISD::FP_EXTEND &&
59868 InVec.getOperand(0).getValueType() == MVT::v4f32) {
59869 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
59870 }
59871 }
59872 // v4i32 CVTPS2DQ(v4f32) / CVTPS2UDQ(v4f32).
59873 // v4f32 CVTDQ2PS(v4i32) / CVTUDQ2PS(v4i32).
59874 if ((InOpcode == ISD::FP_TO_SINT || InOpcode == ISD::SINT_TO_FP ||
59875 ((InOpcode == ISD::FP_TO_UINT || InOpcode == ISD::UINT_TO_FP) &&
59876 Subtarget.hasVLX())) &&
59877 (VT == MVT::v4i32 || VT == MVT::v4f32)) {
59878 SDValue Src = InVec.getOperand(0);
59879 if (Src.getValueType().getScalarSizeInBits() == 32)
59880 return DAG.getNode(InOpcode, DL, VT,
59881 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
59882 }
59883 if (IdxVal == 0 &&
59884 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
59885 (SizeInBits == 128 || SizeInBits == 256) &&
59886 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
59887 SDValue Ext = InVec.getOperand(0);
59888 if (Ext.getValueSizeInBits() > SizeInBits)
59889 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
59890 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
59891 return DAG.getNode(ExtOp, DL, VT, Ext);
59892 }
59893 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
59894 InVec.getOperand(0).getValueType().is256BitVector() &&
59895 InVec.getOperand(1).getValueType().is256BitVector() &&
59896 InVec.getOperand(2).getValueType().is256BitVector()) {
59897 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
59898 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
59899 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
59900 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
59901 }
59902 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
59903 (SizeInBits == 128 || SizeInBits == 256)) {
59904 SDValue InVecSrc = InVec.getOperand(0);
59905 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
59906 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
59907 return DAG.getNode(InOpcode, DL, VT, Ext);
59908 }
59909
59910 if (SizeInBits == 128 || SizeInBits == 256) {
59911 switch (InOpcode) {
59912 case X86ISD::MOVDDUP:
59913 return DAG.getNode(
59914 InOpcode, DL, VT,
59915 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits));
59916 case X86ISD::PSHUFD:
59917 case X86ISD::VPERMILPI:
59918 if (InVec.getOperand(0).hasOneUse()) {
59919 uint64_t M = InVec.getConstantOperandVal(1) & 255;
59920 M = VT.getScalarSizeInBits() < 64 ? M : (M >> IdxVal);
59921 return DAG.getNode(InOpcode, DL, VT,
59922 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59923 DL, SizeInBits),
59924 DAG.getTargetConstant(M, DL, MVT::i8));
59925 }
59926 break;
59927 case X86ISD::PCMPEQ:
59928 case X86ISD::PCMPGT:
59929 case X86ISD::UNPCKH:
59930 case X86ISD::UNPCKL:
59931 if (IsExtractFree(InVec.getOperand(0)) ||
59932 IsExtractFree(InVec.getOperand(1)))
59933 return DAG.getNode(InOpcode, DL, VT,
59934 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59935 DL, SizeInBits),
59936 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59937 DL, SizeInBits));
59938 break;
59939 case X86ISD::CMPP:
59940 if (IsExtractFree(InVec.getOperand(0)) ||
59941 IsExtractFree(InVec.getOperand(1)))
59942 return DAG.getNode(InOpcode, DL, VT,
59943 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59944 DL, SizeInBits),
59945 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59946 DL, SizeInBits),
59947 InVec.getOperand(2));
59948 break;
59949 case X86ISD::BLENDI:
59950 if (IsExtractFree(InVec.getOperand(0)) ||
59951 IsExtractFree(InVec.getOperand(1))) {
59952 uint64_t M = InVec.getConstantOperandVal(2) & 255;
59953 M = VT.getScalarType() == MVT::i16 ? M : (M >> IdxVal);
59954 return DAG.getNode(InOpcode, DL, VT,
59955 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59956 DL, SizeInBits),
59957 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59958 DL, SizeInBits),
59959 DAG.getTargetConstant(M, DL, MVT::i8));
59960 }
59961 break;
59962 case X86ISD::VPERMV:
59963 if (IdxVal != 0) {
59964 SDValue Mask = InVec.getOperand(0);
59965 SDValue Src = InVec.getOperand(1);
59966 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59967 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59968 DL, InSizeInBits);
59969 SDValue Shuffle = DAG.getNode(InOpcode, DL, InVecVT, Mask, Src);
59970 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59971 }
59972 break;
59973 case X86ISD::VPERMV3:
59974 if (IdxVal != 0) {
59975 SDValue Src0 = InVec.getOperand(0);
59976 SDValue Mask = InVec.getOperand(1);
59977 SDValue Src1 = InVec.getOperand(2);
59978 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59979 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59980 DL, InSizeInBits);
59981 SDValue Shuffle =
59982 DAG.getNode(InOpcode, DL, InVecVT, Src0, Mask, Src1);
59983 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59984 }
59985 break;
59986 }
59987 }
59988 }
59989
59990 // Always split vXi64 logical shifts where we're extracting the upper 32 bits,
59991 // as this is very likely to fold into a shuffle/truncation.
59992 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
59993 InVecVT.getScalarSizeInBits() == 64 &&
59994 InVec.getConstantOperandAPInt(1) == 32) {
59995 SDValue Ext =
59996 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
59997 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
59998 }
59999
60000 return SDValue();
60001}
60002
60004 const X86Subtarget &Subtarget) {
60005 using namespace SDPatternMatch;
60006 EVT VT = N->getValueType(0);
60007 SDValue Src = N->getOperand(0);
60008 SDLoc DL(N);
60009
60010 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
60011 // This occurs frequently in our masked scalar intrinsic code and our
60012 // floating point select lowering with AVX512.
60013 // TODO: SimplifyDemandedBits instead?
60014 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
60015 isOneConstant(Src.getOperand(1)))
60016 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
60017
60018 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
60019 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60020 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
60021 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
60022 isNullConstant(Src.getOperand(1)))
60023 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
60024 Src.getOperand(1));
60025
60026 // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
60027 // TODO: Move to DAGCombine/SimplifyDemandedBits?
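// e.g. (v2i64 scalar_to_vector (i64 zero_extend X)) can be rebuilt as a v4i32
// scalar_to_vector of the truncated value wrapped in VZEXT_MOVL, which typically
// selects to a 32-bit MOVD instead of a 64-bit MOVQ.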
60028 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
60029 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
60030 if (Op.getValueType() != MVT::i64)
60031 return SDValue();
60032 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
60033 if (Op.getOpcode() == Opc &&
60034 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
60035 return Op.getOperand(0);
60036 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
60037 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
60038 if (Ld->getExtensionType() == Ext &&
60039 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
60040 return Op;
60041 if (IsZeroExt) {
60042 KnownBits Known = DAG.computeKnownBits(Op);
60043 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
60044 return Op;
60045 }
60046 return SDValue();
60047 };
60048
60049 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
60050 return DAG.getBitcast(
60051 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60052 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
60053
60054 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
60055 return DAG.getBitcast(
60056 VT,
60057 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
60058 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60059 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
60060 }
60061
60062 if (Src.getOpcode() == ISD::BITCAST) {
60063 SDValue SrcOp = Src.getOperand(0);
60064 // Combine (v4i32 (scalar_to_vector (i32 (bitcast (float))))) to MOVD.
60065 if (VT == MVT::v4i32 && SrcOp.getValueType() == MVT::f32)
60066 return DAG.getBitcast(
60067 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, SrcOp));
60068 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (double))))) to MOVQ.
60069 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::f64)
60070 return DAG.getBitcast(
60071 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, SrcOp));
60072 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (mmx))))) to MOVQ2DQ.
60073 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::x86mmx)
60074 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp);
60075 }
60076
60077 if (VT == MVT::v4i32) {
60078 SDValue HalfSrc;
60079 // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16))))))
60080 // to remove XMM->GPR->XMM moves.
60081 if (sd_match(Src, m_AnyExt(m_BitCast(
60082 m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc))))))
60083 return DAG.getBitcast(
60084 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc));
60085 }
60086
60087 // See if we're broadcasting the scalar value, in which case just reuse that.
60088 // Ensure the same SDValue from the SDNode use is being used.
60089 if (VT.getScalarType() == Src.getValueType())
60090 for (SDNode *User : Src->users())
60091 if (User->getOpcode() == X86ISD::VBROADCAST &&
60092 Src == User->getOperand(0)) {
60093 unsigned SizeInBits = VT.getFixedSizeInBits();
60094 unsigned BroadcastSizeInBits =
60095 User->getValueSizeInBits(0).getFixedValue();
60096 if (BroadcastSizeInBits == SizeInBits)
60097 return SDValue(User, 0);
60098 if (BroadcastSizeInBits > SizeInBits)
60099 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
60100 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
60101 // coverage.
60102 }
60103
60104 // Check for cases where we've ended up with a scalarized shift, typically
60105 // during type legalization.
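// e.g. (v2i64 scalar_to_vector (i64 srl X, 7)) can instead be built as an
// X86ISD::VSRLI of (scalar_to_vector X), keeping the shift in the vector domain
// when an immediate vector shift is available.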
60106 switch (Src.getOpcode()) {
60107 case ISD::SHL:
60108 case ISD::SRL:
60109 case ISD::SRA:
60110 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
60111 if (supportedVectorShiftWithImm(VT, Subtarget, Src.getOpcode()) &&
60112 Src.hasOneUse()) {
60113 SDValue SrcVec =
60114 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60115 unsigned Opc = getTargetVShiftUniformOpcode(Src.getOpcode(), false);
60116 return getTargetVShiftByConstNode(Opc, DL, VT.getSimpleVT(), SrcVec,
60117 Amt->getZExtValue(), DAG);
60118 }
60119 }
60120 break;
60121 case ISD::FSHL:
60122 case ISD::FSHR:
60123 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(2))) {
60124 if (supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL) &&
60125 Src.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60126 Src.getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60127 Src.hasOneUse()) {
60128 uint64_t AmtVal =
60129 Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits());
60130 SDValue SrcVec0 =
60131 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60132 SDValue SrcVec1 =
60133 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(1));
60134 return DAG.getNode(Src.getOpcode(), DL, VT, SrcVec0, SrcVec1,
60135 DAG.getConstant(AmtVal, DL, VT));
60136 }
60137 }
60138 break;
60139 }
60140
60141 return SDValue();
60142}
60143
60144// Simplify PMULDQ and PMULUDQ operations.
60147 const X86Subtarget &Subtarget) {
60148 SDValue LHS = N->getOperand(0);
60149 SDValue RHS = N->getOperand(1);
60150
60151 // Canonicalize constant to RHS.
60154 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
60155
60156 // Multiply by zero.
60157 // Don't return RHS as it may contain UNDEFs.
60158 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
60159 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
60160
60161 // PMULDQ/PMULUDQ only use the lower 32 bits from each vector element.
60162 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60163 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
60164 return SDValue(N, 0);
60165
60166 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
60167 // convert it to any_extend_invec, due to the LegalOperations check, do the
60168 // conversion directly to a vector shuffle manually. This exposes combine
60169 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
60170 // combineX86ShufflesRecursively on SSE4.1 targets.
60171 // FIXME: This is basically a hack around several other issues related to
60172 // ANY_EXTEND_VECTOR_INREG.
60173 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
60174 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60175 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60176 LHS.getOperand(0).getValueType() == MVT::v4i32) {
60177 SDLoc dl(N);
60178 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
60179 LHS.getOperand(0), { 0, -1, 1, -1 });
60180 LHS = DAG.getBitcast(MVT::v2i64, LHS);
60181 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60182 }
60183 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
60184 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60185 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60186 RHS.getOperand(0).getValueType() == MVT::v4i32) {
60187 SDLoc dl(N);
60188 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
60189 RHS.getOperand(0), { 0, -1, 1, -1 });
60190 RHS = DAG.getBitcast(MVT::v2i64, RHS);
60191 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60192 }
60193
60194 return SDValue();
60195}
60196
60197// Simplify VPMADDUBSW/VPMADDWD operations.
60200 MVT VT = N->getSimpleValueType(0);
60201 SDValue LHS = N->getOperand(0);
60202 SDValue RHS = N->getOperand(1);
60203 unsigned Opc = N->getOpcode();
60204 bool IsPMADDWD = Opc == X86ISD::VPMADDWD;
60206 "Unexpected PMADD opcode");
60207
60208 // Multiply by zero.
60209 // Don't return LHS/RHS as it may contain UNDEFs.
60210 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
60212 return DAG.getConstant(0, SDLoc(N), VT);
60213
60214 // Constant folding.
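// Each result element folds one pair of adjacent source elements: VPMADDWD
// sign-extends both inputs and adds the two products; VPMADDUBSW zero-extends
// the LHS, sign-extends the RHS and adds the products with signed saturation.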
60215 APInt LHSUndefs, RHSUndefs;
60216 SmallVector<APInt> LHSBits, RHSBits;
60217 unsigned SrcEltBits = LHS.getScalarValueSizeInBits();
60218 unsigned DstEltBits = VT.getScalarSizeInBits();
60219 if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) &&
60220 getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) {
60221 SmallVector<APInt> Result;
60222 for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) {
60223 APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1];
60224 APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1];
60225 LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits);
60226 LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits);
60227 APInt Lo = LHSLo * RHSLo.sext(DstEltBits);
60228 APInt Hi = LHSHi * RHSHi.sext(DstEltBits);
60229 APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi);
60230 Result.push_back(Res);
60231 }
60232 return getConstVector(Result, VT, DAG, SDLoc(N));
60233 }
60234
60235 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60236 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60237 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60238 return SDValue(N, 0);
60239
60240 return SDValue();
60241}
60242
60243// Simplify VPMADD52L/VPMADD52H operations.
60246 MVT VT = N->getSimpleValueType(0);
60247
60248 bool AddLow = N->getOpcode() == X86ISD::VPMADD52L;
60249 SDValue Op0 = N->getOperand(0);
60250 SDValue Op1 = N->getOperand(1);
60251 SDValue Op2 = N->getOperand(2);
60252 SDLoc DL(N);
60253
60254 APInt C0, C1;
60255 bool HasC0 = X86::isConstantSplat(Op0, C0),
60256 HasC1 = X86::isConstantSplat(Op1, C1);
60257
60258 // lo/hi(C * X) + Z --> lo/hi(X * C) + Z
60259 if (HasC0 && !HasC1)
60260 return DAG.getNode(N->getOpcode(), DL, VT, Op1, Op0, Op2);
60261
60262 // lo(X * 1) + Z --> lo(X) + Z iff X == lo(X)
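// The leading-zero check below ensures X fits in 52 bits, so the low 52 bits of
// X * 1 are exactly X and a plain ADD suffices.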
60263 if (AddLow && HasC1 && C1.trunc(52).isOne()) {
60264 KnownBits KnownOp0 = DAG.computeKnownBits(Op0);
60265 if (KnownOp0.countMinLeadingZeros() >= 12)
60266 return DAG.getNode(ISD::ADD, DL, VT, Op0, Op2);
60267 }
60268
60269 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60270 unsigned NumEltBits = VT.getScalarSizeInBits();
60271 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
60272 DCI))
60273 return SDValue(N, 0);
60274
60275 return SDValue();
60276}
60277
60280 const X86Subtarget &Subtarget) {
60281 EVT VT = N->getValueType(0);
60282 SDValue In = N->getOperand(0);
60283 unsigned Opcode = N->getOpcode();
60284 unsigned InOpcode = In.getOpcode();
60285 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60286 SDLoc DL(N);
60287
60288 // Try to merge vector loads and extend_inreg to an extload.
60289 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
60290 In.hasOneUse()) {
60291 auto *Ld = cast<LoadSDNode>(In);
60292 if (Ld->isSimple()) {
60293 MVT SVT = In.getSimpleValueType().getVectorElementType();
60296 : ISD::ZEXTLOAD;
60297 EVT MemVT = VT.changeVectorElementType(SVT);
60298 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
60299 SDValue Load = DAG.getExtLoad(
60300 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
60301 MemVT, Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
60302 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
60303 return Load;
60304 }
60305 }
60306 }
60307
60308 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
60309 if (Opcode == InOpcode)
60310 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
60311
60312 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
60313 // -> EXTEND_VECTOR_INREG(X).
60314 // TODO: Handle non-zero subvector indices.
60315 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
60316 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
60317 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
60318 In.getValueSizeInBits())
60319 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
60320
60321 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
60322 // TODO: Move to DAGCombine?
60323 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
60324 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
60325 In.getValueSizeInBits() == VT.getSizeInBits()) {
60326 unsigned NumElts = VT.getVectorNumElements();
60327 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
60328 EVT EltVT = In.getOperand(0).getValueType();
60329 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
60330 for (unsigned I = 0; I != NumElts; ++I)
60331 Elts[I * Scale] = In.getOperand(I);
60332 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
60333 }
60334
60335 // Attempt to combine as a shuffle on SSE41+ targets.
60336 if (Subtarget.hasSSE41()) {
60337 SDValue Op(N, 0);
60338 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
60339 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
60340 return Res;
60341 }
60342
60343 return SDValue();
60344}
60345
60348 EVT VT = N->getValueType(0);
60349 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60350 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
60351 return DAG.getConstant(0, SDLoc(N), VT);
60352
60353 // Fold kshiftr(extract_subvector(X,C1),C2)
60354 // --> extract_subvector(kshiftr(X,C1+C2),0)
60355 // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
60356 if (N->getOpcode() == X86ISD::KSHIFTR) {
60357 SDLoc DL(N);
60358 if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
60359 N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
60360 SDValue Src = N->getOperand(0).getOperand(0);
60361 uint64_t Amt = N->getConstantOperandVal(1) +
60362 N->getOperand(0).getConstantOperandVal(1);
60363 EVT SrcVT = Src.getValueType();
60364 if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
60365 SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
60366 DAG.getTargetConstant(Amt, DL, MVT::i8));
60367 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
60368 DAG.getVectorIdxConstant(0, DL));
60369 }
60370 }
60371 }
60372
60373 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60374 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60375 return SDValue(N, 0);
60376
60377 return SDValue();
60378}
60379
60380// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
60381 // Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produces
60382 // extra instructions between the conversions due to going to scalar and back.
60384 const X86Subtarget &Subtarget) {
60385 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
60386 return SDValue();
60387
60388 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
60389 return SDValue();
60390
60391 if (N->getValueType(0) != MVT::f32 ||
60392 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
60393 return SDValue();
60394
60395 SDLoc dl(N);
60396 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
60397 N->getOperand(0).getOperand(0));
60398 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
60399 DAG.getTargetConstant(4, dl, MVT::i32));
60400 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
60401 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
60402 DAG.getVectorIdxConstant(0, dl));
60403}
60404
60407 const X86Subtarget &Subtarget) {
60408 EVT VT = N->getValueType(0);
60409 bool IsStrict = N->isStrictFPOpcode();
60410 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60411 EVT SrcVT = Src.getValueType();
60412
60413 SDLoc dl(N);
60414 if (SrcVT.getScalarType() == MVT::bf16) {
60415 if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
60416 !IsStrict && Src.getOperand(0).getValueType() == VT)
60417 return Src.getOperand(0);
60418
60419 if (!SrcVT.isVector())
60420 return SDValue();
60421
60422 assert(!IsStrict && "Strict FP doesn't support BF16");
60423 if (VT.getVectorElementType() == MVT::f64) {
60424 EVT TmpVT = VT.changeVectorElementType(MVT::f32);
60425 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
60426 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
60427 }
60428 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
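// bf16 is the upper half of an f32, so extending is just a 16-bit left shift of
// the zero-extended bit pattern in the integer domain.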
60429 EVT NVT = SrcVT.changeVectorElementType(MVT::i32);
60430 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
60431 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
60432 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
60433 return DAG.getBitcast(VT, Src);
60434 }
60435
60436 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60437 return SDValue();
60438
60439 if (Subtarget.hasFP16())
60440 return SDValue();
60441
60442 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
60443 return SDValue();
60444
60445 if (VT.getVectorElementType() != MVT::f32 &&
60446 VT.getVectorElementType() != MVT::f64)
60447 return SDValue();
60448
60449 unsigned NumElts = VT.getVectorNumElements();
60450 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60451 return SDValue();
60452
60453 // Convert the input to vXi16.
60454 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
60455 Src = DAG.getBitcast(IntVT, Src);
60456
60457 // Widen to at least 8 input elements.
60458 if (NumElts < 8) {
60459 unsigned NumConcats = 8 / NumElts;
60460 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
60461 : DAG.getConstant(0, dl, IntVT);
60462 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
60463 Ops[0] = Src;
60464 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
60465 }
60466
60467 // Destination is vXf32 with at least 4 elements.
60468 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
60469 std::max(4U, NumElts));
60470 SDValue Cvt, Chain;
60471 if (IsStrict) {
60472 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
60473 {N->getOperand(0), Src});
60474 Chain = Cvt.getValue(1);
60475 } else {
60476 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
60477 }
60478
60479 if (NumElts < 4) {
60480 assert(NumElts == 2 && "Unexpected size");
60481 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
60482 DAG.getVectorIdxConstant(0, dl));
60483 }
60484
60485 if (IsStrict) {
60486 // Extend to the original VT if necessary.
60487 if (Cvt.getValueType() != VT) {
60488 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
60489 {Chain, Cvt});
60490 Chain = Cvt.getValue(1);
60491 }
60492 return DAG.getMergeValues({Cvt, Chain}, dl);
60493 }
60494
60495 // Extend to the original VT if necessary.
60496 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
60497}
60498
60499 // Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD we can extract from.
60502 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
60503 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
60504 "Unknown broadcast load type");
60505
60506 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
60507 SDValue Ptr = MemIntrin->getBasePtr();
60508 SDValue Chain = MemIntrin->getChain();
60509 EVT VT = N->getSimpleValueType(0);
60510 EVT MemVT = MemIntrin->getMemoryVT();
60511
60512 // Look at other users of our base pointer and try to find a wider broadcast.
60513 // The input chain and the size of the memory VT must match.
60514 for (SDNode *User : Ptr->users())
60515 if (User != N && User->getOpcode() == N->getOpcode() &&
60516 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
60517 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
60518 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
60519 MemVT.getSizeInBits() &&
60520 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
60522 MemIntrin->isSimple() && "Illegal broadcast load type");
60524 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
60525 VT.getSizeInBits());
60526 Extract = DAG.getBitcast(VT, Extract);
60527 Extract = DCI.CombineTo(N, Extract, SDValue(User, 1));
60528 return Extract;
60529 }
60530
60531 return SDValue();
60532}
60533
60535 const X86Subtarget &Subtarget) {
60536 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60537 return SDValue();
60538
60539 bool IsStrict = N->isStrictFPOpcode();
60540 EVT VT = N->getValueType(0);
60541 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60542 EVT SrcVT = Src.getValueType();
60543
60544 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
60545 SrcVT.getVectorElementType() != MVT::f32)
60546 return SDValue();
60547
60548 SDLoc dl(N);
60549
60550 SDValue Cvt, Chain;
60551 unsigned NumElts = VT.getVectorNumElements();
60552 if (Subtarget.hasFP16()) {
60553 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64),
60554 // v4f32 (xint_to_fp v4i64))))
60555 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64),
60556 // v8f16 (CVTXI2P v4i64)))
60557 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS &&
60558 Src.getNumOperands() == 2) {
60559 SDValue Cvt0, Cvt1;
60560 SDValue Op0 = Src.getOperand(0);
60561 SDValue Op1 = Src.getOperand(1);
60562 bool IsOp0Strict = Op0->isStrictFPOpcode();
60563 if (Op0.getOpcode() != Op1.getOpcode() ||
60564 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
60565 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
60566 return SDValue();
60567 }
60568 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
60569 if (IsStrict) {
60570 assert(IsOp0Strict && "Op0 must be strict node");
60571 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
60574 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60575 {Op0.getOperand(0), Op0.getOperand(1)});
60576 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60577 {Op1.getOperand(0), Op1.getOperand(1)});
60578 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60579 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
60580 }
60581 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
60583 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
60584 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
60585 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60586 }
60587 return SDValue();
60588 }
60589
60590 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60591 return SDValue();
60592
60593 // Widen to at least 4 input elements.
60594 if (NumElts < 4)
60595 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
60596 DAG.getConstantFP(0.0, dl, SrcVT));
60597
60598 // Destination is v8i16 with at least 8 elements.
60599 EVT CvtVT =
60600 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
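// An immediate of 4 asks VCVTPS2PH to use the current MXCSR rounding mode (imm8
// bit 2 set) rather than an explicit rounding control.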
60601 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
60602 if (IsStrict) {
60603 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
60604 {N->getOperand(0), Src, Rnd});
60605 Chain = Cvt.getValue(1);
60606 } else {
60607 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
60608 }
60609
60610 // Extract down to real number of elements.
60611 if (NumElts < 8) {
60613 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
60614 DAG.getVectorIdxConstant(0, dl));
60615 }
60616
60617 Cvt = DAG.getBitcast(VT, Cvt);
60618
60619 if (IsStrict)
60620 return DAG.getMergeValues({Cvt, Chain}, dl);
60621
60622 return Cvt;
60623}
60624
60626 SDValue Src = N->getOperand(0);
60627
60628 // Turn MOVDQ2Q+simple_load into an mmx load.
60629 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
60630 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
60631
60632 if (LN->isSimple()) {
60633 SDValue NewLd =
60634 DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), LN->getBasePtr(),
60635 LN->getPointerInfo(), LN->getBaseAlign(),
60636 LN->getMemOperand()->getFlags());
60637 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
60638 return NewLd;
60639 }
60640 }
60641
60642 return SDValue();
60643}
60644
60647 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
60648 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60649 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
60650 return SDValue(N, 0);
60651
60652 return SDValue();
60653}
60654
60655// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
60656// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
60657// use x86mmx instead.
60659 SDLoc dl(N);
60660
60661 bool MadeChange = false, CastReturnVal = false;
60663 for (const SDValue &Arg : N->op_values()) {
60664 if (Arg.getValueType() == MVT::v1i64) {
60665 MadeChange = true;
60666 Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
60667 } else
60668 Args.push_back(Arg);
60669 }
60670 SDVTList VTs = N->getVTList();
60671 SDVTList NewVTs = VTs;
60672 if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
60673 SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
60674 NewVTArr[0] = MVT::x86mmx;
60675 NewVTs = DAG.getVTList(NewVTArr);
60676 MadeChange = true;
60677 CastReturnVal = true;
60678 }
60679
60680 if (MadeChange) {
60681 SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
60682 if (CastReturnVal) {
60684 for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
60685 Returns.push_back(Result.getValue(i));
60686 Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
60687 return DAG.getMergeValues(Returns, dl);
60688 }
60689 return Result;
60690 }
60691 return SDValue();
60692}
60695 if (!DCI.isBeforeLegalize())
60696 return SDValue();
60697
60698 unsigned IntNo = N->getConstantOperandVal(0);
60699 const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
60700
60701 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60702 return FixupMMXIntrinsicTypes(N, DAG);
60703
60704 return SDValue();
60705}
60706
60709 if (!DCI.isBeforeLegalize())
60710 return SDValue();
60711
60712 unsigned IntNo = N->getConstantOperandVal(1);
60713 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60714
60715 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60716 return FixupMMXIntrinsicTypes(N, DAG);
60717
60718 return SDValue();
60719}
60720
60723 if (!DCI.isBeforeLegalize())
60724 return SDValue();
60725
60726 unsigned IntNo = N->getConstantOperandVal(1);
60727 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60728
60729 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60730 return FixupMMXIntrinsicTypes(N, DAG);
60731
60732 return SDValue();
60733}
60734
60736 DAGCombinerInfo &DCI) const {
60737 SelectionDAG &DAG = DCI.DAG;
60738 switch (N->getOpcode()) {
60739 // clang-format off
60740 default: break;
60742 return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
60744 case X86ISD::PEXTRW:
60745 case X86ISD::PEXTRB:
60746 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
60748 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
60750 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
60752 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
60753 case ISD::VSELECT:
60754 case ISD::SELECT:
60755 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
60756 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
60757 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
60758 case X86ISD::CMP: return combineCMP(N, DAG, DCI, Subtarget);
60759 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
60760 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
60761 case X86ISD::ADD:
60762 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
60763 case X86ISD::CLOAD:
60764 case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
60765 case X86ISD::SBB: return combineSBB(N, DAG);
60766 case X86ISD::ADC: return combineADC(N, DAG, DCI);
60767 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
60768 case ISD::SHL: return combineShiftLeft(N, DAG, Subtarget);
60769 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
60770 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
60771 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
60772 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
60773 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
60774 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
60775 case ISD::AVGCEILS:
60776 case ISD::AVGCEILU:
60777 case ISD::AVGFLOORS:
60778 case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget);
60779 case X86ISD::BEXTR:
60780 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
60781 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
60782 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
60783 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
60784 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
60786 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
60787 case ISD::SINT_TO_FP:
60789 return combineSIntToFP(N, DAG, DCI, Subtarget);
60790 case ISD::UINT_TO_FP:
60792 return combineUIntToFP(N, DAG, Subtarget);
60793 case ISD::FP_TO_SINT: return combineFPToSInt(N, DAG, Subtarget);
60794 case ISD::LRINT:
60795 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
60796 case ISD::FADD:
60797 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
60798 case X86ISD::VFCMULC:
60799 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
60800 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
60801 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
60802 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
60803 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
60804 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
60805 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
60806 case X86ISD::FXOR:
60807 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
60808 case X86ISD::FMIN:
60809 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
60810 case ISD::FMINNUM:
60811 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
60812 case X86ISD::CVTSI2P:
60813 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
60814 case X86ISD::CVTP2SI:
60815 case X86ISD::CVTP2UI:
60817 case X86ISD::CVTTP2SI:
60819 case X86ISD::CVTTP2UI:
60820 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
60822 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
60823 case X86ISD::BT: return combineBT(N, DAG, DCI);
60824 case ISD::ANY_EXTEND:
60825 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
60826 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
60827 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
60831 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
60832 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
60833 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
60834 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
60835 case X86ISD::PACKSS:
60836 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
60837 case X86ISD::HADD:
60838 case X86ISD::HSUB:
60839 case X86ISD::FHADD:
60840 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
60841 case X86ISD::VSHL:
60842 case X86ISD::VSRA:
60843 case X86ISD::VSRL:
60844 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
60845 case X86ISD::VSHLI:
60846 case X86ISD::VSRAI:
60847 case X86ISD::VSRLI:
60848 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
60850 case X86ISD::PINSRB:
60851 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
60852 case X86ISD::SHUFP: // Handle all target specific shuffles
60853 case X86ISD::INSERTPS:
60854 case X86ISD::EXTRQI:
60855 case X86ISD::INSERTQI:
60856 case X86ISD::VALIGN:
60857 case X86ISD::PALIGNR:
60858 case X86ISD::VSHLDQ:
60859 case X86ISD::VSRLDQ:
60860 case X86ISD::BLENDI:
60861 case X86ISD::UNPCKH:
60862 case X86ISD::UNPCKL:
60863 case X86ISD::MOVHLPS:
60864 case X86ISD::MOVLHPS:
60865 case X86ISD::PSHUFB:
60866 case X86ISD::PSHUFD:
60867 case X86ISD::PSHUFHW:
60868 case X86ISD::PSHUFLW:
60869 case X86ISD::MOVSHDUP:
60870 case X86ISD::MOVSLDUP:
60871 case X86ISD::MOVDDUP:
60872 case X86ISD::MOVSS:
60873 case X86ISD::MOVSD:
60874 case X86ISD::MOVSH:
60875 case X86ISD::VBROADCAST:
60876 case X86ISD::VPPERM:
60877 case X86ISD::VPERMI:
60878 case X86ISD::VPERMV:
60879 case X86ISD::VPERMV3:
60880 case X86ISD::VPERMIL2:
60881 case X86ISD::VPERMILPI:
60882 case X86ISD::VPERMILPV:
60883 case X86ISD::VPERM2X128:
60884 case X86ISD::SHUF128:
60885 case X86ISD::VZEXT_MOVL:
60886 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
60887 case X86ISD::FMADD_RND:
60888 case X86ISD::FMSUB:
60890 case X86ISD::FMSUB_RND:
60891 case X86ISD::FNMADD:
60893 case X86ISD::FNMADD_RND:
60894 case X86ISD::FNMSUB:
60896 case X86ISD::FNMSUB_RND:
60897 case ISD::FMA:
60898 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
60901 case X86ISD::FMADDSUB:
60902 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
60903 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
60904 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
60905 case X86ISD::MGATHER:
60906 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
60907 case ISD::MGATHER:
60908 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
60909 case X86ISD::PCMPEQ:
60910 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
60911 case X86ISD::PMULDQ:
60912 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
60913 case X86ISD::VPMADDUBSW:
60914 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
60915 case X86ISD::VPMADD52L:
60916 case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI);
60917 case X86ISD::KSHIFTL:
60918 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
60919 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
60921 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget);
60923 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
60925 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
60926 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
60927 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
60928 case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
60929 case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
60930 case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
60932 case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
60933 // clang-format on
60934 }
60935
60936 return SDValue();
60937}
60938
60940 return Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64);
60941}
60942
60943// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
60945 EVT ExtVT) const {
60946 return Subtarget.hasAVX512() || !VT.isVector();
60947}
60948
60950 if (!isTypeLegal(VT))
60951 return false;
60952
60953 // There are no vXi8 shifts.
60954 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
60955 return false;
60956
60957 // TODO: Almost no 8-bit ops are desirable because they have no actual
60958 // size/speed advantages vs. 32-bit ops, but they do have a major
60959 // potential disadvantage by causing partial register stalls.
60960 //
60961 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
60962 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
60963 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
60964 // check for a constant operand to the multiply.
60965 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
60966 return false;
60967
60968 // i16 instruction encodings are longer and some i16 instructions are slow,
60969 // so those are not desirable.
60970 if (VT == MVT::i16) {
60971 switch (Opc) {
60972 default:
60973 break;
60974 case ISD::LOAD:
60975 case ISD::SIGN_EXTEND:
60976 case ISD::ZERO_EXTEND:
60977 case ISD::ANY_EXTEND:
60978 case ISD::MUL:
60979 return false;
60980 case ISD::SHL:
60981 case ISD::SRA:
60982 case ISD::SRL:
60983 case ISD::SUB:
60984 case ISD::ADD:
60985 case ISD::AND:
60986 case ISD::OR:
60987 case ISD::XOR:
60988 // NDD instructions never have the "partial register write" issue because the
60989 // destination register's upper bits [63:OSIZE] are zeroed even when
60990 // OSIZE=8/16.
60991 return Subtarget.hasNDD();
60992 }
60993 }
60994
60995 // Any legal type not explicitly accounted for above here is desirable.
60996 return true;
60997}
60998
61000 SDValue Value, SDValue Addr,
61001 int JTI,
61002 SelectionDAG &DAG) const {
61003 const Module *M = DAG.getMachineFunction().getFunction().getParent();
61004 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
61005 if (IsCFProtectionSupported) {
61006 // If control-flow branch protection is enabled, we need to add a notrack
61007 // prefix to the indirect branch.
61008 // To do that we create an NT_BRIND SDNode.
61009 // Upon ISEL, the pattern will convert it to a jmp with the NoTrack prefix.
61010 SDValue Chain = Value;
61011 // Jump table debug info is only needed if CodeView is enabled.
61013 Chain = DAG.getJumpTableDebugInfo(JTI, Chain, dl);
61014 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Chain, Addr);
61015 }
61016
61017 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
61018}
61019
61022 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
61024 EVT VT = LogicOp->getValueType(0);
61025 EVT OpVT = SETCC0->getOperand(0).getValueType();
61026 if (!VT.isInteger())
61028
61029 if (VT.isVector())
61034
61035 // Don't use `NotAnd` as even though `not` is generally shorter code size than
61036 // `add`, `add` can lower to LEA which can save moves / spills. In any case
61037 // where `NotAnd` applies, `AddAnd` does as well.
61038 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
61039 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
61041}
61042
61044 EVT VT = Op.getValueType();
61045 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
61046 isa<ConstantSDNode>(Op.getOperand(1));
61047
61048 // i16 is legal, but undesirable since i16 instruction encodings are longer
61049 // and some i16 instructions are slow.
61050 // 8-bit multiply-by-constant can usually be expanded to something cheaper
61051 // using LEA and/or other ALU ops.
61052 if (VT != MVT::i16 && !Is8BitMulByConstant)
61053 return false;
61054
61055 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
61056 if (!Op.hasOneUse())
61057 return false;
61058 SDNode *User = *Op->user_begin();
61059 if (User->getOpcode() != ISD::STORE)
61060 return false;
61061 auto *Ld = cast<LoadSDNode>(Load);
61062 auto *St = cast<StoreSDNode>(User);
61063 return Ld->getBasePtr() == St->getBasePtr();
61064 };
61065
61066 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
61067 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
61068 return false;
61069 if (!Op.hasOneUse())
61070 return false;
61071 SDNode *User = *Op->user_begin();
61072 if (User->getOpcode() != ISD::ATOMIC_STORE)
61073 return false;
61074 auto *Ld = cast<AtomicSDNode>(Load);
61075 auto *St = cast<AtomicSDNode>(User);
61076 return Ld->getBasePtr() == St->getBasePtr();
61077 };
61078
61079 auto IsFoldableZext = [](SDValue Op) {
61080 if (!Op.hasOneUse())
61081 return false;
61082 SDNode *User = *Op->user_begin();
61083 EVT VT = User->getValueType(0);
61084 return (User->getOpcode() == ISD::ZERO_EXTEND &&
61085 (VT == MVT::i32 || VT == MVT::i64));
61086 };
61087
61088 bool Commute = false;
61089 switch (Op.getOpcode()) {
61090 default: return false;
61091 case ISD::SIGN_EXTEND:
61092 case ISD::ZERO_EXTEND:
61093 case ISD::ANY_EXTEND:
61094 break;
61095 case ISD::SHL:
61096 case ISD::SRA:
61097 case ISD::SRL: {
61098 SDValue N0 = Op.getOperand(0);
61099 // Look out for (store (shl (load), x)).
61100 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
61101 return false;
61102 break;
61103 }
61104 case ISD::MUL:
61105 // When ZU is enabled, we prefer to not promote for MUL by a constant
61106 // when there is an opportunity to fold a zext with imulzu.
61107 if (Subtarget.hasZU() && IsFoldableZext(Op) &&
61108 (isa<ConstantSDNode>(Op.getOperand(0)) ||
61109 isa<ConstantSDNode>(Op.getOperand(1))))
61110 return false;
61111 [[fallthrough]];
61112 case ISD::ADD:
61113 case ISD::AND:
61114 case ISD::OR:
61115 case ISD::XOR:
61116 Commute = true;
61117 [[fallthrough]];
61118 case ISD::SUB: {
61119 SDValue N0 = Op.getOperand(0);
61120 SDValue N1 = Op.getOperand(1);
61121 // Avoid disabling potential load folding opportunities.
61122 if (X86::mayFoldLoad(N1, Subtarget) &&
61123 (!Commute || !isa<ConstantSDNode>(N0) ||
61124 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
61125 return false;
61126 if (X86::mayFoldLoad(N0, Subtarget) &&
61127 ((Commute && !isa<ConstantSDNode>(N1)) ||
61128 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
61129 return false;
61130 if (IsFoldableAtomicRMW(N0, Op) ||
61131 (Commute && IsFoldableAtomicRMW(N1, Op)))
61132 return false;
61133 }
61134 }
61135
61136 PVT = MVT::i32;
61137 return true;
61138}
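// Illustrative example (not from the original source): a foldable i16
// read-modify-write such as "addw $8, (%rdi)" is left at 16 bits so the load
// and store stay folded, while a plain register-register i16 add is promoted
// to a 32-bit add (PVT = MVT::i32 above).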
61139
61140//===----------------------------------------------------------------------===//
61141// X86 Inline Assembly Support
61142//===----------------------------------------------------------------------===//
61143
61144 static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
61145 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
61146 .Case("{@cca}", X86::COND_A)
61147 .Case("{@ccae}", X86::COND_AE)
61148 .Case("{@ccb}", X86::COND_B)
61149 .Case("{@ccbe}", X86::COND_BE)
61150 .Case("{@ccc}", X86::COND_B)
61151 .Case("{@cce}", X86::COND_E)
61152 .Case("{@ccz}", X86::COND_E)
61153 .Case("{@ccg}", X86::COND_G)
61154 .Case("{@ccge}", X86::COND_GE)
61155 .Case("{@ccl}", X86::COND_L)
61156 .Case("{@ccle}", X86::COND_LE)
61157 .Case("{@ccna}", X86::COND_BE)
61158 .Case("{@ccnae}", X86::COND_B)
61159 .Case("{@ccnb}", X86::COND_AE)
61160 .Case("{@ccnbe}", X86::COND_A)
61161 .Case("{@ccnc}", X86::COND_AE)
61162 .Case("{@ccne}", X86::COND_NE)
61163 .Case("{@ccnz}", X86::COND_NE)
61164 .Case("{@ccng}", X86::COND_LE)
61165 .Case("{@ccnge}", X86::COND_L)
61166 .Case("{@ccnl}", X86::COND_GE)
61167 .Case("{@ccnle}", X86::COND_G)
61168 .Case("{@ccno}", X86::COND_NO)
61169 .Case("{@ccnp}", X86::COND_NP)
61170 .Case("{@ccns}", X86::COND_NS)
61171 .Case("{@cco}", X86::COND_O)
61172 .Case("{@ccp}", X86::COND_P)
61173 .Case("{@ccs}", X86::COND_S)
61174 .Default(X86::COND_INVALID);
61175 return Cond;
61176}
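// Illustrative usage (hypothetical variables): the "{@cc*}" strings come from
// GCC/Clang asm flag-output constraints, e.g.
//   bool below;
//   asm("cmpq %2, %1" : "=@ccb"(below) : "r"(a), "r"(b));
// The backend receives the constraint as "{@ccb}" and maps it to X86::COND_B.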
61177
61178/// Given a constraint letter, return the type of constraint for this target.
61179 X86TargetLowering::ConstraintType
61180 X86TargetLowering::getConstraintType(StringRef Constraint) const {
61181 if (Constraint.size() == 1) {
61182 switch (Constraint[0]) {
61183 case 'R':
61184 case 'q':
61185 case 'Q':
61186 case 'f':
61187 case 't':
61188 case 'u':
61189 case 'y':
61190 case 'x':
61191 case 'v':
61192 case 'l':
61193 case 'k': // AVX512 masking registers.
61194 return C_RegisterClass;
61195 case 'a':
61196 case 'b':
61197 case 'c':
61198 case 'd':
61199 case 'S':
61200 case 'D':
61201 case 'A':
61202 return C_Register;
61203 case 'I':
61204 case 'J':
61205 case 'K':
61206 case 'N':
61207 case 'G':
61208 case 'L':
61209 case 'M':
61210 return C_Immediate;
61211 case 'C':
61212 case 'e':
61213 case 'Z':
61214 return C_Other;
61215 default:
61216 break;
61217 }
61218 }
61219 else if (Constraint.size() == 2) {
61220 switch (Constraint[0]) {
61221 default:
61222 break;
61223 case 'W':
61224 if (Constraint[1] != 's')
61225 break;
61226 return C_Other;
61227 case 'Y':
61228 switch (Constraint[1]) {
61229 default:
61230 break;
61231 case 'z':
61232 return C_Register;
61233 case 'i':
61234 case 'm':
61235 case 'k':
61236 case 't':
61237 case '2':
61238 return C_RegisterClass;
61239 }
61240 break;
61241 case 'j':
61242 switch (Constraint[1]) {
61243 default:
61244 break;
61245 case 'r':
61246 case 'R':
61247 return C_RegisterClass;
61248 }
61249 }
61250 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61251 return C_Other;
61252 return TargetLowering::getConstraintType(Constraint);
61253}
61254
61255/// Examine constraint type and operand type and determine a weight value.
61256/// This object must already have been set up with the operand type
61257/// and the current alternative constraint selected.
61258 TargetLowering::ConstraintWeight
61259 X86TargetLowering::getSingleConstraintMatchWeight(
61260 AsmOperandInfo &Info, const char *Constraint) const {
61261 ConstraintWeight Wt = CW_Invalid;
61262 Value *CallOperandVal = Info.CallOperandVal;
61263 // If we don't have a value, we can't do a match,
61264 // but allow it at the lowest weight.
61265 if (!CallOperandVal)
61266 return CW_Default;
61267 Type *Ty = CallOperandVal->getType();
61268 // Look at the constraint type.
61269 switch (*Constraint) {
61270 default:
61271 Wt = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint);
61272 [[fallthrough]];
61273 case 'R':
61274 case 'q':
61275 case 'Q':
61276 case 'a':
61277 case 'b':
61278 case 'c':
61279 case 'd':
61280 case 'S':
61281 case 'D':
61282 case 'A':
61283 if (CallOperandVal->getType()->isIntegerTy())
61284 Wt = CW_SpecificReg;
61285 break;
61286 case 'f':
61287 case 't':
61288 case 'u':
61289 if (Ty->isFloatingPointTy())
61290 Wt = CW_SpecificReg;
61291 break;
61292 case 'y':
61293 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61294 Wt = CW_SpecificReg;
61295 break;
61296 case 'Y':
61297 if (StringRef(Constraint).size() != 2)
61298 break;
61299 switch (Constraint[1]) {
61300 default:
61301 return CW_Invalid;
61302 // XMM0
61303 case 'z':
61304 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61305 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
61306 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
61307 return CW_SpecificReg;
61308 return CW_Invalid;
61309 // Conditional OpMask regs (AVX512)
61310 case 'k':
61311 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61312 return CW_Register;
61313 return CW_Invalid;
61314 // Any MMX reg
61315 case 'm':
61316 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61317 return CW_SpecificReg;
61318 return CW_Invalid;
61319 // Any SSE reg when ISA >= SSE2, same as 'x'
61320 case 'i':
61321 case 't':
61322 case '2':
61323 if (!Subtarget.hasSSE2())
61324 return CW_Invalid;
61325 break;
61326 }
61327 break;
61328 case 'j':
61329 if (StringRef(Constraint).size() != 2)
61330 break;
61331 switch (Constraint[1]) {
61332 default:
61333 return CW_Invalid;
61334 case 'r':
61335 case 'R':
61336 if (CallOperandVal->getType()->isIntegerTy())
61337 Wt = CW_SpecificReg;
61338 break;
61339 }
61340 break;
61341 case 'v':
61342 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
61343 Wt = CW_Register;
61344 [[fallthrough]];
61345 case 'x':
61346 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61347 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
61348 Wt = CW_Register;
61349 break;
61350 case 'k':
61351 // Enable conditional vector operations using %k<#> registers.
61352 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61353 Wt = CW_Register;
61354 break;
61355 case 'I':
61356 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
61357 if (C->getZExtValue() <= 31)
61358 Wt = CW_Constant;
61359 break;
61360 case 'J':
61361 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61362 if (C->getZExtValue() <= 63)
61363 Wt = CW_Constant;
61364 break;
61365 case 'K':
61366 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61367 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
61368 Wt = CW_Constant;
61369 break;
61370 case 'L':
61371 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61372 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
61373 Wt = CW_Constant;
61374 break;
61375 case 'M':
61376 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61377 if (C->getZExtValue() <= 3)
61378 Wt = CW_Constant;
61379 break;
61380 case 'N':
61381 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61382 if (C->getZExtValue() <= 0xff)
61383 Wt = CW_Constant;
61384 break;
61385 case 'G':
61386 case 'C':
61387 if (isa<ConstantFP>(CallOperandVal))
61388 Wt = CW_Constant;
61389 break;
61390 case 'e':
61391 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61392 if ((C->getSExtValue() >= -0x80000000LL) &&
61393 (C->getSExtValue() <= 0x7fffffffLL))
61394 Wt = CW_Constant;
61395 break;
61396 case 'Z':
61397 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61398 if (C->getZExtValue() <= 0xffffffff)
61399 Wt = CW_Constant;
61400 break;
61401 }
61402 return Wt;
61403}
61404
61405/// Try to replace an X constraint, which matches anything, with another that
61406/// has more specific requirements based on the type of the corresponding
61407/// operand.
61408 const char *X86TargetLowering::
61409 LowerXConstraint(EVT ConstraintVT) const {
61410 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
61411 // 'f' like normal targets.
61412 if (ConstraintVT.isFloatingPoint()) {
61413 if (Subtarget.hasSSE1())
61414 return "x";
61415 }
61416
61417 return TargetLowering::LowerXConstraint(ConstraintVT);
61418}
61419
61420// Lower @cc targets via setcc.
61421 SDValue X86TargetLowering::LowerAsmOutputForConstraint(
61422 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
61423 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
61424 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
61425 if (Cond == X86::COND_INVALID)
61426 return SDValue();
61427 // Check that return type is valid.
61428 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
61429 OpInfo.ConstraintVT.getSizeInBits() < 8)
61430 report_fatal_error("Glue output operand is of invalid type");
61431
61432 // Get EFLAGS register. Only update chain when copyfrom is glued.
61433 if (Glue.getNode()) {
61434 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
61435 Chain = Glue.getValue(1);
61436 } else
61437 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
61438 // Extract CC code.
61439 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
61440 // Extend to 32-bits
61441 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
61442
61443 return Result;
61444}
61445
61446/// Lower the specified operand into the Ops vector.
61447/// If it is invalid, don't add anything to Ops.
61448 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
61449 StringRef Constraint,
61450 std::vector<SDValue> &Ops,
61451 SelectionDAG &DAG) const {
61452 SDValue Result;
61453 char ConstraintLetter = Constraint[0];
61454 switch (ConstraintLetter) {
61455 default: break;
61456 case 'I':
61457 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61458 if (C->getZExtValue() <= 31) {
61459 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61460 Op.getValueType());
61461 break;
61462 }
61463 }
61464 return;
61465 case 'J':
61466 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61467 if (C->getZExtValue() <= 63) {
61468 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61469 Op.getValueType());
61470 break;
61471 }
61472 }
61473 return;
61474 case 'K':
61475 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61476 if (isInt<8>(C->getSExtValue())) {
61477 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61478 Op.getValueType());
61479 break;
61480 }
61481 }
61482 return;
61483 case 'L':
61484 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61485 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
61486 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
61487 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
61488 Op.getValueType());
61489 break;
61490 }
61491 }
61492 return;
61493 case 'M':
61494 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61495 if (C->getZExtValue() <= 3) {
61496 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61497 Op.getValueType());
61498 break;
61499 }
61500 }
61501 return;
61502 case 'N':
61503 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61504 if (C->getZExtValue() <= 255) {
61505 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61506 Op.getValueType());
61507 break;
61508 }
61509 }
61510 return;
61511 case 'O':
61512 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61513 if (C->getZExtValue() <= 127) {
61514 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61515 Op.getValueType());
61516 break;
61517 }
61518 }
61519 return;
61520 case 'e': {
61521 // 32-bit signed value
61522 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61524 C->getSExtValue())) {
61525 // Widen to 64 bits here to get it sign extended.
61526 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
61527 break;
61528 }
61529 // FIXME gcc accepts some relocatable values here too, but only in certain
61530 // memory models; it's complicated.
61531 }
61532 return;
61533 }
61534 case 'W': {
61535 assert(Constraint[1] == 's');
61536 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
61537 // offset.
61538 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
61539 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
61540 BA->getValueType(0)));
61541 } else {
61542 int64_t Offset = 0;
61543 if (Op->getOpcode() == ISD::ADD &&
61544 isa<ConstantSDNode>(Op->getOperand(1))) {
61545 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
61546 Op = Op->getOperand(0);
61547 }
61548 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61549 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
61550 GA->getValueType(0), Offset));
61551 }
61552 return;
61553 }
61554 case 'Z': {
61555 // 32-bit unsigned value
61556 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61558 C->getZExtValue())) {
61559 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61560 Op.getValueType());
61561 break;
61562 }
61563 }
61564 // FIXME gcc accepts some relocatable values here too, but only in certain
61565 // memory models; it's complicated.
61566 return;
61567 }
61568 case 'i': {
61569 // Literal immediates are always ok.
61570 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
61571 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
61572 BooleanContent BCont = getBooleanContents(MVT::i64);
61573 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
61574 : ISD::SIGN_EXTEND;
61575 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
61576 : CST->getSExtValue();
61577 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
61578 break;
61579 }
61580
61581 // In any sort of PIC mode addresses need to be computed at runtime by
61582 // adding in a register or some sort of table lookup. These can't
61583 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
61584 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
61586 return;
61587
61588 // If we are in non-pic codegen mode, we allow the address of a global (with
61589 // an optional displacement) to be used with 'i'.
61590 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61591 // If we require an extra load to get this address, as in PIC mode, we
61592 // can't accept it.
61594 Subtarget.classifyGlobalReference(GA->getGlobal())))
61595 return;
61596 break;
61597 }
61598 }
61599
61600 if (Result.getNode()) {
61601 Ops.push_back(Result);
61602 return;
61603 }
61604 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
61605}
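// Illustrative usage of the immediate-range constraints handled above
// (hypothetical variables):
//   asm("shll %1, %0" : "+r"(x) : "I"(3));      // 'I': constant in [0, 31]
//   asm("outb %b0, %1" : : "a"(v), "N"(0x80));  // 'N': constant in [0, 255]
// An out-of-range constant is not added to Ops and is rejected as an invalid
// operand for the constraint.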
61606
61607/// Check if \p RC is a general purpose register class.
61608/// I.e., GR* or one of their variant.
61609static bool isGRClass(const TargetRegisterClass &RC) {
61610 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
61611 RC.hasSuperClassEq(&X86::GR16RegClass) ||
61612 RC.hasSuperClassEq(&X86::GR32RegClass) ||
61613 RC.hasSuperClassEq(&X86::GR64RegClass) ||
61614 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
61615}
61616
61617/// Check if \p RC is a vector register class.
61618/// I.e., FR* / VR* or one of their variant.
61619static bool isFRClass(const TargetRegisterClass &RC) {
61620 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
61621 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
61622 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
61623 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
61624 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
61625 RC.hasSuperClassEq(&X86::VR512RegClass);
61626}
61627
61628/// Check if \p RC is a mask register class.
61629/// I.e., VK* or one of their variant.
61630static bool isVKClass(const TargetRegisterClass &RC) {
61631 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
61632 RC.hasSuperClassEq(&X86::VK2RegClass) ||
61633 RC.hasSuperClassEq(&X86::VK4RegClass) ||
61634 RC.hasSuperClassEq(&X86::VK8RegClass) ||
61635 RC.hasSuperClassEq(&X86::VK16RegClass) ||
61636 RC.hasSuperClassEq(&X86::VK32RegClass) ||
61637 RC.hasSuperClassEq(&X86::VK64RegClass);
61638}
61639
61640static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) {
61641 return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32();
61642}
61643
61644std::pair<unsigned, const TargetRegisterClass *>
61645 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
61646 StringRef Constraint,
61647 MVT VT) const {
61648 // First, see if this is a constraint that directly corresponds to an LLVM
61649 // register class.
61650 if (Constraint.size() == 1) {
61651 // GCC Constraint Letters
61652 switch (Constraint[0]) {
61653 default: break;
61654 // 'A' means [ER]AX + [ER]DX.
61655 case 'A':
61656 if (Subtarget.is64Bit())
61657 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
61658 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
61659 "Expecting 64, 32 or 16 bit subtarget");
61660 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
61661
61662 // TODO: Slight differences here in allocation order and leaving
61663 // RIP in the class. Do they matter any more here than they do
61664 // in the normal allocation?
61665 case 'k':
61666 if (Subtarget.hasAVX512()) {
61667 if (VT == MVT::v1i1 || VT == MVT::i1)
61668 return std::make_pair(0U, &X86::VK1RegClass);
61669 if (VT == MVT::v8i1 || VT == MVT::i8)
61670 return std::make_pair(0U, &X86::VK8RegClass);
61671 if (VT == MVT::v16i1 || VT == MVT::i16)
61672 return std::make_pair(0U, &X86::VK16RegClass);
61673 }
61674 if (Subtarget.hasBWI()) {
61675 if (VT == MVT::v32i1 || VT == MVT::i32)
61676 return std::make_pair(0U, &X86::VK32RegClass);
61677 if (VT == MVT::v64i1 || VT == MVT::i64)
61678 return std::make_pair(0U, &X86::VK64RegClass);
61679 }
61680 break;
61681 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
61682 if (Subtarget.is64Bit()) {
61683 if (VT == MVT::i8 || VT == MVT::i1)
61684 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61685 ? &X86::GR8RegClass
61686 : &X86::GR8_NOREX2RegClass);
61687 if (VT == MVT::i16)
61688 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61689 ? &X86::GR16RegClass
61690 : &X86::GR16_NOREX2RegClass);
61691 if (VT == MVT::i32 || VT == MVT::f32)
61692 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61693 ? &X86::GR32RegClass
61694 : &X86::GR32_NOREX2RegClass);
61695 if (VT != MVT::f80 && !VT.isVector())
61696 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61697 ? &X86::GR64RegClass
61698 : &X86::GR64_NOREX2RegClass);
61699 break;
61700 }
61701 [[fallthrough]];
61702 // 32-bit fallthrough
61703 case 'Q': // Q_REGS
61704 if (VT == MVT::i8 || VT == MVT::i1)
61705 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
61706 if (VT == MVT::i16)
61707 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
61708 if (VT == MVT::i32 || VT == MVT::f32 ||
61709 (!VT.isVector() && !Subtarget.is64Bit()))
61710 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
61711 if (VT != MVT::f80 && !VT.isVector())
61712 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
61713 break;
61714 case 'r': // GENERAL_REGS
61715 case 'l': // INDEX_REGS
61716 if (VT == MVT::i8 || VT == MVT::i1)
61717 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61718 ? &X86::GR8RegClass
61719 : &X86::GR8_NOREX2RegClass);
61720 if (VT == MVT::i16)
61721 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61722 ? &X86::GR16RegClass
61723 : &X86::GR16_NOREX2RegClass);
61724 if (VT == MVT::i32 || VT == MVT::f32 ||
61725 (!VT.isVector() && !Subtarget.is64Bit()))
61726 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61727 ? &X86::GR32RegClass
61728 : &X86::GR32_NOREX2RegClass);
61729 if (VT != MVT::f80 && !VT.isVector())
61730 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61731 ? &X86::GR64RegClass
61732 : &X86::GR64_NOREX2RegClass);
61733 break;
61734 case 'R': // LEGACY_REGS
61735 if (VT == MVT::i8 || VT == MVT::i1)
61736 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
61737 if (VT == MVT::i16)
61738 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
61739 if (VT == MVT::i32 || VT == MVT::f32 ||
61740 (!VT.isVector() && !Subtarget.is64Bit()))
61741 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
61742 if (VT != MVT::f80 && !VT.isVector())
61743 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
61744 break;
61745 case 'f': // FP Stack registers.
61746 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
61747 // value to the correct fpstack register class.
61748 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
61749 return std::make_pair(0U, &X86::RFP32RegClass);
61750 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
61751 return std::make_pair(0U, &X86::RFP64RegClass);
61752 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
61753 return std::make_pair(0U, &X86::RFP80RegClass);
61754 break;
61755 case 'y': // MMX_REGS if MMX allowed.
61756 if (!Subtarget.hasMMX()) break;
61757 return std::make_pair(0U, &X86::VR64RegClass);
61758 case 'v':
61759 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
61760 if (!Subtarget.hasSSE1()) break;
61761 bool VConstraint = (Constraint[0] == 'v');
61762
61763 switch (VT.SimpleTy) {
61764 default: break;
61765 // Scalar SSE types.
61766 case MVT::f16:
61767 if (VConstraint && Subtarget.hasFP16())
61768 return std::make_pair(0U, &X86::FR16XRegClass);
61769 break;
61770 case MVT::f32:
61771 case MVT::i32:
61772 if (VConstraint && Subtarget.hasVLX())
61773 return std::make_pair(0U, &X86::FR32XRegClass);
61774 return std::make_pair(0U, &X86::FR32RegClass);
61775 case MVT::f64:
61776 case MVT::i64:
61777 if (VConstraint && Subtarget.hasVLX())
61778 return std::make_pair(0U, &X86::FR64XRegClass);
61779 return std::make_pair(0U, &X86::FR64RegClass);
61780 case MVT::i128:
61781 if (Subtarget.is64Bit()) {
61782 if (VConstraint && Subtarget.hasVLX())
61783 return std::make_pair(0U, &X86::VR128XRegClass);
61784 return std::make_pair(0U, &X86::VR128RegClass);
61785 }
61786 break;
61787 // Vector types and fp128.
61788 case MVT::v8f16:
61789 if (!Subtarget.hasFP16())
61790 break;
61791 if (VConstraint)
61792 return std::make_pair(0U, &X86::VR128XRegClass);
61793 return std::make_pair(0U, &X86::VR128RegClass);
61794 case MVT::v8bf16:
61795 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61796 break;
61797 if (VConstraint)
61798 return std::make_pair(0U, &X86::VR128XRegClass);
61799 return std::make_pair(0U, &X86::VR128RegClass);
61800 case MVT::f128:
61801 if (!Subtarget.is64Bit())
61802 break;
61803 [[fallthrough]];
61804 case MVT::v16i8:
61805 case MVT::v8i16:
61806 case MVT::v4i32:
61807 case MVT::v2i64:
61808 case MVT::v4f32:
61809 case MVT::v2f64:
61810 if (VConstraint && Subtarget.hasVLX())
61811 return std::make_pair(0U, &X86::VR128XRegClass);
61812 return std::make_pair(0U, &X86::VR128RegClass);
61813 // AVX types.
61814 case MVT::v16f16:
61815 if (!Subtarget.hasFP16())
61816 break;
61817 if (VConstraint)
61818 return std::make_pair(0U, &X86::VR256XRegClass);
61819 return std::make_pair(0U, &X86::VR256RegClass);
61820 case MVT::v16bf16:
61821 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61822 break;
61823 if (VConstraint)
61824 return std::make_pair(0U, &X86::VR256XRegClass);
61825 return std::make_pair(0U, &X86::VR256RegClass);
61826 case MVT::v32i8:
61827 case MVT::v16i16:
61828 case MVT::v8i32:
61829 case MVT::v4i64:
61830 case MVT::v8f32:
61831 case MVT::v4f64:
61832 if (VConstraint && Subtarget.hasVLX())
61833 return std::make_pair(0U, &X86::VR256XRegClass);
61834 if (Subtarget.hasAVX())
61835 return std::make_pair(0U, &X86::VR256RegClass);
61836 break;
61837 case MVT::v32f16:
61838 if (!Subtarget.hasFP16())
61839 break;
61840 if (VConstraint)
61841 return std::make_pair(0U, &X86::VR512RegClass);
61842 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61843 case MVT::v32bf16:
61844 if (!Subtarget.hasBF16())
61845 break;
61846 if (VConstraint)
61847 return std::make_pair(0U, &X86::VR512RegClass);
61848 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61849 case MVT::v64i8:
61850 case MVT::v32i16:
61851 case MVT::v8f64:
61852 case MVT::v16f32:
61853 case MVT::v16i32:
61854 case MVT::v8i64:
61855 if (!Subtarget.hasAVX512()) break;
61856 if (VConstraint)
61857 return std::make_pair(0U, &X86::VR512RegClass);
61858 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61859 }
61860 break;
61861 }
61862 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
61863 switch (Constraint[1]) {
61864 default:
61865 break;
61866 case 'i':
61867 case 't':
61868 case '2':
61869 return getRegForInlineAsmConstraint(TRI, "x", VT);
61870 case 'm':
61871 if (!Subtarget.hasMMX()) break;
61872 return std::make_pair(0U, &X86::VR64RegClass);
61873 case 'z':
61874 if (!Subtarget.hasSSE1()) break;
61875 switch (VT.SimpleTy) {
61876 default: break;
61877 // Scalar SSE types.
61878 case MVT::f16:
61879 if (!Subtarget.hasFP16())
61880 break;
61881 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
61882 case MVT::f32:
61883 case MVT::i32:
61884 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
61885 case MVT::f64:
61886 case MVT::i64:
61887 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
61888 case MVT::v8f16:
61889 if (!Subtarget.hasFP16())
61890 break;
61891 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61892 case MVT::v8bf16:
61893 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61894 break;
61895 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61896 case MVT::f128:
61897 case MVT::v16i8:
61898 case MVT::v8i16:
61899 case MVT::v4i32:
61900 case MVT::v2i64:
61901 case MVT::v4f32:
61902 case MVT::v2f64:
61903 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61904 // AVX types.
61905 case MVT::v16f16:
61906 if (!Subtarget.hasFP16())
61907 break;
61908 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61909 case MVT::v16bf16:
61910 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61911 break;
61912 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61913 case MVT::v32i8:
61914 case MVT::v16i16:
61915 case MVT::v8i32:
61916 case MVT::v4i64:
61917 case MVT::v8f32:
61918 case MVT::v4f64:
61919 if (Subtarget.hasAVX())
61920 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61921 break;
61922 case MVT::v32f16:
61923 if (!Subtarget.hasFP16())
61924 break;
61925 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61926 case MVT::v32bf16:
61927 if (!Subtarget.hasBF16())
61928 break;
61929 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61930 case MVT::v64i8:
61931 case MVT::v32i16:
61932 case MVT::v8f64:
61933 case MVT::v16f32:
61934 case MVT::v16i32:
61935 case MVT::v8i64:
61936 if (Subtarget.hasAVX512())
61937 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61938 break;
61939 }
61940 break;
61941 case 'k':
61942 // This register class doesn't allocate k0 for masked vector operations.
61943 if (Subtarget.hasAVX512()) {
61944 if (VT == MVT::v1i1 || VT == MVT::i1)
61945 return std::make_pair(0U, &X86::VK1WMRegClass);
61946 if (VT == MVT::v8i1 || VT == MVT::i8)
61947 return std::make_pair(0U, &X86::VK8WMRegClass);
61948 if (VT == MVT::v16i1 || VT == MVT::i16)
61949 return std::make_pair(0U, &X86::VK16WMRegClass);
61950 }
61951 if (Subtarget.hasBWI()) {
61952 if (VT == MVT::v32i1 || VT == MVT::i32)
61953 return std::make_pair(0U, &X86::VK32WMRegClass);
61954 if (VT == MVT::v64i1 || VT == MVT::i64)
61955 return std::make_pair(0U, &X86::VK64WMRegClass);
61956 }
61957 break;
61958 }
61959 } else if (Constraint.size() == 2 && Constraint[0] == 'j') {
61960 switch (Constraint[1]) {
61961 default:
61962 break;
61963 case 'r':
61964 if (VT == MVT::i8 || VT == MVT::i1)
61965 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
61966 if (VT == MVT::i16)
61967 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
61968 if (VT == MVT::i32 || VT == MVT::f32)
61969 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
61970 if (VT != MVT::f80 && !VT.isVector())
61971 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
61972 break;
61973 case 'R':
61974 if (VT == MVT::i8 || VT == MVT::i1)
61975 return std::make_pair(0U, &X86::GR8RegClass);
61976 if (VT == MVT::i16)
61977 return std::make_pair(0U, &X86::GR16RegClass);
61978 if (VT == MVT::i32 || VT == MVT::f32)
61979 return std::make_pair(0U, &X86::GR32RegClass);
61980 if (VT != MVT::f80 && !VT.isVector())
61981 return std::make_pair(0U, &X86::GR64RegClass);
61982 break;
61983 }
61984 }
61985
61986 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61987 return std::make_pair(0U, &X86::GR32RegClass);
61988
61989 // Use the default implementation in TargetLowering to convert the register
61990 // constraint into a member of a register class.
61991 std::pair<Register, const TargetRegisterClass*> Res;
61992 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
61993
61994 // Not found as a standard register?
61995 if (!Res.second) {
61996 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
61997 // to/from f80.
61998 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
61999 // Map st(0) -> st(7) -> ST0
62000 if (Constraint.size() == 7 && Constraint[0] == '{' &&
62001 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
62002 Constraint[3] == '(' &&
62003 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
62004 Constraint[5] == ')' && Constraint[6] == '}') {
62005 // st(7) is not allocatable and thus not a member of RFP80. Return
62006 // singleton class in cases where we have a reference to it.
62007 if (Constraint[4] == '7')
62008 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
62009 return std::make_pair(X86::FP0 + Constraint[4] - '0',
62010 &X86::RFP80RegClass);
62011 }
62012
62013 // GCC allows "st(0)" to be called just plain "st".
62014 if (StringRef("{st}").equals_insensitive(Constraint))
62015 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
62016 }
62017
62018 // flags -> EFLAGS
62019 if (StringRef("{flags}").equals_insensitive(Constraint))
62020 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
62021
62022 // dirflag -> DF
62023 // Only allow for clobber.
62024 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
62025 VT == MVT::Other)
62026 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
62027
62028 // fpsr -> FPSW
62029 // Only allow for clobber.
62030 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
62031 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
62032
62033 return Res;
62034 }
62035
62036 // Make sure it isn't a register that requires 64-bit mode.
62037 if (!Subtarget.is64Bit() &&
62038 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
62039 TRI->getEncodingValue(Res.first) >= 8) {
62040 // Register requires REX prefix, but we're in 32-bit mode.
62041 return std::make_pair(0, nullptr);
62042 }
62043
62044 // Make sure it isn't a register that requires AVX512.
62045 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
62046 TRI->getEncodingValue(Res.first) & 0x10) {
62047 // Register requires EVEX prefix.
62048 return std::make_pair(0, nullptr);
62049 }
62050
62051 // Otherwise, check to see if this is a register class of the wrong value
62052 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
62053 // turn into {ax},{dx}.
62054 // MVT::Other is used to specify clobber names.
62055 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
62056 return Res; // Correct type already, nothing to do.
62057
62058 // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
62059 // return "eax". This should even work for things like getting 64-bit integer
62060 // registers when given an f64 type.
62061 const TargetRegisterClass *Class = Res.second;
62062 // The generic code will match the first register class that contains the
62063 // given register. Thus, based on the ordering of the tablegened file,
62064 // the "plain" GR classes might not come first.
62065 // Therefore, use a helper method.
62066 if (isGRClass(*Class)) {
62067 unsigned Size = VT.getSizeInBits();
62068 if (Size == 1) Size = 8;
62069 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
62070 return std::make_pair(0, nullptr);
62071 Register DestReg = getX86SubSuperRegister(Res.first, Size);
62072 if (DestReg.isValid()) {
62073 bool is64Bit = Subtarget.is64Bit();
62074 const TargetRegisterClass *RC =
62075 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
62076 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
62077 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
62078 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
62079 if (Size == 64 && !is64Bit) {
62080 // Model GCC's behavior here and select a fixed pair of 32-bit
62081 // registers.
62082 switch (DestReg) {
62083 case X86::RAX:
62084 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
62085 case X86::RDX:
62086 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
62087 case X86::RCX:
62088 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
62089 case X86::RBX:
62090 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
62091 case X86::RSI:
62092 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
62093 case X86::RDI:
62094 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
62095 case X86::RBP:
62096 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
62097 default:
62098 return std::make_pair(0, nullptr);
62099 }
62100 }
62101 if (RC && RC->contains(DestReg))
62102 return std::make_pair(DestReg, RC);
62103 return Res;
62104 }
62105 // No register found/type mismatch.
62106 return std::make_pair(0, nullptr);
62107 } else if (isFRClass(*Class)) {
62108 // Handle references to XMM physical registers that got mapped into the
62109 // wrong class. This can happen with constraints like {xmm0} where the
62110 // target independent register mapper will just pick the first match it can
62111 // find, ignoring the required type.
62112
62113 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
62114 if (VT == MVT::f16)
62115 Res.second = &X86::FR16XRegClass;
62116 else if (VT == MVT::f32 || VT == MVT::i32)
62117 Res.second = &X86::FR32XRegClass;
62118 else if (VT == MVT::f64 || VT == MVT::i64)
62119 Res.second = &X86::FR64XRegClass;
62120 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
62121 Res.second = &X86::VR128XRegClass;
62122 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
62123 Res.second = &X86::VR256XRegClass;
62124 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
62125 Res.second = &X86::VR512RegClass;
62126 else {
62127 // Type mismatch and not a clobber: Return an error;
62128 Res.first = 0;
62129 Res.second = nullptr;
62130 }
62131 } else if (isVKClass(*Class)) {
62132 if (VT == MVT::v1i1 || VT == MVT::i1)
62133 Res.second = &X86::VK1RegClass;
62134 else if (VT == MVT::v8i1 || VT == MVT::i8)
62135 Res.second = &X86::VK8RegClass;
62136 else if (VT == MVT::v16i1 || VT == MVT::i16)
62137 Res.second = &X86::VK16RegClass;
62138 else if (VT == MVT::v32i1 || VT == MVT::i32)
62139 Res.second = &X86::VK32RegClass;
62140 else if (VT == MVT::v64i1 || VT == MVT::i64)
62141 Res.second = &X86::VK64RegClass;
62142 else {
62143 // Type mismatch and not a clobber: Return an error;
62144 Res.first = 0;
62145 Res.second = nullptr;
62146 }
62147 }
62148
62149 return Res;
62150}
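// Illustrative example (not from the original source): for a 128-bit vector
// operand, constraint "x" yields VR128 (xmm0-xmm15), while "v" with AVX512VL
// yields VR128X so xmm16-xmm31 are also available; an explicit physical name
// such as "{xmm0}" is resolved by the default path and then retyped above.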
62151
62152bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
62153 // Integer division on x86 is expensive. However, when aggressively optimizing
62154 // for code size, we prefer to use a div instruction, as it is usually smaller
62155 // than the alternative sequence.
62156 // The exception to this is vector division. Since x86 doesn't have vector
62157 // integer division, leaving the division as-is is a loss even in terms of
62158 // size, because it will have to be scalarized, while the alternative code
62159 // sequence can be performed in vector form.
62160 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
62161 return OptSize && !VT.isVector();
62162}
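// Illustrative example (not from the original source): with minsize, a scalar
// "x / 7" keeps the div/idiv instruction, whereas it would normally be
// expanded into a multiply-by-magic-constant sequence; vector divides are
// never kept as division since x86 has no vector integer divide instruction.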
62163
62164void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
62165 if (!Subtarget.is64Bit())
62166 return;
62167
62168 // Update IsSplitCSR in X86MachineFunctionInfo.
62169 X86MachineFunctionInfo *AFI =
62170 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
62171 AFI->setIsSplitCSR(true);
62172}
62173
62174void X86TargetLowering::insertCopiesSplitCSR(
62175 MachineBasicBlock *Entry,
62176 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
62177 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
62178 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
62179 if (!IStart)
62180 return;
62181
62182 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
62183 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
62184 MachineBasicBlock::iterator MBBI = Entry->begin();
62185 for (const MCPhysReg *I = IStart; *I; ++I) {
62186 const TargetRegisterClass *RC = nullptr;
62187 if (X86::GR64RegClass.contains(*I))
62188 RC = &X86::GR64RegClass;
62189 else
62190 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
62191
62192 Register NewVR = MRI->createVirtualRegister(RC);
62193 // Create copy from CSR to a virtual register.
62194 // FIXME: this currently does not emit CFI pseudo-instructions, it works
62195 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
62196 // nounwind. If we want to generalize this later, we may need to emit
62197 // CFI pseudo-instructions.
62198 assert(
62199 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
62200 "Function should be nounwind in insertCopiesSplitCSR!");
62201 Entry->addLiveIn(*I);
62202 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
62203 .addReg(*I);
62204
62205 // Insert the copy-back instructions right before the terminator.
62206 for (auto *Exit : Exits)
62207 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
62208 TII->get(TargetOpcode::COPY), *I)
62209 .addReg(NewVR);
62210 }
62211}
62212
62214 return Subtarget.is64Bit();
62215}
62216
62217 MachineInstr *
62218 X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
62219 MachineBasicBlock::iterator &MBBI,
62220 const TargetInstrInfo *TII) const {
62221 assert(MBBI->isCall() && MBBI->getCFIType() &&
62222 "Invalid call instruction for a KCFI check");
62223
62224 MachineFunction &MF = *MBB.getParent();
62225 // If the call target is a memory operand, unfold it and use R11 for the
62226 // call, so KCFI_CHECK won't have to recompute the address.
62227 switch (MBBI->getOpcode()) {
62228 case X86::CALL64m:
62229 case X86::CALL64m_NT:
62230 case X86::TAILJMPm64:
62231 case X86::TAILJMPm64_REX: {
62234 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
62235 /*UnfoldStore=*/false, NewMIs))
62236 report_fatal_error("Failed to unfold memory operand for a KCFI check");
62237 for (auto *NewMI : NewMIs)
62238 MBBI = MBB.insert(OrigCall, NewMI);
62239 assert(MBBI->isCall() &&
62240 "Unexpected instruction after memory operand unfolding");
62241 if (OrigCall->shouldUpdateAdditionalCallInfo())
62242 MF.moveAdditionalCallInfo(&*OrigCall, &*MBBI);
62243 MBBI->setCFIType(MF, OrigCall->getCFIType());
62244 OrigCall->eraseFromParent();
62245 break;
62246 }
62247 default:
62248 break;
62249 }
62250
62251 MachineOperand &Target = MBBI->getOperand(0);
62252 Register TargetReg;
62253 switch (MBBI->getOpcode()) {
62254 case X86::CALL64r:
62255 case X86::CALL64r_ImpCall:
62256 case X86::CALL64r_NT:
62257 case X86::TAILJMPr64:
62258 case X86::TAILJMPr64_REX:
62259 assert(Target.isReg() && "Unexpected target operand for an indirect call");
62260 Target.setIsRenamable(false);
62261 TargetReg = Target.getReg();
62262 break;
62263 case X86::CALL64pcrel32:
62264 case X86::TAILJMPd64:
62265 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
62266 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
62267 // 64-bit indirect thunk calls.
62268 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
62269 "Unexpected register for an indirect thunk call");
62270 TargetReg = X86::R11;
62271 break;
62272 default:
62273 llvm_unreachable("Unexpected CFI call opcode");
62274 break;
62275 }
62276
62277 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
62278 .addReg(TargetReg)
62279 .addImm(MBBI->getCFIType())
62280 .getInstr();
62281}
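// Illustrative example (not from the original source): a KCFI-checked call
// through memory such as "callq *8(%rdi)" is unfolded into a load of the
// target into %r11, a KCFI_CHECK against %r11, and "callq *%r11", so the
// type-hash check and the call use the same register.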
62282
62283 /// Returns true if stack probing through a function call is requested.
62284 bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
62285 return !getStackProbeSymbolName(MF).empty();
62286 }
62287
62288 /// Returns true if stack probing through inline assembly is requested.
62289 bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
62290
62291 // No inline stack probe for Windows, they have their own mechanism.
62292 if (Subtarget.isOSWindows() || Subtarget.isUEFI() ||
62293 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62294 return false;
62295
62296 // If the function specifically requests inline stack probes, emit them.
62297 if (MF.getFunction().hasFnAttribute("probe-stack"))
62298 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
62299 "inline-asm";
62300
62301 return false;
62302}
62303
62304/// Returns the name of the symbol used to emit stack probes or the empty
62305 /// string if not applicable.
62306 StringRef
62307 X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
62308 // Inline stack probes disable the stack probe call.
62309 if (hasInlineStackProbe(MF))
62310 return "";
62311
62312 // If the function specifically requests stack probes, emit them.
62313 if (MF.getFunction().hasFnAttribute("probe-stack"))
62314 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
62315
62316 // Generally, if we aren't on Windows, the platform ABI does not include
62317 // support for stack probes, so don't emit them.
62318 if ((!Subtarget.isOSWindows() && !Subtarget.isUEFI()) ||
62319 Subtarget.isTargetMachO() ||
62320 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62321 return "";
62322
62323 // We need a stack probe to conform to the Windows ABI. Choose the right
62324 // symbol.
62325 if (Subtarget.is64Bit())
62326 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
62327 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
62328}
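// Illustrative IR-level selection (attribute values are examples):
//   "probe-stack"="inline-asm"  -> inline probes, no probe call
//   "probe-stack"="my_probe"    -> call the named symbol (hypothetical name)
// With no attribute, 64-bit Windows/UEFI targets call __chkstk (___chkstk_ms
// on MinGW), and non-Windows targets emit no probes.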
62329
62330unsigned
62331 X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
62332 // The default stack probe size is 4096 if the function has no stackprobesize
62333 // attribute.
62334 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
62335 4096);
62336}
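// Illustrative example: a function carrying "stack-probe-size"="8192" probes
// the stack in 8 KiB steps instead of the default 4096 bytes.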
62337
62338 Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
62339 if (ML && ML->isInnermost() &&
62340 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
62341 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
62342 return TargetLowering::getPrefLoopAlignment();
62343 }
Given the initializing elements 'Elts' of a vector of type 'VT', see if the elements can be replaced ...
static bool scaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts, SmallVectorImpl< int > &ScaledMask)
static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LoadGlobalBaseReg=false, bool LocalDynamic=false)
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > BrMergingCcmpBias("x86-br-merging-ccmp-bias", cl::init(6), cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target " "supports conditional compare instructions."), cl::Hidden)
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low bits mask.
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef< SDValue > Ops, ArrayRef< int > Mask, ArrayRef< const SDNode * > SrcNodes, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting an PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.
static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert i1-subvector to i1-vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as 128-bit shuffles.
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDNodeFlags Flags=SDNodeFlags())
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or ...
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128-bits into a vector > 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG)
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
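A scalar model of why this works (illustrative only, not the DAG lowering): if each wide lane already holds the sign-extension of its low half, signed saturation cannot kick in, so a PACKSS-style pack simply drops the redundant upper bits and yields an exact truncation.

#include <cstdint>
#include <cstdio>

// One lane of PACKSSDW: saturate an i32 value to the signed i16 range.
static int16_t packssLane(int32_t V) {
  if (V > INT16_MAX) return INT16_MAX;
  if (V < INT16_MIN) return INT16_MIN;
  return (int16_t)V;
}

int main() {
  int16_t Vals[] = {0, 1, -1, 12345, -32768, 32767};
  for (int16_t Low : Vals) {
    int32_t SextInReg = (int32_t)Low;  // in-reg sign extension of the low 16 bits
    std::printf("low=%d packss=%d\n", Low, packssLane(SextInReg)); // always equal
  }
}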
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
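A scalar sketch of the PSUBUS trick (an illustration of the idea, not the lowering code): unsigned saturating subtraction gives zero exactly when x <= y, so 'x > y' can be answered by comparing psubus(x, y) against zero, which sidesteps the missing unsigned vector compares on older SSE levels.

#include <cstdint>
#include <cstdio>

// One byte lane of PSUBUSB: unsigned subtraction saturated at zero.
static uint8_t subus(uint8_t X, uint8_t Y) { return X > Y ? (uint8_t)(X - Y) : 0; }

int main() {
  for (int X : {0, 1, 7, 200, 255})
    for (int Y : {0, 1, 7, 200, 255})
      if ((subus((uint8_t)X, (uint8_t)Y) != 0) != (X > Y))
        std::printf("mismatch x=%d y=%d\n", X, Y);
  std::printf("psubus(x,y) != 0 matches x > y on the sampled values\n");
}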
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static APInt getBLENDIBlendMask(SDValue V)
Get the expanded blend mask from a BLENDI node.
static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, unsigned X86CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< unsigned > CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS, unsigned NumSignificantBitsRHS)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISDPAC...
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static SDValue getSHUFPDImmForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips sign of FP value.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > OriginalMask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
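A source-level illustration of the kind of operation this path handles (shown for orientation; the exact instruction chosen depends on the operation and how the result is used): a sequentially consistent fetch_add on x86 is normally selected as a single LOCK-prefixed read-modify-write such as 'lock xadd'.

#include <atomic>
#include <cstdio>

int main() {
  std::atomic<int> Counter{0};
  // Typically compiles to a LOCK-prefixed RMW (e.g. 'lock xadd') on x86.
  int Old = Counter.fetch_add(5, std::memory_order_seq_cst);
  std::printf("old=%d new=%d\n", Old, Counter.load());
}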
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single - truncated - integer element, coming from a scalar_to_vector/buil...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd, bool &HasAllowContract)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, unsigned Amt=0)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) followed by unpack 256-bit.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, X86Subtarget const &Subtarget, SelectionDAG &DAG)
Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats representing a blend.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts, bool AllowUndefs)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
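A quick scalar check of the bit identity underlying this fold: x ^ (0 - x) == ~blsmsk(x), so the AND can be rewritten as an and-not against BLSMSK(x). The blsmsk() and andn() helpers below are local models of the BMI operations (here andn() complements its first argument), not intrinsics, and the sketch only samples a handful of values.

#include <cstdint>
#include <cstdio>

static uint64_t blsmsk(uint64_t X) { return X ^ (X - 1); } // bits up to lowest set bit
static uint64_t andn(uint64_t A, uint64_t B) { return ~A & B; }

int main() {
  uint64_t Tests[] = {0, 1, 2, 0x10, 0xF0F0F0F0ULL, ~0ULL, 0x8000000000000000ULL};
  for (uint64_t X : Tests)
    for (uint64_t Y : Tests) {
      uint64_t Lhs = Y & (X ^ (0 - X));     // AND(Y, XOR(X, NEG(X)))
      uint64_t Rhs = andn(blsmsk(X), Y);    // and-not against BLSMSK(X)
      if (Lhs != Rhs)
        std::printf("mismatch X=%llx Y=%llx\n",
                    (unsigned long long)X, (unsigned long long)Y);
    }
  std::printf("identity holds on the sampled values\n");
}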
static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
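The classic idiom behind this is an idempotent LOCK-prefixed read-modify-write on a stack slot, for example OR-ing zero into memory at the stack pointer, which acts as a full memory barrier without changing any data and is typically cheaper than MFENCE. Below is a rough inline-assembly illustration of that idiom, assuming x86-64 and GCC/Clang extended-asm syntax; the exact instruction and stack offset the backend picks may differ.

#include <cstdio>

// Sketch only: an idempotent LOCK-prefixed RMW on the memory at %rsp.
// The "memory" clobber makes the compiler treat it as a full barrier.
static inline void lockedStackBarrier() {
#if defined(__x86_64__)
  __asm__ __volatile__("lock orl $0, (%%rsp)" ::: "memory", "cc");
#endif
}

int main() {
  lockedStackBarrier();
  std::printf("barrier executed\n");
}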
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with chain that return their value into registers EDX:EAX.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, Register Reg)
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG)
static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef< int > Mask, int BroadcastableElement=0)
Test whether the specified input (0 or 1) is a broadcast/splat blended by the given mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
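A scalar model of the PSADBW-against-zero idiom commonly used for horizontal byte sums (an illustrative sketch, not the DAG code): PSADBW sums the absolute differences of eight byte pairs, so with an all-zero second operand it simply adds the eight bytes into one wide lane.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

// One 64-bit lane of PSADBW: sum of absolute byte differences.
static uint64_t psadbwLane(const uint8_t A[8], const uint8_t B[8]) {
  uint64_t Sum = 0;
  for (int i = 0; i != 8; ++i)
    Sum += (uint64_t)std::abs((int)A[i] - (int)B[i]);
  return Sum;
}

int main() {
  uint8_t V[8] = {1, 2, 3, 4, 250, 6, 7, 8};
  uint8_t Zero[8] = {0};
  std::printf("byte sum = %llu\n",
              (unsigned long long)psadbwLane(V, Zero)); // 281
}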
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG)
static void growShuffleMask(ArrayRef< int > SrcMask, SmallVectorImpl< int > &DstMask, unsigned SrcSizeInBits, unsigned DstSizeInBits)
static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to compute the in-lane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
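As the name suggests, this rewrites a compare-against-zero using a count-leading-zeros followed by a shift. A standalone sketch of the identity it relies on (illustrative only; lzcnt32() below is a software model, not an intrinsic): lzcnt of a 32-bit value is 32 exactly when the value is zero, so '(x == 0)' can be computed branchlessly as 'lzcnt(x) >> 5'.

#include <cstdint>
#include <cstdio>

// Software model of LZCNT: number of leading zero bits, 32 for x == 0.
static unsigned lzcnt32(uint32_t X) {
  unsigned N = 0;
  for (uint32_t Bit = 0x80000000u; Bit && !(X & Bit); Bit >>= 1)
    ++N;
  return N;
}

int main() {
  uint32_t Tests[] = {0, 1, 2, 0x80000000u, 12345};
  for (uint32_t X : Tests)
    std::printf("x=%u  (x==0)=%d  lzcnt>>5=%u\n", X, X == 0, lzcnt32(X) >> 5);
}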
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isX86CCSigned(X86::CondCode X86CC)
Return true if the condition is a signed comparison operation.
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt=0)
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT, int Scale, int Offset, unsigned ExtOpc, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as an any/signed/zero extension.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
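A scalar sketch of what a split double-word arithmetic shift right has to produce (illustrative of the semantics, not the exact node sequence): the low half combines bits from both 32-bit parts, and amounts of 32 or more fall back to shifting only the high half while the new high half is all sign bits.

#include <cstdint>
#include <cstdio>

// Model of a 64-bit arithmetic shift right expressed on two 32-bit parts.
static void sraParts(uint32_t Lo, int32_t Hi, unsigned Amt,
                     uint32_t &OutLo, int32_t &OutHi) {
  Amt &= 63;
  if (Amt == 0) { OutLo = Lo; OutHi = Hi; return; }
  if (Amt < 32) {
    OutLo = (Lo >> Amt) | ((uint32_t)Hi << (32 - Amt)); // bits from both halves
    OutHi = Hi >> Amt;                                  // arithmetic: keeps the sign
  } else {
    OutLo = (uint32_t)(Hi >> (Amt - 32));
    OutHi = Hi >> 31;                                   // all sign bits
  }
}

int main() {
  int64_t V = -0x123456789ABCDLL;
  for (unsigned Amt : {1u, 8u, 31u, 32u, 40u, 63u}) {
    uint32_t Lo; int32_t Hi;
    sraParts((uint32_t)V, (int32_t)(V >> 32), Amt, Lo, Hi);
    int64_t Rebuilt = (int64_t)(((uint64_t)(uint32_t)Hi << 32) | Lo);
    std::printf("amt=%2u  ref=%lld  parts=%lld\n", Amt,
                (long long)(V >> Amt), (long long)Rebuilt);
  }
}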
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a VSETCC 256/512-bit vector into two new 128/256 ones and then concatenate the result back.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into two half-sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isBlendOrUndef(ArrayRef< int > Mask)
Return true if every element in Mask is an in-place blend/select mask or is undef.
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
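For orientation, a minimal standalone sketch (plain C++, not the LLVM helper itself) of how a 4-element shuffle mask is packed into the PSHUFD/SHUFPS-style imm8, two bits per destination lane; treating undef (negative) lanes as element 0 is an assumption made only for this demo.

#include <cassert>
#include <cstdio>

static unsigned encodeV4ShuffleImm(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i]; // treat undef as element 0 (demo assumption)
    assert(M < 4 && "4-lane immediate can only address elements 0-3");
    Imm |= (unsigned)M << (2 * i);     // 2 bits select the source element per lane
  }
  return Imm;
}

int main() {
  int Identity[4] = {0, 1, 2, 3};
  int Reverse[4] = {3, 2, 1, 0};
  std::printf("identity imm = 0x%02x\n", encodeV4ShuffleImm(Identity)); // 0xE4
  std::printf("reverse  imm = 0x%02x\n", encodeV4ShuffleImm(Reverse));  // 0x1B
}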
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit to mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if opcode is a X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static cl::opt< bool > WidenShift("x86-widen-shift", cl::init(true), cl::desc("Replace narrow shifts with wider shifts."), cl::Hidden)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
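A scalar restatement of the pattern (a sketch for clarity, not the matcher itself): truncating after clamping to the destination type's signed range is exactly a signed-saturating truncate, which is what the detection enables pack-style lowerings to exploit.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// truncate (smin (smax x, signed_min_of_dest), signed_max_of_dest), for i32 -> i16.
static int16_t truncSSat(int32_t X) {
  int32_t Clamped = std::min(std::max(X, (int32_t)INT16_MIN), (int32_t)INT16_MAX);
  return (int16_t)Clamped;
}

int main() {
  int32_t Tests[] = {0, 100, -100, 40000, -40000, INT32_MAX, INT32_MIN};
  for (int32_t X : Tests)
    std::printf("%d -> %d\n", X, truncSSat(X));
}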
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Truncating Store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector combined with a zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask, bool ForceHorizOp)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext); zext(add_nuw(x, C)) --> add(zext(x), C_zext). Promoting a...
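As an aside, a minimal scalar sketch in plain C++ (the helpers addThenSext/sextThenAdd are hypothetical illustrations, not the in-tree combine) of why this promotion is sound: when the narrow add is known not to wrap, extending first and adding the extended constant produces the same value.

  #include <cassert>
  #include <cstdint>

  // Models sext(add_nsw(x, C)) vs. add(sext(x), C_sext) for an 8-bit value
  // promoted to 32 bits; valid only when the narrow add does not overflow.
  int32_t addThenSext(int8_t X, int8_t C) {
    return static_cast<int32_t>(static_cast<int8_t>(X + C)); // sext(add_nsw(x, C))
  }
  int32_t sextThenAdd(int8_t X, int8_t C) {
    return static_cast<int32_t>(X) + static_cast<int32_t>(C); // add(sext(x), C_sext)
  }

  int main() {
    assert(addThenSext(100, 20) == sextThenAdd(100, 20)); // 120 fits in int8_t, so the nsw precondition holds
    return 0;
  }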
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of an ISD::CondCode to the X86-specific condition code,...
static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned, const X86Subtarget &Subtarget)
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
ShrinkMode
Different mul shrinking modes.
static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDNodeFlags Flags=SDNodeFlags())
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static unsigned getSHUFPDImm(ArrayRef< int > Mask)
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG)
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
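A rough sketch of the widening idea in plain C++ (widenMask is a hypothetical, simplified stand-in for the in-tree helper; undef/zero sentinels are ignored): each adjacent pair of mask entries must select an even-aligned, contiguous pair, so {0, 1, 4, 5} over narrow elements becomes {0, 2} over elements of twice the width.

  #include <cassert>
  #include <cstddef>
  #include <vector>

  // Returns true and fills Widened if every pair of entries reads an adjacent,
  // even-aligned pair of narrow elements.
  static bool widenMask(const std::vector<int> &Mask, std::vector<int> &Widened) {
    Widened.clear();
    for (std::size_t I = 0; I + 1 < Mask.size(); I += 2) {
      int Lo = Mask[I], Hi = Mask[I + 1];
      if (Lo % 2 != 0 || Hi != Lo + 1)
        return false;
      Widened.push_back(Lo / 2);
    }
    return true;
  }

  int main() {
    std::vector<int> Wide;
    assert(widenMask({0, 1, 4, 5}, Wide) && Wide == std::vector<int>({0, 2}));
    assert(!widenMask({1, 2, 4, 5}, Wide)); // a misaligned pair cannot be widened
    return 0;
  }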
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a unary integer operation into 2 half-sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget, TargetLowering::DAGCombinerInfo &DCI)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, unsigned RootOpc, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses, bool AllowSubAddOrAddSubContract)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half-sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true)
static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z)). This undoes the inverse fold performed in InstCom...
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128-bits from a vector > 128 bits.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, unsigned RootOpcode, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using native supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code?
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
if(isa< SExtInst >(LHS)) std auto IsFreeTruncation
static const unsigned FramePtr
The Input class is used to parse a yaml document into in-memory structs and vectors.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
static LLVM_ABI APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float that is bitcast from an all-ones integer value.
Definition APFloat.cpp:6082
void clearSign()
Definition APFloat.h:1298
opStatus next(bool nextDown)
Definition APFloat.h:1254
void changeSign()
Definition APFloat.h:1297
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
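A short usage sketch for the APFloat entry points above (illustrative values; assumes LLVM headers and libraries are available to build against).

  #include "llvm/ADT/APFloat.h"
  #include <cassert>
  using namespace llvm;

  int main() {
    // Negative zero via the factory, then flip the sign back.
    APFloat NegZero = APFloat::getZero(APFloat::IEEEsingle(), /*Negative=*/true);
    assert(NegZero.isNegative());
    NegZero.changeSign();
    assert(!NegZero.isNegative());

    // Narrow a double value to single precision and observe the precision loss.
    APFloat Val(1.0000000001);
    bool LosesInfo = false;
    Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &LosesInfo);
    assert(LosesInfo);
    return 0;
  }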
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1406
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition APInt.cpp:520
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:206
void setBit(unsigned BitPosition)
Set the bit whose position is given as "bitPosition" to 1.
Definition APInt.h:1330
APInt abs() const
Get the absolute value.
Definition APInt.h:1795
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
void setSignBit()
Set the sign bit to 1.
Definition APInt.h:1340
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:209
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition APInt.h:216
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:329
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition APInt.h:1249
bool eq(const APInt &RHS) const
Equality comparison.
Definition APInt.h:1079
int32_t exactLogBase2() const
Definition APInt.h:1783
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1396
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition APInt.h:834
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:435
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition APInt.h:1628
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1598
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:219
unsigned countTrailingZeros() const
Definition APInt.h:1647
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1531
void flipAllBits()
Toggle every bit to its opposite value.
Definition APInt.h:1452
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1615
LLVM_ABI void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition APInt.cpp:397
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition APInt.h:1435
unsigned logBase2() const
Definition APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
void setAllBits()
Set every bit to 1.
Definition APInt.h:1319
bool getBoolValue() const
Convert APInt to a boolean value.
Definition APInt.h:471
bool isMask(unsigned numBits) const
Definition APInt.h:488
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:405
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:334
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1150
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1367
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition APInt.h:873
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition APInt.h:341
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:200
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1388
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:432
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1656
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition APInt.h:399
LLVM_ABI APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition APInt.cpp:973
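A compact sketch exercising a few of the APInt factories and bit queries listed above (illustrative; assumes LLVM headers).

  #include "llvm/ADT/APInt.h"
  #include <cassert>
  using namespace llvm;

  int main() {
    APInt Mask = APInt::getLowBitsSet(/*numBits=*/32, /*loBitsSet=*/8); // 0x000000FF
    assert(Mask.isMask(8) && Mask.popcount() == 8);
    assert(Mask.countr_one() == 8 && Mask.countl_zero() == 24);

    APInt Sign = APInt::getSignMask(32); // 0x80000000
    assert(Sign.isSignMask() && Sign.isPowerOf2());

    APInt One = APInt::getOneBitSet(32, 4); // 0x00000010
    assert(One.logBase2() == 4 && One.getZExtValue() == 16);
    return 0;
  }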
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:183
iterator end() const
Definition ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:206
iterator begin() const
Definition ArrayRef.h:135
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
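A small ArrayRef sketch covering slice, drop_back, and equals (illustrative; ArrayRef is a non-owning view, so the backing storage must outlive it).

  #include "llvm/ADT/ArrayRef.h"
  #include <cassert>
  using namespace llvm;

  int main() {
    int Storage[] = {0, 1, 2, 3, 4, 5};
    ArrayRef<int> Whole(Storage); // non-owning view over Storage
    assert(Whole.size() == 6 && !Whole.empty());

    ArrayRef<int> Middle = Whole.slice(2, 3); // {2, 3, 4}
    assert(Middle.equals({2, 3, 4}));

    ArrayRef<int> Front = Whole.drop_back(4); // {0, 1}
    assert(Front.equals({0, 1}));
    return 0;
  }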
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
An instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory location operated on by this instruction.
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v); usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v); minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v); maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
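For orientation, the atomicrmw semantics listed above correspond to familiar read-modify-write patterns; the following is a hedged std::atomic analogue in plain C++ (not the LLVM API) for Add, And, and Min.

  #include <algorithm>
  #include <atomic>
  #include <cassert>

  int main() {
    std::atomic<int> P{10};

    int Old = P.fetch_add(5); // Add: *p = old + v
    assert(Old == 10 && P.load() == 15);

    Old = P.fetch_and(0x6);   // And: *p = old & v
    assert(P.load() == (15 & 0x6));

    // std::atomic<int> has no fetch_min here, so a CAS loop expresses
    // Min: *p = old <signed v ? old : v.
    int Expected = P.load();
    while (!P.compare_exchange_weak(Expected, std::min(Expected, 3))) {
    }
    assert(P.load() == 3);
    return 0;
  }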
This is an SDNode representing atomic operations.
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
size_type count() const
count - Returns the number of bits which are set.
Definition BitVector.h:162
bool any() const
any - Returns true if any bit is set.
Definition BitVector.h:170
bool none() const
none - Returns true if none of the bits are set.
Definition BitVector.h:188
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI bool isConstant() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ ICMP_NE
not equal
Definition InstrTypes.h:700
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:767
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static LLVM_ABI Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
LLVM_ABI Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:165
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:229
unsigned size() const
Definition DenseMap.h:108
bool empty() const
Definition DenseMap.h:107
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:214
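A brief DenseMap sketch for the find / try_emplace / insert entries above (illustrative; assumes LLVM headers).

  #include "llvm/ADT/DenseMap.h"
  #include <cassert>
  using namespace llvm;

  int main() {
    DenseMap<int, const char *> M;
    assert(M.empty());

    // try_emplace only inserts when the key is absent; the second call is a no-op.
    auto [It, Inserted] = M.try_emplace(1, "one");
    assert(Inserted && It->second[0] == 'o');
    auto [It2, Inserted2] = M.try_emplace(1, "uno");
    assert(!Inserted2 && It2->second[0] == 'o'); // original mapping kept

    M.insert({2, "two"});
    assert(M.size() == 2 && M.find(3) == M.end());
    return 0;
  }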
Tagged union holding either a T or a Error.
Definition Error.h:485
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition Function.cpp:774
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:270
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition Function.h:903
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
LLVM_ABI bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition Globals.cpp:436
ThreadLocalMode getThreadLocalMode() const
Module * getParent()
Get the module that this global value is contained inside of...
This class is used to form a handle around another node that is persistent and is updated across invo...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
Class to represent integer types.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
bool usesWindowsCFI() const
Definition MCAsmInfo.h:652
LLVM_ABI MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
LLVM_ABI MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool bitsGE(MVT VT) const
Return true if this has no less bits than VT.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
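A small MVT sketch using the queries listed above (illustrative; the include path assumes the recent llvm/CodeGenTypes header layout).

  #include "llvm/CodeGenTypes/MachineValueType.h"
  #include <cassert>
  using namespace llvm;

  int main() {
    MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);
    assert(V4I32.isVector() && V4I32.isInteger());
    assert(V4I32.is128BitVector() && V4I32.getVectorNumElements() == 4);
    assert(V4I32.getScalarSizeInBits() == 32);

    MVT V2I32 = V4I32.getHalfNumVectorElementsVT(); // same element type, half the lanes
    assert(V2I32.is64BitVector() && V2I32.getVectorNumElements() == 2);

    MVT V4F32 = V4I32.changeVectorElementType(MVT::f32);
    assert(V4F32.isFloatingPoint() && V4F32.getVectorElementType() == MVT::f32);
    return 0;
  }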
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
succ_reverse_iterator succ_rbegin()
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
void moveAdditionalCallInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New call site info.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
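The MachineFunction members above are what custom inserters combine when growing the CFG. A sketch under the assumption that getBasicBlock() and getIterator() (not listed on this page) are also available on MachineBasicBlock:

#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;

// Create a fresh machine basic block tied to the same IR block and place it
// immediately after MBB in the function's block list.
static MachineBasicBlock *addBlockAfter(MachineBasicBlock *MBB) {
  MachineFunction *MF = MBB->getParent();
  MachineBasicBlock *NewMBB =
      MF->CreateMachineBasicBlock(MBB->getBasicBlock());
  MachineFunction::iterator It = ++MBB->getIterator();
  MF->insert(It, NewMBB);
  return NewMBB;
}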
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
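A sketch of the chaining style these MachineInstrBuilder methods enable, assuming the usual BuildMI entry point from MachineInstrBuilder.h and TII->get() from TargetInstrInfo.h (neither is listed above); X86::ADD32ri is assumed from the generated X86 instruction enum and is only an illustrative opcode:

#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

// DstReg = ADD32ri SrcReg, 42, tagged as frame setup.
static void emitAddImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                       const DebugLoc &DL, const TargetInstrInfo *TII,
                       Register DstReg, Register SrcReg) {
  BuildMI(MBB, I, DL, TII->get(X86::ADD32ri), DstReg)
      .addReg(SrcReg)
      .addImm(42)
      .setMIFlag(MachineInstr::FrameSetup);
}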
Representation of each machine instruction.
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
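A sketch tying the MOLoad flag above to the getMachineMemOperand overload listed earlier under MachineFunction; MachinePointerInfo::getFixedStack and LLT::scalar are assumed from their usual headers:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

// Describe an 8-byte, 8-byte-aligned load from fixed stack slot FI.
static MachineMemOperand *makeLoadMMO(MachineFunction &MF, int FI) {
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  return MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                                 LLT::scalar(64), Align(8));
}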
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
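A sketch of the usual pairing of getRegInfo() and getRegClassFor() (both listed on this page) with createVirtualRegister when a scratch virtual register of a given value type is needed:

#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

static Register makeScratchVReg(MachineFunction &MF,
                                const TargetLowering &TLI, MVT VT) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
  return MRI.createVirtualRegister(RC);
}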
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
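A sketch of the kind of legality query a DAG combine performs with the memory-node accessors above before rewriting a store (StoreSDNode's members appear further down this page):

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Only consider simple (non-atomic, non-volatile), non-truncating stores whose
// alignment covers the full in-memory type.
static bool isPlainAlignedStore(const StoreSDNode *St) {
  if (!St->isSimple() || St->isTruncatingStore())
    return false;
  EVT MemVT = St->getMemoryVT();
  return St->getAlign().value() >= MemVT.getStoreSize().getFixedValue();
}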
Root of the metadata hierarchy.
Definition Metadata.h:63
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition Module.cpp:353
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
SDNode * getGluedUser() const
If this node has a glue value with a user, return the user (there is at most one).
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
SDNodeFlags getFlags() const
TypeSize getValueSizeInBits(unsigned ResNo) const
Returns MVT::getSizeInBits(getValueType(ResNo)).
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static LLVM_ABI bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any uses of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
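A sketch of a typical SDValue peephole guard built from the accessors above, matching (and (srl X, C), 1) only when the shift has a single user; dyn_cast on SDValue is assumed from SelectionDAGNodes.h:

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

static bool isBitExtractPattern(SDValue V) {
  if (V.getOpcode() != ISD::AND || V.getNumOperands() != 2)
    return false;
  SDValue Srl = V.getOperand(0);
  if (Srl.getOpcode() != ISD::SRL || !Srl.hasOneUse())
    return false;
  auto *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
  return C && C->getAsZExtVal() == 1;
}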
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
virtual bool isTargetStrictFPOpcode(unsigned Opcode) const
Returns true if a node with the given target-specific opcode has strict floating-point semantics.
Help to insert SDNodeFlags automatically in transforming.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
LLVM_ABI SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
LLVM_ABI SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getExtractSubvector(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Return the VT typed sub-vector of Vec at Idx.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getInsertSubvector(const SDLoc &DL, SDValue Vec, SDValue SubVec, unsigned Idx)
Insert SubVec at the Idx element of Vec.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
LLVM_ABI bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
LLVM_ABI SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI std::optional< unsigned > getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has a uniform shift amount that is less than the element bit-width of the shi...
LLVM_ABI SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, SDNodeFlags Flags=SDNodeFlags())
LLVM_ABI std::optional< unsigned > getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has shift amounts that are all less than the element bit-width of the shift n...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getSplat(EVT VT, const SDLoc &DL, SDValue Op)
Returns a node representing a splat of one value into all lanes of the provided vector type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
LLVM_ABI SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
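A sketch of node construction with the SelectionDAG helpers above; the two-operand getNode overload is assumed from SelectionDAG.h (only the ArrayRef form is listed here):

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Build zext(and(not(X), Mask)) to VT.
static SDValue buildMaskedNot(SelectionDAG &DAG, const SDLoc &DL, SDValue X,
                              EVT VT, uint64_t MaskVal) {
  EVT SrcVT = X.getValueType();
  SDValue NotX = DAG.getNOT(DL, X, SrcVT);
  SDValue Mask = DAG.getConstant(MaskVal, DL, SrcVT);
  SDValue And = DAG.getNode(ISD::AND, DL, SrcVT, NotX, Mask);
  return DAG.getZExtOrTrunc(And, DL, VT);
}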
static LLVM_ABI bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
static int getSplatMaskIndex(ArrayRef< int > Mask)
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:181
size_type size() const
Definition SmallSet.h:170
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
pointer data()
Return a pointer to the vector's buffer, even if empty().
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
static constexpr size_t npos
Definition StringRef.h:57
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition StringRef.h:172
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
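A sketch of the StringSwitch idiom used when mapping textual names (e.g. in inline-asm constraint handling) to registers; .Default() is assumed from StringSwitch.h, and X86::ESP/X86::EBP are assumed from the generated X86 register enum as illustrative choices:

#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

static Register parseSpecialRegName(StringRef Name) {
  return StringSwitch<Register>(Name)
      .Case("esp", X86::ESP)
      .Case("ebp", X86::EBP)
      .Default(Register()); // Register() is invalid; check with isValid().
}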
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
virtual bool hasAndNot(SDValue X) const
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set in interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
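A hedged sketch (hypothetical MyTargetLowering, not the X86 constructor) of how the protected TargetLoweringBase configuration hooks above are driven from a target's TargetLowering constructor; the register class, opcodes, and limits are placeholders:

#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

struct MyTargetLowering : TargetLowering {
  MyTargetLowering(const TargetMachine &TM, const TargetRegisterClass *GPR32RC,
                   const TargetRegisterInfo *TRI)
      : TargetLowering(TM) {
    addRegisterClass(MVT::i32, GPR32RC);
    setBooleanContents(ZeroOrOneBooleanContent);
    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
    setLoadExtAction(ISD::EXTLOAD, MVT::i32, MVT::i1, Promote);
    setTruncStoreAction(MVT::i64, MVT::i32, Expand);
    setMaxAtomicSizeInBitsSupported(64);
    setPrefLoopAlignment(Align(16));
    // Run once all register classes have been added.
    computeRegisterProperties(TRI);
  }
};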
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual unsigned combineRepeatedFPDivisors() const
Indicate whether this target prefers to combine FDIVs with the same divisor.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
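A sketch of the common calling pattern for the SimplifyDemandedBits entry point above: demand only the low bit of Op and let the TargetLoweringOpt object carry any replacement back to the caller.

#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

static bool simplifyLowBitOnly(const TargetLowering &TLI, SDValue Op,
                               TargetLowering::TargetLoweringOpt &TLO) {
  unsigned BitWidth = Op.getScalarValueSizeInBits();
  APInt DemandedBits = APInt::getOneBitSet(BitWidth, 0);
  APInt DemandedElts(1, 1); // Scalar: a single demanded "element".
  KnownBits Known;
  return TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO);
}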
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSBinFormatCOFF() const
Tests whether the OS uses the COFF binary format.
Definition Triple.h:774
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
use_iterator use_begin()
Definition Value.h:364
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
Register getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
bool hasBasePointer(const MachineFunction &MF) const
Register getPtrSizedFrameRegister(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
Register getPtrSizedStackRegister(const MachineFunction &MF) const
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
bool hasAnyFMA() const
bool hasSSE1() const
bool avoidMFence() const
Avoid use of mfence for fence seq_cst, and instead use lock or.
bool hasBitScanPassThrough() const
bool hasSSE42() const
const X86TargetLowering * getTargetLowering() const override
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
bool canUseCMOV() const
bool isTargetDarwin() const
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
const X86InstrInfo * getInstrInfo() const override
bool useAVX512Regs() const
bool hasSSE3() const
bool isCallingConvWin64(CallingConv::ID CC) const
bool hasAVX512() const
bool canExtendTo512DQ() const
bool hasSSE41() const
bool hasSSE2() const
bool hasSSSE3() const
bool hasInt256() const
const X86RegisterInfo * getRegisterInfo() const override
bool hasAVX() const
unsigned getPreferVectorWidth() const
const X86FrameLowering * getFrameLowering() const override
bool useBWIRegs() const
bool hasAVX2() const
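A sketch of the subtarget-driven gating pattern built from the X86Subtarget feature queries listed above: pick a preferred compare width (the thresholds here are illustrative, not X86's actual policy).

#include "X86Subtarget.h"
using namespace llvm;

static unsigned pickCompareWidthBits(const X86Subtarget &Subtarget) {
  if (Subtarget.useAVX512Regs() && Subtarget.canExtendTo512BW())
    return 512;
  if (Subtarget.hasAVX2())
    return 256;
  if (Subtarget.hasSSE2())
    return 128;
  return 64;
}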
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const override
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y ---> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const override
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return the preferred fold type: Abs if this is a vector, AddAnd if it's an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the target's addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isTargetCanonicalSelect(SDNode *N) const override
Return true if the given select/vselect should be considered canonical and not be transformed.
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return true if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
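The X86TargetLowering overrides listed above are hooks that generic SelectionDAG code queries when making target-specific decisions. As a hedged illustration (not code from this file), the sketch below shows how a DAG combine might consult one such hook, hasAndNot(), before forming an and-not pattern; the helper name tryFormAndNot and its parameters are assumptions of the example.

  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/CodeGen/TargetLowering.h"
  using namespace llvm;

  // Hedged sketch: only form (and (xor X, -1), Y) when the target reports a
  // native and-not instruction (ANDN/PANDN/VPANDN on x86).
  static SDValue tryFormAndNot(SelectionDAG &DAG, const TargetLowering &TLI,
                               const SDLoc &DL, EVT VT, SDValue X, SDValue Y) {
    if (!TLI.hasAndNot(Y))
      return SDValue();
    SDValue NotX = DAG.getNOT(DL, X, VT);
    return DAG.getNode(ISD::AND, DL, VT, NotX, Y);
  }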
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:194
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:169
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define INT64_MIN
Definition DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition APInt.cpp:3009
@ COND_NE
Not equal.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ X86_ThisCall
Similar to X86_StdCall.
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:525
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition ISDOpcodes.h:140
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:167
@ GlobalAddress
Definition ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition ISDOpcodes.h:892
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition ISDOpcodes.h:151
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:706
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:117
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:809
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:663
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ STRICT_FMAXIMUM
Definition ISDOpcodes.h:463
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition ISDOpcodes.h:130
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition ISDOpcodes.h:881
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:477
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:701
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition ISDOpcodes.h:690
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition ISDOpcodes.h:903
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:927
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:713
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
bool isExtVecInRegOpcode(unsigned Opcode)
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
LLVM_ABI bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is an EXTLOAD.
LLVM_ABI bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
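A hedged, concrete illustration of the two CondCode helpers above; the includes and the wrapper function are assumptions of the example, and the values in comments follow from the documented semantics.

  #include "llvm/CodeGen/ISDOpcodes.h"
  #include "llvm/CodeGen/ValueTypes.h"
  using namespace llvm;

  // Hedged sketch: invert and swap an integer signed-less-than comparison.
  static void condCodeExamples() {
    ISD::CondCode CC = ISD::SETLT;
    ISD::CondCode Inv = ISD::getSetCCInverse(CC, MVT::i32);  // !(X < Y) -> ISD::SETGE
    ISD::CondCode Swp = ISD::getSetCCSwappedOperands(CC);    // (Y op X) -> ISD::SETGT
    (void)Inv; (void)Swp;
  }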
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false, bool AllowTruncation=false)
Hook for matching ConstantSDNode predicate.
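A hedged one-liner showing how the hook above is typically used: check that every constant element of an operand Op (assumed to be an SDValue already in scope) is non-zero.

  // Hedged sketch; ISD::matchUnaryPredicate walks a constant splat or the
  // elements of a BUILD_VECTOR of constants and applies the predicate to each.
  bool AllNonZero = ISD::matchUnaryPredicate(
      Op, [](ConstantSDNode *C) { return !C->isZero(); });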
LLVM_ABI bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinOpPred_match< LHS, RHS, is_bitwiselogic_op > m_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
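The m_* entries above are the IR-level pattern-matching combinators from llvm/IR/PatternMatch.h. A minimal, hedged sketch uses a few of them to recognize the classic "clear lowest set bit" idiom X & (X - 1), which instcombine canonicalizes to an add of -1; the function name is an assumption of the example.

  #include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Value.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Hedged sketch: match X & (X + -1) with the operands in either order.
  static bool matchesClearLowestSetBit(Value *V) {
    Value *X;
    return match(V, m_c_And(m_Value(X), m_Add(m_Deferred(X), m_AllOnes())));
  }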
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Opcode_match m_Opc(unsigned Opcode)
BinaryOpc_match< LHS, RHS > m_Srl(const LHS &L, const RHS &R)
auto m_SpecificVT(EVT RefVT, const Pattern &P)
Match a specific ValueType.
TernaryOpc_match< LHS, RHS, IDX > m_InsertSubvector(const LHS &Base, const RHS &Sub, const IDX &Idx)
UnaryOpc_match< Opnd > m_Abs(const Opnd &Op)
Or< Preds... > m_AnyOf(const Preds &...preds)
And< Preds... > m_AllOf(const Preds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_SetCC(const T0_P &LHS, const T1_P &RHS, const T2_P &CC)
UnaryOpc_match< Opnd > m_AnyExt(const Opnd &Op)
auto m_Node(unsigned Opcode, const OpndPreds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_VSelect(const T0_P &Cond, const T1_P &T, const T2_P &F)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
CondCode_match m_SpecificCondCode(ISD::CondCode CC)
Match a conditional code SDNode with a specific ISD::CondCode.
auto m_SpecificVectorElementVT(EVT RefVT, const Pattern &P)
Match a vector ValueType.
CondCode_match m_CondCode()
Match any conditional code SDNode.
ConstantInt_match m_ConstInt()
Match any integer constants or splat of an integer constant.
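The matchers above are the SelectionDAG analogues of the IR pattern matchers, from llvm/CodeGen/SDPatternMatch.h. A hedged sketch, assuming an SDNode *N and a SelectionDAG *DAG are available at the call site; the helper name is illustrative.

  #include "llvm/CodeGen/SDPatternMatch.h"
  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;
  using namespace llvm::SDPatternMatch;

  // Hedged sketch: does N compute (srl (and ...), C) where C is an integer
  // constant or a splat of one?
  static bool isShiftRightOfAnd(SDNode *N, const SelectionDAG *DAG) {
    return sd_match(N, DAG, m_Srl(m_Opc(ISD::AND), m_ConstInt()));
  }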
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
Invariant opcodes: All instruction sets have these as their low opcodes.
@ X86
Windows x64, Windows Itanium (IA-64)
Definition MCAsmInfo.h:50
@ PTR32_UPTR
Definition X86.h:217
@ PTR64
Definition X86.h:218
@ PTR32_SPTR
Definition X86.h:216
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FMAX
Floating point max and min.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements an fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and a FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of an MMX vector and zeros out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of an MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
Define some predicates that are used for node matching.
@ AddrNumOperands
Definition X86BaseInfo.h:36
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g. turning COND_E to COND_NE.
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
int getRoundingModeX86(unsigned RM)
Convert LLVM rounding mode to X86 rounding mode.
int getCCMPCondFlagsFromCondCode(CondCode CC)
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can fit into the displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
initializer< Ty > init(const Ty &Val)
constexpr double e
Definition MathExtras.h:47
@ User
could "use" a pointer
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:262
@ Offset
Definition DWP.cpp:477
@ Length
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1731
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition Utils.cpp:1607
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:361
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
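A hedged sketch of how the declaration above behaves, based on its use in the X86 backend: Size is in bits, and High selects the high 8-bit register where one exists. The wrapper function and the exact header providing the declaration (an X86 MC target-desc header) are assumptions of the example.

  // Hedged sketch (values in comments are the expected results).
  static void subSuperRegExamples() {
    MCRegister R64 = getX86SubSuperRegister(X86::AX, 64);       // X86::RAX
    MCRegister R8  = getX86SubSuperRegister(X86::AX, 8);        // X86::AL
    MCRegister R8H = getX86SubSuperRegister(X86::AX, 8, true);  // X86::AH
    (void)R64; (void)R8; (void)R8H;
  }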
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
auto unique(Range &&R, Predicate P)
Definition STLExtras.h:2056
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition Utils.cpp:1589
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:314
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1757
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:348
LLVM_ABI bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask, bool SrcIsMem)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, Register Reg)
Replace the address used in the instruction with the direct memory reference.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:186
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
LLVM_ABI bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:222
bool isAlpha(char C)
Checks if character C is a valid letter as classified by the "C" locale.
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
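Several entries in this index are small bit-math helpers from llvm/ADT/bit.h and llvm/Support/MathExtras.h. A hedged sanity-check sketch on the concrete value 40 (binary 101000); the wrapper function is an assumption of the example.

  #include "llvm/ADT/bit.h"
  #include "llvm/Support/MathExtras.h"
  #include <cassert>

  static void bitMathSanityChecks() {
    assert(!llvm::isPowerOf2_32(40) && llvm::isPowerOf2_32(32));
    assert(llvm::Log2_32(40) == 5);        // floor(log2(40))
    assert(llvm::Log2_64_Ceil(40) == 6);   // ceil(log2(40))
    assert(llvm::countr_zero(40u) == 3);   // three trailing zero bits
    assert(llvm::popcount(40u) == 2);      // two bits set
    assert(llvm::bit_floor(40u) == 32u && llvm::bit_ceil(40u) == 64u);
  }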
LLVM_ABI void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI SDValue peekThroughTruncates(SDValue V)
Return the non-truncated source operand of V if it exists.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1719
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
@ Default
-O2, -Os
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
@ Success
The lock was released successfully.
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
Definition ModRef.h:34
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:1974
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
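A hedged example of the helper above: scaling a 2-element shuffle mask {1, 0} by 2 yields the equivalent 4-element mask over elements of half the width. The wrapper function and the include path are assumptions of the example.

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/VectorUtils.h"
  using namespace llvm;

  static void scaleMaskExample() {
    SmallVector<int, 8> Scaled;
    narrowShuffleMaskElts(/*Scale=*/2, /*Mask=*/{1, 0}, Scaled);
    // Scaled now holds {2, 3, 0, 1}.
  }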
To bit_cast(const From &from) noexcept
Definition bit.h:90
void replace(R &&Range, const T &OldValue, const T &NewValue)
Provide wrappers to std::replace which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1840
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
LLVM_ABI bool isNullConstantOrUndef(SDValue V)
Returns true if V is a constant integer zero or an UNDEF node.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1934
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from a machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
bool isPhysRegUsedAfter(Register Reg, MachineBasicBlock::iterator MBI)
Check if physical register Reg is used after MBI.
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1815
constexpr unsigned BitWidth
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition STLExtras.h:1941
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
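A small worked example of the unpack decoders above for 4 x 32-bit elements (i.e. unpcklps/unpckhps; the resulting values follow the documented interleaving and are this note's own illustration):
  SmallVector<int, 4> Lo, Hi;
  DecodeUNPCKLMask(/*NumElts=*/4, /*ScalarBits=*/32, Lo); // {0, 4, 1, 5}
  DecodeUNPCKHMask(/*NumElts=*/4, /*ScalarBits=*/32, Hi); // {2, 6, 3, 7}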
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, Register Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction – that is, a dereference of an address in a register, with no scale, index or displacement.
static uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
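The range-based STLExtras helpers referenced throughout this listing (lower_bound, count, count_if, find_if, is_contained) wrap the corresponding <algorithm> functions; a minimal sketch over a plain SmallVector (the data is arbitrary):
  SmallVector<int, 8> V = {1, 2, 2, 3, 5, 8};
  bool HasThree = llvm::is_contained(V, 3);                  // true
  auto Twos = llvm::count(V, 2);                             // 2
  auto Odd = llvm::count_if(V, [](int X) { return X & 1; }); // 3 (1, 3, 5)
  auto It = llvm::find_if(V, [](int X) { return X > 4; });   // points at 5
  auto LB = llvm::lower_bound(V, 4);                         // first element >= 4 (V is sorted)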
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
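Two worked calls (values follow directly from the power-of-two arithmetic):
  Align A = commonAlignment(Align(16), /*Offset=*/4);  // Align(4)
  Align B = commonAlignment(Align(16), /*Offset=*/32); // Align(16): 32 is already a multiple of 16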
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2088
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_CAST_MMX
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
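For example, a sketch for a 128-bit pshufd (4 x 32-bit elements; the immediate is the instruction's usual 2-bit-per-element selector):
  SmallVector<int, 4> Mask;
  DecodePSHUFMask(/*NumElts=*/4, /*ScalarBits=*/32, /*Imm=*/0x1B, Mask);
  // 0x1B = 0b00'01'10'11; reading the 2-bit fields from the low end gives
  // {3, 2, 1, 0}, i.e. a full element reversal.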
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition STLExtras.h:1584
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:299
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, Register Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset], i.e., one with no scale or index, but with a displacement.
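These X86InstrBuilder helpers are typically chained onto BuildMI; a minimal sketch (the opcode, register names, offset, and the surrounding MBB/MI/DL/TII variables are assumptions of this note):
  // Load 32 bits from [BaseReg + 8] into DestReg.
  addRegOffset(BuildMI(MBB, MI, DL, TII->get(X86::MOV32rm), DestReg),
               BaseReg, /*isKill=*/false, /*Offset=*/8);
  // Dereference BaseReg directly, i.e. load from [BaseReg].
  addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV32rm), DestReg2), BaseReg);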
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
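The *DUP decoders above produce fixed interleavings; for a 128-bit vector the masks work out to (worked values, not text from the headers):
  // DecodeMOVSLDUPMask(4, M) -> M = {0, 0, 2, 2}  (duplicate the even elements)
  // DecodeMOVSHDUPMask(4, M) -> M = {1, 1, 3, 3}  (duplicate the odd elements)
  // DecodeMOVDDUPMask(2, M)  -> M = {0, 0}        (duplicate the low 64-bit element)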
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
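These SelectionDAG constant predicates usually gate DAG combines; a hedged sketch (N is an arbitrary binary SDNode and the surrounding combine is hypothetical):
  SDValue RHS = N->getOperand(1);
  if (isNullConstantOrUndef(RHS) || isAllOnesConstant(RHS) || isOneConstant(RHS)) {
    // Cheap checks for the common immediates 0/undef, -1 and 1.
  }
  if (ConstantSDNode *C = isConstOrConstSplat(RHS, /*AllowUndefs=*/true)) {
    const APInt &Imm = C->getAPIntValue(); // uniform scalar or splatted constant
    (void)Imm;
  }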
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
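A quick sketch of the bit.h helpers that appear in this listing (the values are simple arithmetic facts):
  int Ones = llvm::popcount(0xF0u);               // 4 set bits
  unsigned Floor = llvm::bit_floor(12u);          // 8, the largest power of two <= 12
  uint32_t Raw = llvm::bit_cast<uint32_t>(1.0f);  // reinterpret the float's bit pattern (0x3F800000)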
@ SM_SentinelUndef
@ SM_SentinelZero
LLVM_ABI bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Attempt to narrow/widen the Mask shuffle mask to the NumDstElts target width.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
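Two worked calls with illustrative masks:
  int A = getSplatIndex({2, -1, 2, 2}); // 2: all defined lanes agree
  int B = getSplatIndex({0, 1, 0, 1});  // -1: not a splat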
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
#define EQ(a, b)
Definition regexec.c:65
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:266
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static constexpr roundingMode rmTowardZero
Definition APFloat.h:308
static LLVM_ABI const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition APFloat.cpp:289
static LLVM_ABI const fltSemantics & IEEEquad() LLVM_READNONE
Definition APFloat.cpp:268
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition APFloat.cpp:324
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
Definition APFloat.cpp:267
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
static LLVM_ABI const fltSemantics & BFloat() LLVM_READNONE
Definition APFloat.cpp:265
opStatus
IEEE-754R 7: Default exception handling.
Definition APFloat.h:320
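The fltSemantics accessors and rounding modes above are typically used together when re-rounding constants; a minimal sketch (the value 1.5 is arbitrary):
  APFloat Val(APFloat::IEEEdouble(), "1.5");
  bool LosesInfo = false;
  APFloat::opStatus St = Val.convert(APFloat::x87DoubleExtended(),
                                     APFloat::rmNearestTiesToEven, &LosesInfo);
  // St == APFloat::opOK and LosesInfo == false: 1.5 is exactly representable.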
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type with the same bitwidth.
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition ValueTypes.h:217
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition ValueTypes.h:212
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type that is chosen by the caller.
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
static LLVM_ABI KnownBits sadd_sat(const KnownBits &LHS, const KnownBits &RHS)
Compute knownbits resulting from llvm.sadd.sat(LHS, RHS)
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition KnownBits.h:186
static LLVM_ABI KnownBits mulhu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits from zero-extended multiply-hi.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:242
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned countMaxTrailingZeros() const
Returns the maximum number of trailing zero bits possible.
Definition KnownBits.h:274
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition KnownBits.h:289
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition KnownBits.h:86
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
bool isConstant() const
Returns true if we know the value of all bits.
Definition KnownBits.h:54
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition KnownBits.h:111
static LLVM_ABI KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition KnownBits.h:296
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition KnownBits.h:196
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:145
static LLVM_ABI KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition KnownBits.cpp:60
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition KnownBits.h:92
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
static LLVM_ABI std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
bool isAllOnes() const
Returns true if value is all one bits.
Definition KnownBits.h:83
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition KnownBits.h:60
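A short sketch of how the KnownBits helpers above compose (the 32-bit constants are arbitrary):
  KnownBits LHS = KnownBits::makeConstant(APInt(32, 0x0000FF00));
  KnownBits RHS(32);
  RHS.setAllZero();                             // all bits of RHS known to be 0
  KnownBits Sum = KnownBits::add(LHS, RHS);     // still the constant 0x0000FF00
  unsigned MinTZ = Sum.countMinTrailingZeros(); // 8
  bool NonNeg = Sum.isNonNegative();            // true: sign bit known clear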
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
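These factory methods are usually combined with getWithOffset when emitting loads and stores; a minimal sketch (MF is an assumed MachineFunction, the offsets are arbitrary):
  MachinePointerInfo CPInfo =
      MachinePointerInfo::getConstantPool(MF).getWithOffset(16);
  MachinePointerInfo StackInfo =
      MachinePointerInfo::getFixedStack(MF, /*FI=*/0, /*Offset=*/8);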
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
bool hasNoSignedZeros() const
void setNoSignedWrap(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
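The CallLoweringInfo setters above are designed for chaining; a hedged sketch of the usual libcall-lowering pattern (DAG, dl, Chain, Callee, RetTy and Args are assumed to exist in the surrounding code, and LowerCallTo is called as a TargetLowering member):
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  // CallResult.first is the returned value, CallResult.second the new chain.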
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI bool recursivelyDeleteUnusedNodes(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
This structure is used to pass arguments to the makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to its clients that want to combine.
X86AddressMode - This struct holds a generalized full x86 address mode.